Merge commit for internal changes
diff --git a/RELEASE.md b/RELEASE.md
index e04bd3f..97c1a8c 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -1,3 +1,67 @@
+# Release 1.5.0
+
+## Breaking Changes
+* Prebuilt binaries are now built against CUDA 9 and cuDNN 7.
+* Our Linux binaries are built using Ubuntu 16.04 containers, which may
+  introduce glibc incompatibility issues on Ubuntu 14.04.
+* Starting with the 1.6 release, our prebuilt binaries will use AVX
+  instructions, which may break TensorFlow on older CPUs.
+
+## Major Features And Improvements
+* [Eager execution](https://github.com/tensorflow/tensorflow/tree/r1.5/tensorflow/contrib/eager)
+  preview version is now available.
+* [TensorFlow Lite](https://github.com/tensorflow/tensorflow/tree/r1.5/tensorflow/contrib/lite)
+  dev preview is now available.
+* CUDA 9 and cuDNN 7 support.
+
+## Bug Fixes and Other Changes
+* `auto_correlation` added to `tf.contrib.distributions`.
+* Add `DenseFlipout` probabilistic layer.
+* Restandardize `DenseVariational` as a simpler template for other probabilistic layers.
+* Make the `tf.contrib.distributions` QuadratureCompound classes support batching.
+* `Stream::BlockHostUntilDone` now returns Status rather than bool.
+* Customize request timeouts for the GCS filesystem.
+
+## Thanks to our Contributors
+
+This release contains contributions from many people at Google, as well as:
+
+4d55397500, Abdullah Alrasheed, abenmao, Adam Salvail, Aditya Dhulipala, Ag Ramesh,
+Akimasa Kimura, Alan Du, Alan Yee, Alexander, Amit Kushwaha, Amy, Andrei Costinescu,
+Andrei Nigmatulin, Andrew Erlichson, Andrew Myers, Andrew Stepanov, Androbin, AngryPowman,
+Anish Shah, Anton Daitche, Artsiom Chapialiou, asdf2014, Aseem Raj Baranwal, Ash Hall,
+Bart Kiers, Batchu Venkat Vishal, ben, Ben Barsdell, Bill Piel, Carl Thomé, Catalin Voss,
+Changming Sun, Chengzhi Chen, Chi Zeng, Chris Antaki, Chris Donahue, Chris Oelmueller,
+Chris Tava, Clayne Robison, Codrut, Courtial Florian, Dalmo Cirne, Dan J, Darren Garvey,
+David Kristoffersson, David Norman, David Röthlisberger, DavidNorman, Dhruv, DimanNe,
+Dorokhov, Duncan Mac-Vicar P, EdwardDixon, EMCP, error.d, FAIJUL, Fan Xia,
+Francois Xavier, Fred Reiss, "Freedom" Koan-Sin Tan, Fritz Obermeyer, Gao, Xiang,
+Guenther Schmuelling, Guo Yejun (郭叶军), Hans Gaiser, HectorSVC, Hyungsuk Yoon,
+James Pruegsanusak, Jay Young, Jean Wanka, Jeff Carpenter, Jeremy Rutman, Jeroen BéDorf,
+Jett Jones, Jimmy Jia, jinghuangintel, jinze1994, JKurland, Joel Hestness, joetoth,
+John B Nelson, John Impallomeni, John Lawson, Jonas, Jonathan Dekhtiar, joshkyh, Jun Luan,
+Jun Mei, Kai Sasaki, Karl Lessard, karl@kubx.ca, Kb Sriram, Kenichi Ueno, Kevin Slagle,
+Kongsea, Lakshay Garg, lhlmgr, Lin Min, liu.guangcong, Loki Der Quaeler, Louie Helm,
+lucasmoura, Luke Iwanski, Lyndon White, Mahmoud Abuzaina, Marcel Puyat, Mark Aaron Shirley,
+Michele Colombo, MtDersvan, Namrata-Ibm, Nathan Luehr, Naurril, Nayana Thorat, Nicolas Lopez,
+Niranjan Hasabnis, Nolan Liu, Nouce, Oliver Hennigh, osdamv, Patrik Erdes,
+Patryk Chrabaszcz, Pavel Christof, Penghao Cen, postBG, Qingqing Cao, Qingying Chen, qjivy,
+Raphael, Rasmi, raymondxyang, Renze Yu, resec, Roffel, Ruben Vereecken, Ryohei Kuroki,
+sandipmgiri, Santiago Castro, Scott Kirkland, Sean Vig, Sebastian Raschka, Sebastian Weiss,
+Sergey Kolesnikov, Sergii Khomenko, Shahid, Shivam Kotwalia, Stuart Berg, Sumit Gouthaman,
+superzerg, Sven Mayer, tetris, Ti Zhou, Tiago Freitas Pereira, Tian Jin, Tomoaki Oiki,
+Vaibhav Sood, vfdev, Vivek Rane, Vladimir Moskva, wangqr, Weber Xie, Will Frey,
+Yan Facai (颜发才), yanivbl6, Yaroslav Bulatov, Yixing Lao, Yong Tang, youkaichao,
+Yuan (Terry) Tang, Yue Zhang, Yuxin Wu, Ziming Dong, ZxYuan, 黄璞
+
+We are also grateful to all who filed issues or helped resolve them, asked and
+answered questions, and were part of inspiring discussions.
+
+# Release 1.4.1
+
+## Bug Fixes and Other Changes
+* `LinearClassifier` fix for CloudML Engine.
+
 # Release 1.4.0
 
 ## Major Features And Improvements
diff --git a/configure.py b/configure.py
index 7537e30..cf16ef4 100644
--- a/configure.py
+++ b/configure.py
@@ -302,6 +302,12 @@
 
   Returns:
     boolean value of the variable.
+
+  Raises:
+    UserInputError: if the environment variable is set but cannot be
+      interpreted as a boolean indicator. In that case, assume the user has
+      made a scripting error and would continue to provide invalid input,
+      so raise the error rather than loop infinitely.
   """
   if not question:
     question = 'Do you wish to build TensorFlow with %s support?' % query_item
@@ -319,6 +325,23 @@
     question += ' [y/N]: '
 
   var = environ_cp.get(var_name)
+  if var is not None:
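+    # The variable was preset in the environment (e.g. TF_NEED_CUDA=1), so
+    # interpret it as a boolean instead of prompting the user.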
+    var_content = var.strip().lower()
+    true_strings = ('1', 't', 'true', 'y', 'yes')
+    false_strings = ('0', 'f', 'false', 'n', 'no')
+    if var_content in true_strings:
+      var = True
+    elif var_content in false_strings:
+      var = False
+    else:
+      raise UserInputError(
+          'Environment variable %s must be set as a boolean indicator.\n'
+          'The following are accepted as TRUE : %s.\n'
+          'The following are accepted as FALSE: %s.\n'
+          'Current value is %s.' % (
+              var_name, ', '.join(true_strings), ', '.join(false_strings),
+              var))
+
   while var is None:
     user_input_origin = get_input(question)
     user_input = user_input_origin.strip().lower()
@@ -605,8 +628,9 @@
 
   Raises:
     UserInputError: if a query has been attempted n_ask_attempts times without
-    success, assume that the user has made a scripting error, and will continue
-    to provide invalid input. Raise the error to avoid infinitely looping.
+      success. In that case, assume the user has made a scripting error and
+      would continue to provide invalid input, so raise the error rather
+      than loop infinitely.
   """
   default = environ_cp.get(var_name) or var_default
   full_query = '%s [Default is %s]: ' % (
@@ -1101,11 +1125,13 @@
 
 def set_trisycl_include_dir(environ_cp):
   """Set TRISYCL_INCLUDE_DIR."""
+
   ask_trisycl_include_dir = ('Please specify the location of the triSYCL '
                              'include directory. (Use --config=sycl_trisycl '
                              'when building with Bazel) '
                              '[Default is %s]: '
                             ) % (_DEFAULT_TRISYCL_INCLUDE_DIR)
+
   while True:
     trisycl_include_dir = get_from_env_or_user_or_default(
         environ_cp, 'TRISYCL_INCLUDE_DIR', ask_trisycl_include_dir,
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index 06c9c2b..808bd0c 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -386,6 +386,14 @@
     visibility = ["//tensorflow:__subpackages__"],
 )
 
+py_library(
+    name = "tensorflow_py",
+    srcs = ["__init__.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = ["//tensorflow/python"],
+)
+
 filegroup(
     name = "all_opensource_files",
     data = [
@@ -653,6 +661,9 @@
         "//tensorflow/tools/quantization:all_files",
         "//tensorflow/tools/test:all_files",
         "//tensorflow/user_ops:all_files",
+        "//third_party/eigen3:all_files",
+        "//third_party/fft2d:all_files",
+        "//third_party/flatbuffers:all_files",
         "//third_party/hadoop:all_files",
         "//third_party/sycl:all_files",
         "//third_party/sycl/sycl:all_files",
@@ -791,11 +802,3 @@
         "tf_exported_symbols.lds",
     ],
 )
-
-py_library(
-    name = "tensorflow_py",
-    srcs = ["__init__.py"],
-    srcs_version = "PY2AND3",
-    visibility = ["//visibility:public"],
-    deps = ["//tensorflow/python"],
-)
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index c8afb7d..7652b49 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -90,8 +90,6 @@
         ":shape_inference",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla:status",
-        "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index c22f83a..3fe7e51 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -117,7 +117,6 @@
 
 py_library(
     name = "dataset_serialization_test",
-    testonly = 1,
     srcs = [
         "dataset_serialization_test_base.py",
     ],
diff --git a/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression.py b/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression.py
index 7bc5007..f4b7d67 100644
--- a/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression.py
+++ b/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression.py
@@ -41,7 +41,7 @@
   For those familiar with TensorFlow graphs, notice the absence of
   `tf.Session`. The `forward()` method here immediately executes and
   returns output values. The `loss()` method immediately compares the
-  output of `forward()` with the target adn returns the MSE loss value.
+  output of `forward()` with the target and returns the MSE loss value.
   The `fit()` performs gradient-descent training on the model's weights
   and bias.
   """
diff --git a/tensorflow/contrib/eager/python/examples/mnist/mnist.py b/tensorflow/contrib/eager/python/examples/mnist/mnist.py
index bb121c7..82b3d39 100644
--- a/tensorflow/contrib/eager/python/examples/mnist/mnist.py
+++ b/tensorflow/contrib/eager/python/examples/mnist/mnist.py
@@ -40,7 +40,7 @@
   """MNIST Network.
 
   Network structure is equivalent to:
-  https://github.com/tensorflow/tensorflow/blob/r1.4/tensorflow/examples/tutorials/mnist/mnist_deep.py
+  https://github.com/tensorflow/tensorflow/blob/r1.5/tensorflow/examples/tutorials/mnist/mnist_deep.py
   and
   https://github.com/tensorflow/models/blob/master/tutorials/image/mnist/convolutional.py
 
diff --git a/tensorflow/contrib/libsvm/kernels/decode_libsvm_op.cc b/tensorflow/contrib/libsvm/kernels/decode_libsvm_op.cc
index 616240f..720c74e 100644
--- a/tensorflow/contrib/libsvm/kernels/decode_libsvm_op.cc
+++ b/tensorflow/contrib/libsvm/kernels/decode_libsvm_op.cc
@@ -46,34 +46,47 @@
     std::vector<T> out_values;
     std::vector<std::pair<int64, int64>> out_indices;
     for (int i = 0; i < input_flat.size(); ++i) {
-      std::vector<string> entries =
-          str_util::Split(input_flat(i), " ", str_util::SkipEmpty());
-      OP_REQUIRES(ctx, !entries.empty(),
-                  errors::InvalidArgument("No entries found for input[", i,
+      StringPiece line(input_flat(i));
+      str_util::RemoveWhitespaceContext(&line);
+
+      StringPiece piece;
+      OP_REQUIRES(ctx, str_util::ConsumeNonWhitespace(&line, &piece),
+                  errors::InvalidArgument("No label found for input[", i,
                                           "]: \"", input_flat(i), "\""));
+
       Tlabel label_value;
-      OP_REQUIRES(
-          ctx, strings::SafeStringToNumeric<Tlabel>(entries[0], &label_value),
-          errors::InvalidArgument("Label format incorrect: ", entries[0]));
+      OP_REQUIRES(ctx,
+                  strings::SafeStringToNumeric<Tlabel>(piece, &label_value),
+                  errors::InvalidArgument("Label format incorrect: ", piece));
+
       label(i) = label_value;
-      for (int j = 1; j < entries.size(); j++) {
-        std::vector<string> pair = str_util::Split(entries[j], ":");
-        OP_REQUIRES(
-            ctx, (pair.size() == 2),
-            errors::InvalidArgument("Invalid feature \"", entries[j], "\""));
+
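+      // Parse the remaining whitespace-separated "index:value" feature
+      // pairs (e.g. "3:0.5") of the libsvm line.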
+      str_util::RemoveLeadingWhitespace(&line);
+      while (str_util::ConsumeNonWhitespace(&line, &piece)) {
+        size_t p = piece.find(':');
+        OP_REQUIRES(ctx, (p != StringPiece::npos),
+                    errors::InvalidArgument("Invalid feature \"", piece, "\""));
+
         int64 feature_index;
         OP_REQUIRES(
-            ctx, strings::safe_strto64(pair[0].c_str(), &feature_index),
-            errors::InvalidArgument("Feature format incorrect: ", entries[j]));
+            ctx, strings::safe_strto64(piece.substr(0, p), &feature_index),
+            errors::InvalidArgument("Feature format incorrect: ", piece));
         OP_REQUIRES(ctx, (feature_index >= 0),
                     errors::InvalidArgument(
                         "Feature index should be >= 0, got ", feature_index));
+
         T feature_value;
         OP_REQUIRES(
-            ctx, strings::SafeStringToNumeric<T>(pair[1], &feature_value),
-            errors::InvalidArgument("Feature format incorrect: ", entries[j]));
+            ctx,
+            strings::SafeStringToNumeric<T>(piece.substr(p + 1),
+                                            &feature_value),
+            errors::InvalidArgument("Feature format incorrect: ", piece));
+
         out_values.emplace_back(feature_value);
         out_indices.emplace_back(std::pair<int64, int64>(i, feature_index));
+
+        str_util::RemoveLeadingWhitespace(&line);
       }
     }
 
diff --git a/tensorflow/contrib/makefile/README.md b/tensorflow/contrib/makefile/README.md
index 9345303..0613de2 100644
--- a/tensorflow/contrib/makefile/README.md
+++ b/tensorflow/contrib/makefile/README.md
@@ -262,6 +262,14 @@
 
 #### Optimization
 
+The `build_all_ios.sh` script accepts optional command-line arguments that
+selectively register only the operators used in your graph.
+
+```bash
+tensorflow/contrib/makefile/build_all_ios.sh -a arm64 -g $HOME/graphs/inception/tensorflow_inception_graph.pb
+```
+
+Please note this is an aggressive optimization of the operator set: the
+resulting library may not work with other graphs, but it will reduce the size
+of the final library.
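+
+Under the hood, the script generates
+`tensorflow/core/framework/ops_to_register.h` with the
+`print_selective_registration_header` tool. This is roughly equivalent to
+running the tool by hand (a sketch, assuming the tool has already been built
+with Bazel):
+
+```bash
+bazel build --copt="-DUSE_GEMM_FOR_CONV" tensorflow/python/tools:print_selective_registration_header
+bazel-bin/tensorflow/python/tools/print_selective_registration_header \
+  --graphs=$HOME/graphs/inception/tensorflow_inception_graph.pb \
+  > tensorflow/core/framework/ops_to_register.h
+```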
+
 The `compile_ios_tensorflow.sh` script can take optional command-line arguments.
 The first argument will be passed as a C++ optimization flag and defaults to
 debug mode. If you are concerned about performance or are working on a release
diff --git a/tensorflow/contrib/makefile/build_all_ios.sh b/tensorflow/contrib/makefile/build_all_ios.sh
index 988e12b..a18df25 100755
--- a/tensorflow/contrib/makefile/build_all_ios.sh
+++ b/tensorflow/contrib/makefile/build_all_ios.sh
@@ -26,13 +26,16 @@
 usage() {
   echo "Usage: $(basename "$0") [-a:T]"
   echo "-a [build_arch] build only for specified arch x86_64 [default=all]"
+  echo "-g [graph] optimize and selectively register ops only for this graph"
   echo "-T only build tensorflow (dont download other deps etc)"
   exit 1
 }
 
-while getopts "a:T" opt_name; do
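+# Default architectures to build for when -a is not specified.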
+DEFAULT_ARCH="i386 x86_64 armv7 armv7s arm64"
+while getopts "a:g:T" opt_name; do
   case "$opt_name" in
     a) BUILD_ARCH="${OPTARG}";;
+    g) OPTIMIZE_FOR_GRAPH="${OPTARG}";;
     T) ONLY_MAKE_TENSORFLOW="true";;
     *) usage;;
   esac
@@ -42,7 +45,8 @@
 
 # Make sure we're in the correct directory, at the root of the source tree.
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-cd ${SCRIPT_DIR}/../../../
+TOP_SRCDIR="${SCRIPT_DIR}/../../../"
+cd ${TOP_SRCDIR}
 
 source "${SCRIPT_DIR}/build_helper.subr"
 JOB_COUNT="${JOB_COUNT:-$(get_job_count)}"
@@ -56,6 +60,32 @@
     export MACOSX_DEPLOYMENT_TARGET=$(sw_vers -productVersion)
 fi
 
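+# Bazel output location of the tool that generates ops_to_register.h.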
+PRNT_SLCTV_BIN="${TOP_SRCDIR}bazel-bin/tensorflow/python/tools/print_selective_registration_header"
+
+if [[ ! -z "${OPTIMIZE_FOR_GRAPH}" ]]; then
+    echo "Request to optimize for graph: ${OPTIMIZE_FOR_GRAPH}"
+    #Request to trim the OPs by selectively registering
+    if [ ! -f ${PRNT_SLCTV_BIN} ]; then
+        #Build bazel build tensorflow/python/tools:print_selective_registration_header
+        echo "${PRNT_SLCTV_BIN} not found. Trying to build it"
+        cd ${TOP_SRCDIR}
+        bazel build --copt="-DUSE_GEMM_FOR_CONV" tensorflow/python/tools:print_selective_registration_header
+         if [ ! -f ${PRNT_SLCTV_BIN} ]; then
+            echo "Building print_selective_registration_header failed"
+            echo "You may want to build TensorFlow with: "
+            echo "./configure"
+            echo "bazel build --copt="-DUSE_GEMM_FOR_CONV" tensorflow/python/tools:print_selective_registration_header"
+            echo "and then run this script again"
+            exit 1
+        fi
+    else
+        echo "${PRNT_SLCTV_BIN} found. Using it"
+        ${PRNT_SLCTV_BIN} --graphs=${OPTIMIZE_FOR_GRAPH} > ${TOP_SRCDIR}/tensorflow/core/framework/ops_to_register.h
+
+    fi
+
+fi
+
 if [[ "${ONLY_MAKE_TENSORFLOW}" != "true" ]]; then
     # Remove any old files first.
     make -f tensorflow/contrib/makefile/Makefile clean
@@ -64,8 +94,13 @@
     # Pull down the required versions of the frameworks we need.
     tensorflow/contrib/makefile/download_dependencies.sh
 
-    # Compile protobuf for the target iOS device architectures.
-    tensorflow/contrib/makefile/compile_ios_protobuf.sh
+    if [[ -z "${BUILD_ARCH}" ]]; then
+        # Compile protobuf for the target iOS device architectures.
+        tensorflow/contrib/makefile/compile_ios_protobuf.sh -a ${DEFAULT_ARCH}
+    else
+        # Compile protobuf for the target iOS device architectures.
+        tensorflow/contrib/makefile/compile_ios_protobuf.sh -a ${BUILD_ARCH}
+    fi
 fi
 
 # Compile nsync for the target iOS device architectures.
@@ -80,13 +115,24 @@
 fi
 export HOST_NSYNC_LIB TARGET_NSYNC_LIB
 
-if [[ -z "${BUILD_ARCH}" ]]; then
-    # build the ios tensorflow libraries.
-    tensorflow/contrib/makefile/compile_ios_tensorflow.sh -f "-O3" -h $HOST_NSYNC_LIB -n $TARGET_NSYNC_LIB
-else
+TF_CC_FLAGS="-O3"
+TF_SCRIPT_FLAGS="-h ${HOST_NSYNC_LIB} -n ${TARGET_NSYNC_LIB}"
+
+if [[ ! -z "${OPTIMIZE_FOR_GRAPH}" ]]; then
-    # arch specified so build just that
+    # Selective registration requested, so enable the required defines.
-    tensorflow/contrib/makefile/compile_ios_tensorflow.sh -f "-O3" -a "${BUILD_ARCH}" -h $HOST_NSYNC_LIB -n $TARGET_NSYNC_LIB
+    TF_CC_FLAGS="${TF_CC_FLAGS} -DANDROID_TYPES=__ANDROID_TYPES_FULL__ -DSELECTIVE_REGISTRATION -DSUPPORT_SELECTIVE_REGISTRATION"
+    # The Makefile checks the env var to decide which ANDROID_TYPES to build
+    export ANDROID_TYPES="-D__ANDROID_TYPES_FULL__"
 fi
 
+if [[ ! -z "${BUILD_ARCH}" ]]; then
+    # arch specified so build just that
+    TF_SCRIPT_FLAGS="${TF_SCRIPT_FLAGS} -a ${BUILD_ARCH}"
+fi
+
+# build the ios tensorflow libraries.
+echo "Building TensorFlow with flags: ${TF_SCRIPT_FLAGS} -f ${TF_CC_FLAGS}"
+tensorflow/contrib/makefile/compile_ios_tensorflow.sh ${TF_SCRIPT_FLAGS} -f "${TF_CC_FLAGS}"
+
 # Creates a static universal library in
 # tensorflow/contrib/makefile/gen/lib/libtensorflow-core.a
diff --git a/tensorflow/contrib/mpi_collectives/mpi_ops.cc b/tensorflow/contrib/mpi_collectives/mpi_ops.cc
deleted file mode 100644
index a051ab0..0000000
--- a/tensorflow/contrib/mpi_collectives/mpi_ops.cc
+++ /dev/null
@@ -1,1236 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifdef TENSORFLOW_USE_MPI
-
-#include <queue>
-#include <thread>
-#include <unordered_map>
-
-#include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/shape_inference.h"
-#include "tensorflow/core/framework/types.pb.h"
-#include "tensorflow/core/platform/mutex.h"
-
-#define EIGEN_USE_THREADS
-
-#if GOOGLE_CUDA
-#include <cuda_runtime.h>
-#include "tensorflow/stream_executor/stream.h"
-#endif
-
-#include "tensorflow/stream_executor/lib/statusor.h"
-
-#define OMPI_SKIP_MPICXX
-#include "third_party/mpi/mpi.h"
-#include "tensorflow/contrib/mpi_collectives/mpi_message.pb.h"
-#include "tensorflow/contrib/mpi_collectives/ring.h"
-
-/*
- * MPI Allreduce and Allgather Ops for TensorFlow.
- *
- * TensorFlow natively provides inter-device communication through send and
- * receive ops and inter-node communication through Distributed TensorFlow,
- * based on the same send and receive abstractions. These end up being
- * insufficient for synchronous data-parallel training on HPC clusters where
- * Infiniband or other high-speed interconnects are available.  This module
- * implements MPI ops for allgather and allreduce, which do bandwidth-optimal
- * gathers and reductions and can take advantage of hardware-optimized
- * communication libraries through the MPI implementation.
- *
- * The primary logic of the allreduce and allgather are in RingAllgather() and
- * RingAllreduce(). The background thread which facilitates MPI operations is
- * run in BackgroundThreadLoop(). The provided MPI ops are:
- *      – MPIInit:
- *          Initialize MPI on a given device (CPU or GPU).
- *          Should only be run on a single device in every process.
- *      – MPISize:
- *          Get the number of MPI processes in the global communicator.
- *      – MPIRank:
- *          Get the rank of the current MPI process in the global communicator.
- *      – MPILocalRank:
- *          Get the local rank of the current MPI process within its node.
- *      – MPIAllreduce:
- *          Perform an allreduce on a Tensor, returning the sum
- *          across all MPI processes in the global communicator.
- *      – MPIAllgather:
- *          Perform an allgather on a Tensor, returning the concatenation of
- *          the tensor on the first dimension across all MPI processes in the
- *          global communicator.
- *
- */
-
-template <class T>
-using StatusOr = perftools::gputools::port::StatusOr<T>;
-
-using CPUDevice = Eigen::ThreadPoolDevice;
-using GPUDevice = Eigen::GpuDevice;
-
-namespace tensorflow {
-namespace contrib {
-namespace mpi {
-
-// Make sure template specializations are generated in the ring.cu.cc and the
-// ring.cc file, not in this file.
-extern template Status RingAllreduce<GPUDevice, int>(OpKernelContext*,
-                                                     const Tensor*, Tensor*,
-                                                     Tensor*);
-extern template Status RingAllreduce<GPUDevice, long long>(OpKernelContext*,
-                                                           const Tensor*,
-                                                           Tensor*, Tensor*);
-extern template Status RingAllreduce<GPUDevice, float>(OpKernelContext*,
-                                                       const Tensor*, Tensor*,
-                                                       Tensor*);
-extern template Status RingAllgather<GPUDevice, int>(OpKernelContext*,
-                                                     const Tensor*,
-                                                     const std::vector<size_t>&,
-                                                     Tensor*);
-extern template Status RingAllgather<GPUDevice, long long>(
-    OpKernelContext*, const Tensor*, const std::vector<size_t>&, Tensor*);
-extern template Status RingAllgather<GPUDevice, float>(
-    OpKernelContext*, const Tensor*, const std::vector<size_t>&, Tensor*);
-extern template Status RingAllreduce<CPUDevice, int>(OpKernelContext*,
-                                                     const Tensor*, Tensor*,
-                                                     Tensor*);
-extern template Status RingAllreduce<CPUDevice, long long>(OpKernelContext*,
-                                                           const Tensor*,
-                                                           Tensor*, Tensor*);
-extern template Status RingAllreduce<CPUDevice, float>(OpKernelContext*,
-                                                       const Tensor*, Tensor*,
-                                                       Tensor*);
-extern template Status RingAllgather<CPUDevice, int>(OpKernelContext*,
-                                                     const Tensor*,
-                                                     const std::vector<size_t>&,
-                                                     Tensor*);
-extern template Status RingAllgather<CPUDevice, long long>(
-    OpKernelContext*, const Tensor*, const std::vector<size_t>&, Tensor*);
-extern template Status RingAllgather<CPUDevice, float>(
-    OpKernelContext*, const Tensor*, const std::vector<size_t>&, Tensor*);
-
-namespace {
-
-// Return true if the templated type is GPUDevice, otherwise false.
-template <typename T>
-bool IsGPUDevice();
-template <>
-bool IsGPUDevice<GPUDevice>() {
-  return true;
-};
-template <>
-bool IsGPUDevice<CPUDevice>() {
-  return false;
-};
-
-// A callback to call after the MPI communication completes. Since the
-// allreduce and allgather ops are asynchronous, this callback is what resumes
-// computation after the reduction is completed.
-typedef std::function<void(StatusOr<Tensor>)> CommunicationDoneCallback;
-
-struct CollectiveOpRecord {
-  // The rank performing this piece of the op
-  int rank;
-
-  // The name of the op/tensor to be reduced
-  std::string name;
-
-  // The op's kernel context
-  OpKernelContext* context;
-
-  // Data type of the op
-  DataType dtype;
-
-  // The input tensor
-  const Tensor* in_t;
-
-  // Allgather: Vector of per-rank first-dimension sizes
-  std::vector<size_t> sizes_vec;
-
-  // The temp tensor for intermediate results
-  Tensor temp_t;
-
-  // The output tensor
-  Tensor* out_t;
-
-  // Whether to run this op on the gpu
-  bool on_gpu;
-
-  // The callback to call after the op has completed
-  CommunicationDoneCallback callback;
-};
-
-// Table storing Tensors to be reduced, keyed by unique name.
-// This table contains everything necessary to do the reduction
-typedef std::unordered_map<std::string, CollectiveOpRecord> TensorTable;
-
-// Table for storing Tensor metadata on rank zero. This is used for error
-// checking and size calculations, as well as determining when a reduction is
-// ready to be done (when all nodes are ready to do it).
-typedef std::unordered_map<std::string, std::vector<MPIRequest> > MessageTable;
-
-// The global state required for the MPI ops.
-//
-// MPI is a library that stores a lot of global per-program state and often
-// requires running on a single thread. As a result, we have to have a single
-// background thread responsible for all MPI operations, and communicate with
-// that background thread through global state.
-struct MPIGlobalState {
-  // An atomic boolean which is set to true when MPI is initialized.
-  // This ensures that MPI_Init is never called twice.
-  std::atomic_flag initialized_flag = ATOMIC_FLAG_INIT;
-
-  // Condition variable to wait for initialization
-  condition_variable cv;
-
-  // Whether MPI_Init has been completed on the background thread.
-  bool initialization_done = false;
-
-  // Whether MPI_Init succeeded on the background thread.
-  Status init_status;
-
-  // A mutex that needs to be used whenever MPI operations touch
-  // shared structures.
-  mutex mu;
-
-  // Tensors waiting to be allreduced or allgathered.
-  TensorTable tensor_table;
-
-  // Queue of MPI requests waiting to be sent to the coordinator node.
-  std::queue<MPIRequest> message_queue;
-
-  // Background thread running MPI communication.
-  std::thread background_thread;
-
-  // Whether the background thread should shutdown.
-  bool shut_down = false;
-
-  // Only exists on the coordinator node (rank zero). Maintains a count of
-  // how many nodes are ready to allreduce every tensor (keyed by tensor
-  // name).
-  std::unique_ptr<MessageTable> message_table;
-
-  // The MPI rank, local rank, and size.
-  int rank = 0;
-  int local_rank = 0;
-  int size = 1;
-
-  // The device that MPI was initialized on. (-1 for no GPU)
-  int device = -1;
-
-  // The CUDA stream used for data transfers and within-allreduce operations.
-  // A naive implementation would use the TensorFlow StreamExecutor CUDA
-  // stream. However, the allreduce and allgather require doing memory copies
-  // and kernel executions (for accumulation of values on the GPU). However,
-  // the subsequent operations must wait for those operations to complete,
-  // otherwise MPI (which uses its own stream internally) will begin the data
-  // transfers before the CUDA calls are complete. In order to wait for those
-  // CUDA operations, if we were using the TensorFlow stream, we would have
-  // to synchronize that stream; however, other TensorFlow threads may be
-  // submitting more work to that stream, so synchronizing on it can cause
-  // the allreduce to be delayed, waiting for compute totally unrelated to it
-  // in other parts of the graph. Overlaying memory transfers and compute
-  // during backpropagation is crucial for good performance, so we cannot use
-  // the TensorFlow stream, and must use our own stream.
-#if GOOGLE_CUDA
-  cudaStream_t stream;
-  std::atomic_flag stream_created_flag = ATOMIC_FLAG_INIT;
-#endif
-
-  ~MPIGlobalState() {
-    // Make sure that the destructor of the background thread is safe to
-    // call. If a thread is still joinable (not detached or complete) its
-    // destructor cannot be called.
-    if (background_thread.joinable()) {
-      shut_down = true;
-      background_thread.join();
-    }
-  }
-};
-
-// All the MPI state that must be stored globally per-process.
-static MPIGlobalState mpi_global;
-
-// For clarify in argument lists.
-#define RANK_ZERO 0
-
-// A tag used for all coordinator messaging.
-#define TAG_NOTIFY 1
-
-// Store the MPIRequest for a name, and return whether the total count of
-// MPIRequests for that tensor is now equal to the MPI size (and thus we are
-// ready to reduce the tensor).
-bool IncrementTensorCount(std::unique_ptr<MessageTable>& message_table,
-                          MPIRequest msg, int mpi_size) {
-  auto name = msg.tensor_name();
-  auto table_iter = message_table->find(name);
-  if (table_iter == message_table->end()) {
-    message_table->emplace(name, std::vector<MPIRequest>({msg}));
-    table_iter = message_table->find(name);
-  } else {
-    table_iter->second.push_back(msg);
-  }
-
-  int count = table_iter->second.size();
-  return count == mpi_size;
-}
-
-// Once a tensor is ready to be reduced, the coordinator sends an MPIResponse
-// instructing all ranks to start the reduction to all ranks. The MPIResponse
-// also contains error messages in case the submitted MPIRequests were not
-// valid (for example, contained mismatched shapes or types).
-//
-// Constructing the MPIResponse, thus, requires a whole lot of error checking.
-MPIResponse ConstructMPIResponse(std::unique_ptr<MessageTable>& message_table,
-                                 std::string name) {
-  bool error = false;
-  auto it = message_table->find(name);
-  assert(it != message_table->end());
-
-  std::vector<MPIRequest> requests = it->second;
-  assert(requests.size() > 0);
-
-  std::ostringstream error_message_stream;
-
-  // Check that all data types being reduced or gathered are identical
-  auto data_type = requests[0].tensor_type();
-  for (unsigned int i = 1; i < requests.size(); i++) {
-    auto request_type = requests[i].tensor_type();
-    if (data_type != request_type) {
-      error = true;
-      error_message_stream << "Mismatched data types: One rank had type "
-                           << DataType_Name(data_type)
-                           << ", but another rank had type "
-                           << DataType_Name(request_type) << ".";
-      break;
-    }
-  }
-
-  // Check that all requested operations are the same
-  auto message_type = requests[0].request_type();
-  for (unsigned int i = 1; i < requests.size(); i++) {
-    if (error) {
-      break;
-    }
-
-    auto request_type = requests[i].request_type();
-    if (message_type != request_type) {
-      error = true;
-      error_message_stream << "Mismatched MPI operations: One rank did an "
-                           << message_type << ", but another rank did an "
-                           << request_type << ".";
-      break;
-    }
-  }
-
-  // If we are doing an allreduce, check that all tensor shapes
-  // are identical
-  if (message_type == MPIRequest::ALLREDUCE) {
-    TensorShape tensor_shape = requests[0].tensor_shape();
-    for (unsigned int i = 1; i < requests.size(); i++) {
-      if (error) {
-        break;
-      }
-
-      TensorShape request_shape = requests[i].tensor_shape();
-      if (tensor_shape != request_shape) {
-        error = true;
-        error_message_stream << "Mismatched allreduce tensor shapes: "
-                             << "One rank reduced a tensor of shape "
-                             << tensor_shape.DebugString()
-                             << ", but another rank sent a tensor of shape "
-                             << request_shape.DebugString() << ".";
-        break;
-      }
-    }
-  }
-
-  // If we are doing an allgather, make sure all but the first dimension are
-  // the same. The first dimension may be different and the output tensor is
-  // the sum of the first dimension. Collect the sizes by rank.
-  if (message_type == MPIRequest::ALLGATHER) {
-    TensorShape tensor_shape = requests[0].tensor_shape();
-
-    if (tensor_shape.dims() == 0) {
-      error = true;
-      error_message_stream << "Rank zero tried to gather a rank-zero tensor.";
-    }
-
-    for (unsigned int i = 1; i < requests.size(); i++) {
-      if (error) {
-        break;
-      }
-
-      TensorShape request_shape = requests[i].tensor_shape();
-      if (tensor_shape.dims() != request_shape.dims()) {
-        error = true;
-        error_message_stream << "Mismatched allgather tensor shapes: "
-                             << "One rank gathered a tensor of rank "
-                             << tensor_shape.dims()
-                             << ", but another rank sent a tensor of rank "
-                             << request_shape.dims() << ".";
-        break;
-      }
-
-      for (unsigned int dim = 1; dim < tensor_shape.dims(); dim++) {
-        if (tensor_shape.dim_size(dim) != request_shape.dim_size(dim)) {
-          error = true;
-          error_message_stream
-              << "Mismatched allgather tensor shapes: "
-              << "One rank gathered a tensor with dimension " << dim
-              << " equal to " << tensor_shape.dim_size(dim)
-              << ", but another rank sent a tensor with dimension " << dim
-              << " equal to " << request_shape.dim_size(dim) << ".";
-          break;
-        }
-      }
-    }
-  }
-
-  MPIResponse response;
-  response.set_tensor_name(name);
-  if (error) {
-    std::string error_message = error_message_stream.str();
-    response.set_response_type(MPIResponse::ERROR);
-    response.set_error_message(error_message);
-  } else {
-    auto response_type = MPIResponse::ERROR;
-    if (message_type == MPIRequest::ALLREDUCE) {
-      response_type = MPIResponse::ALLREDUCE;
-    } else {
-      response_type = MPIResponse::ALLGATHER;
-    }
-    response.set_response_type(response_type);
-  }
-
-  // Clear all queued up requests for this name. They are now taken care of
-  // by the constructed MPI response.
-  message_table->erase(it);
-
-  return response;
-}
-
-// Process an MPIResponse by doing a reduction, a gather, or raising an error.
-void PerformCollectiveOp(TensorTable& tensor_table, MPIResponse response) {
-  OpKernelContext* context;
-  const Tensor* input_tensor;
-  std::vector<size_t> sizes_vec;
-  Tensor temp_tensor;
-  Tensor* output_tensor;
-  CommunicationDoneCallback callback;
-  bool on_gpu;
-  {
-    // Lock on the tensor table.
-    mutex_lock guard(mpi_global.mu);
-
-    // We should never fail at finding this key in the tensor table.
-    auto name = response.tensor_name();
-    auto iter = tensor_table.find(name);
-    assert(iter != tensor_table.end());
-
-    assert(response.response_type() == MPIResponse::ALLREDUCE ||
-           response.response_type() == MPIResponse::ALLGATHER ||
-           response.response_type() == MPIResponse::ERROR);
-
-    CollectiveOpRecord record = iter->second;
-    context = record.context;
-    input_tensor = record.in_t;
-    sizes_vec = record.sizes_vec;
-    temp_tensor = record.temp_t;
-    output_tensor = record.out_t;
-    on_gpu = record.on_gpu;
-    callback = record.callback;
-
-    // Clear the tensor table of this tensor and its callbacks; the rest of
-    // this function takes care of it.
-    tensor_table.erase(iter);
-  }
-
-  // Use CPUDevice instead of GPUDevice if no CUDA, to ensure we don't
-  // link to non-existent symbols.
-#if GOOGLE_CUDA
-#define GPU_DEVICE_IF_CUDA GPUDevice
-#else
-#define GPU_DEVICE_IF_CUDA CPUDevice
-#endif
-
-  Status status;
-  auto dtype = input_tensor->dtype();
-  if (response.response_type() == MPIResponse::ALLGATHER) {
-    if (dtype == DT_FLOAT) {
-      status = on_gpu ? RingAllgather<GPU_DEVICE_IF_CUDA, float>(
-                            context, input_tensor, sizes_vec, output_tensor)
-                      : RingAllgather<CPUDevice, float>(
-                            context, input_tensor, sizes_vec, output_tensor);
-    } else if (dtype == DT_INT32) {
-      status = on_gpu ? RingAllgather<GPU_DEVICE_IF_CUDA, int>(
-                            context, input_tensor, sizes_vec, output_tensor)
-                      : RingAllgather<CPUDevice, int>(context, input_tensor,
-                                                      sizes_vec, output_tensor);
-    } else if (dtype == DT_INT64) {
-      status = on_gpu ? RingAllgather<GPU_DEVICE_IF_CUDA, long long>(
-                            context, input_tensor, sizes_vec, output_tensor)
-                      : RingAllgather<CPUDevice, long long>(
-                            context, input_tensor, sizes_vec, output_tensor);
-    } else {
-      status = errors::Unknown("Invalid tensor type for MPI allgather.");
-    }
-  } else if (response.response_type() == MPIResponse::ALLREDUCE) {
-    if (dtype == DT_FLOAT) {
-      status = on_gpu ? RingAllreduce<GPU_DEVICE_IF_CUDA, float>(
-                            context, input_tensor, &temp_tensor, output_tensor)
-                      : RingAllreduce<CPUDevice, float>(
-                            context, input_tensor, &temp_tensor, output_tensor);
-    } else if (dtype == DT_INT32) {
-      status = on_gpu ? RingAllreduce<GPU_DEVICE_IF_CUDA, int>(
-                            context, input_tensor, &temp_tensor, output_tensor)
-                      : RingAllreduce<CPUDevice, int>(
-                            context, input_tensor, &temp_tensor, output_tensor);
-    } else if (dtype == DT_INT64) {
-      status = on_gpu ? RingAllreduce<GPU_DEVICE_IF_CUDA, long long>(
-                            context, input_tensor, &temp_tensor, output_tensor)
-                      : RingAllreduce<CPUDevice, long long>(
-                            context, input_tensor, &temp_tensor, output_tensor);
-    } else {
-      status = errors::Unknown("Invalid tensor type for MPI allreduce.");
-    }
-  } else if (response.response_type() == MPIResponse::ERROR) {
-    status = errors::FailedPrecondition(response.error_message());
-  }
-
-  if (status.ok()) {
-    callback(StatusOr<Tensor>(*output_tensor));
-  } else {
-    callback(StatusOr<Tensor>(status));
-  }
-}
-
-// The MPI background thread loop coordinates all the MPI processes and the
-// tensor reductions. The design of the communicator mechanism is limited by a
-// few considerations:
-//
-//      1. Some MPI implementations require all MPI calls to happen from a
-//      single thread. Since TensorFlow may use several threads for graph
-//      processing, this means we must have our own dedicated thread for
-//      dealing with MPI.
-//      2. We want to gracefully handle errors, when MPI processes do not
-//      properly agree upon what should happen (such as mismatched types or
-//      shapes). To do so requires the MPI processes to know about the shapes
-//      and types of the relevant tensors on the other processes.
-//      3. The MPI reductions and gathers should be able to happen in parallel
-//      with other ongoing operations. Since MPI uses an internal
-//      (inaccessible) GPU stream separate from the TF GPUDevice streams, we
-//      cannot explicitly synchronize memcpys or kernels with it. As a result,
-//      MPIAllreduce and MPIAllgather must be AsyncOpKernels to ensure proper
-//      ordering of memcpys and kernels with respect to TF streams.
-//      4. NOTE: We cannot guarantee that all the MPI processes reduce their
-//      tensors in the same order. Thus, there must be a way to ensure the
-//      reduction memcpys and kernels occur for correct tensors across all
-//      ranks at the same time. We choose to use a coordinator (rank ID 0) to
-//      gather and trigger the reduction operations that are ready to execute.
-//
-// The coordinator currently follows a master-worker paradigm. Rank zero acts
-// as the master (the "coordinator"), whereas all other ranks are simply
-// workers. Each rank runs its own background thread which progresses in ticks.
-// In each tick, the following actions happen:
-//
-//      a) The workers send any available MPIRequests to the coordinator. These
-//      MPIRequests indicate what the worker would like to do (i.e. which
-//      tensor they would like to gather or reduce, as well as their shape and
-//      type). They repeat this for every tensor that they would like to
-//      operate on after that tensor's collective op has executed ComputeAsync.
-//
-//      b) The workers send an empty "DONE" message to the coordinator to
-//      indicate that there are no more tensors they wish to operate on.
-//
-//      c) The coordinator receives the MPIRequests from the workers, as well
-//      as from its own TensorFlow ops, and stores them in a request table. The
-//      coordinator continues to receive MPIRequest messages until it has
-//      received MPI_SIZE number of empty "DONE" messages.
-//
-//      d) The coordinator finds all tensors that are ready to be reduced,
-//      gathered, or all operations that result in an error. For each of those,
-//      it sends an MPIResponse to all the workers. When no more MPIResponses
-//      are available, it sends a "DONE" response to the workers. If the
-//      process is being shutdown, it instead sends a "SHUTDOWN" response.
-//
-//      e) The workers listen for MPIResponse messages, processing each one by
-//      doing the required reduce or gather, until they receive a "DONE"
-//      response from the coordinator. At that point, the tick ends.
-//      If instead of "DONE" they receive "SHUTDOWN", they exit their
-//      background loop.
-// TODO: Use the global mpi_global state variable instead of a local one
-void BackgroundThreadLoop() {
-#if GOOGLE_CUDA
-  // Set the device, so that this thread uses the same GPU context as the
-  // calling thread.
-  // TODO: Ensure that this is operating correctly. The background thread
-  // needs to be able to control all GPUs that the rank has access to, and
-  // might be more than 1 GPU. Tensors could be resident in any of the
-  // GPUs, so the background thread's accumulate and copy kernels might need
-  // to correctly set the device and it might be necessary for the background
-  // thread to manage multiple streams.
-  cudaSetDevice(mpi_global.device);
-  cudaStreamCreate(&mpi_global.stream);
-#endif
-
-  // Initialize MPI. This must happen on the background thread, since not all
-  // MPI implementations support being called from multiple threads.
-  auto init_result = MPI_Init(NULL, NULL);
-  if (init_result != MPI_SUCCESS) {
-    mpi_global.init_status =
-        errors::Unknown("Could not initialize MPI; MPI_Init() failed.");
-    mpi_global.initialization_done = true;
-    mpi_global.cv.notify_all();
-    return;
-  } else {
-    mpi_global.init_status = Status::OK();
-  }
-
-  // Get MPI rank to determine if we are rank zero.
-  int rank;
-  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-  bool is_coordinator = rank == 0;
-
-  // Get MPI size to determine how many tensors to wait for before reducing.
-  int size;
-  MPI_Comm_size(MPI_COMM_WORLD, &size);
-
-  // Determine local rank by querying the local communicator.
-  MPI_Comm local_comm;
-  MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,
-                      &local_comm);
-  int local_rank;
-  MPI_Comm_rank(local_comm, &local_rank);
-
-  mpi_global.rank = rank;
-  mpi_global.local_rank = local_rank;
-  mpi_global.size = size;
-  mpi_global.initialization_done = true;
-
-  // Notify calling thread that initialization is complete
-  mpi_global.cv.notify_all();
-
-  // TODO: MOVE MESSAGE TABLE INITIALIZATION TO LIBRARY LOAD!
-  // Initialize the tensor count table. No tensors are available yet.
-  if (is_coordinator) {
-    mpi_global.message_table =
-        std::unique_ptr<MessageTable>(new MessageTable());
-  }
-
-  // The coordinator sends a SHUTDOWN message to trigger shutdown.
-  bool should_shut_down = false;
-  do {
-    // TODO: Eliminate the need for thread sleep by making all activity
-    // depend on other activity (e.g. condition or MPI waits).
-    std::this_thread::sleep_for(std::chrono::milliseconds(1));
-
-    // Copy the data structures from global state under this lock.
-    // However, don't keep the lock for the rest of the loop, so that
-    // enqueued stream callbacks can continue.
-    std::queue<MPIRequest> message_queue;
-    {
-      mutex_lock guard(mpi_global.mu);
-      while (!mpi_global.message_queue.empty()) {
-        MPIRequest message = mpi_global.message_queue.front();
-        mpi_global.message_queue.pop();
-        message_queue.push(message);
-      }
-    }
-
-    // Collect all tensors that are ready to be reduced. Record them in the
-    // tensor count table (rank zero) or send them to rank zero to be
-    // recorded (everyone else).
-    std::vector<std::string> ready_to_reduce;
-    while (!message_queue.empty()) {
-      // Pop the first available message message
-      MPIRequest message = message_queue.front();
-      message_queue.pop();
-
-      if (is_coordinator) {
-        bool reduce =
-            IncrementTensorCount(mpi_global.message_table, message, size);
-        if (reduce) {
-          ready_to_reduce.push_back(message.tensor_name());
-        }
-      } else {
-        std::string encoded_message;
-        message.SerializeToString(&encoded_message);
-        MPI_Send(encoded_message.c_str(), encoded_message.length() + 1,
-                 MPI_BYTE, RANK_ZERO, TAG_NOTIFY, MPI_COMM_WORLD);
-      }
-    }
-
-    // Rank zero has put all its own tensors in the tensor count table.
-    // Now, it should count all the tensors that are coming from other
-    // ranks at this tick. It should keep getting tensors until it gets a
-    // DONE message from all the other ranks.
-    if (is_coordinator) {
-      // Count of DONE messages. Keep receiving messages until the number
-      // of messages is equal to the number of processes. Initialize to
-      // one since the coordinator is effectively done.
-      int completed_ranks = 1;
-      while (completed_ranks != size) {
-        MPI_Status status;
-        MPI_Probe(MPI_ANY_SOURCE, TAG_NOTIFY, MPI_COMM_WORLD, &status);
-
-        // Find number of characters in message (including zero byte).
-        int source_rank = status.MPI_SOURCE;
-        int msg_length;
-        MPI_Get_count(&status, MPI_BYTE, &msg_length);
-
-        // If the length is zero, this is a DONE message.
-        if (msg_length == 0) {
-          completed_ranks++;
-          MPI_Recv(NULL, 0, MPI_BYTE, source_rank, TAG_NOTIFY, MPI_COMM_WORLD,
-                   &status);
-          continue;
-        }
-
-        // Get tensor name from MPI into an std::string.
-        char* buffer = new char[msg_length];
-        MPI_Recv(buffer, msg_length, MPI_BYTE, source_rank, TAG_NOTIFY,
-                 MPI_COMM_WORLD, &status);
-        std::string received_data(buffer);
-        delete[] buffer;
-
-        MPIRequest received_message;
-        received_message.ParseFromString(received_data);
-        auto received_name = received_message.tensor_name();
-
-        bool reduce = IncrementTensorCount(mpi_global.message_table,
-                                           received_message, size);
-        if (reduce) {
-          ready_to_reduce.push_back(received_name);
-        }
-      }
-
-      // At this point, rank zero should have a fully updated tensor
-      // count table and should know all the tensors that need to be
-      // reduced or gathered, and everyone else should have sent all
-      // their information to rank zero. We can now do reductions and
-      // gathers; rank zero will choose which ones and in what order,
-      // and will notify the other ranks before doing each reduction.
-      for (int i = 0; i < ready_to_reduce.size(); i++) {
-        // Notify all nodes which tensor we'd like to reduce now
-        auto name = ready_to_reduce[i];
-        MPIResponse response =
-            ConstructMPIResponse(mpi_global.message_table, name);
-
-        std::string encoded_response;
-        response.SerializeToString(&encoded_response);
-        for (int r = 1; r < size; r++) {
-          MPI_Send(encoded_response.c_str(), encoded_response.length() + 1,
-                   MPI_BYTE, r, TAG_NOTIFY, MPI_COMM_WORLD);
-        }
-
-        // Perform the reduction. All nodes should end up performing
-        // the same reduction.
-        PerformCollectiveOp(mpi_global.tensor_table, response);
-      }
-
-      // Notify all nodes that we are done with the reductions for this
-      // tick.
-      MPIResponse done_response;
-      should_shut_down = mpi_global.shut_down;
-      done_response.set_response_type(
-          mpi_global.shut_down ? MPIResponse::SHUTDOWN : MPIResponse::DONE);
-      std::string encoded_response;
-      done_response.SerializeToString(&encoded_response);
-      for (int r = 1; r < size; r++) {
-        MPI_Send(encoded_response.c_str(), encoded_response.length() + 1,
-                 MPI_BYTE, r, TAG_NOTIFY, MPI_COMM_WORLD);
-      }
-    } else {
-      // Notify the coordinator that this node is done sending messages.
-      // A DONE message is encoded as a zero-length message.
-      MPI_Send(NULL, 0, MPI_BYTE, RANK_ZERO, TAG_NOTIFY, MPI_COMM_WORLD);
-
-      // Receive names for tensors to reduce from rank zero. Once we
-      // receive a empty DONE message, stop waiting for more names.
-      while (true) {
-        MPI_Status status;
-        MPI_Probe(0, TAG_NOTIFY, MPI_COMM_WORLD, &status);
-
-        // Find number of characters in message (including zero byte).
-        int msg_length;
-        MPI_Get_count(&status, MPI_BYTE, &msg_length);
-
-        // Get tensor name from MPI into an std::string.
-        char* buffer = new char[msg_length];
-        MPI_Recv(buffer, msg_length, MPI_BYTE, 0, TAG_NOTIFY, MPI_COMM_WORLD,
-                 &status);
-        std::string received_message(buffer);
-        delete[] buffer;
-
-        MPIResponse response;
-        response.ParseFromString(received_message);
-        if (response.response_type() == MPIResponse::DONE) {
-          // No more messages this tick
-          break;
-        } else if (response.response_type() == MPIResponse::SHUTDOWN) {
-          // No more messages this tick, and the background thread
-          // should shut down
-          should_shut_down = true;
-          break;
-        } else {
-          // Process the current message
-          PerformCollectiveOp(mpi_global.tensor_table, response);
-        }
-      }
-    }
-  } while (!should_shut_down);
-
-  MPI_Finalize();
-}
-
-// Initialize MPI and start the MPI background thread. Ensure that this is
-// only done once no matter how many times this function is called.
-Status InitializeMPIOnce(bool gpu) {
-  // Ensure MPI is only initialized once.
-  if (mpi_global.initialized_flag.test_and_set()) return mpi_global.init_status;
-
-  mpi_global.device = -1;
-#if GOOGLE_CUDA
-  if (gpu) {
-    cudaGetDevice(&mpi_global.device);
-  }
-#endif
-
-  // Start the MPI background thread, which assumes MPI is initialized
-  // TODO: Change this to a Tensorflow thread
-  mpi_global.background_thread = std::thread(BackgroundThreadLoop);
-
-  // Wait to ensure that the background thread has finished initializing MPI
-  mutex_lock guard(mpi_global.mu);
-  mpi_global.cv.wait(guard);
-  if (!mpi_global.initialization_done) {
-    mpi_global.init_status =
-        errors::Unknown("Failed to wait for MPI initialization.");
-  }
-
-  return mpi_global.init_status;
-}
-
-// Check that MPI is initialized.
-Status IsMPIInitialized() {
-  if (!mpi_global.initialization_done) {
-    return errors::FailedPrecondition(
-        "MPI has not been initialized; use tf.contrib.mpi.Session.");
-  }
-  return Status::OK();
-}
-
-// This function (called from the callback set up in MPIAll*Op::ComputeAsync)
-// only adds the op's record into the local op queue (to track the op's
-// progress), and sends a message to the coordinator indicating that this rank
-// is ready to begin. The MPI background thread will handle the MPI message.
-void EnqueueTensorCollective(CollectiveOpRecord record,
-                             MPIRequest::RequestType rtype) {
-  const Tensor* input_tensor = record.in_t;
-  MPIRequest message;
-  message.set_request_rank(record.rank);
-  message.set_tensor_name(record.name);
-  message.set_tensor_type(record.dtype);
-  message.set_request_type(rtype);
-  input_tensor->shape().AsProto(message.mutable_tensor_shape());
-
-  mutex_lock guard(mpi_global.mu);
-  mpi_global.tensor_table.emplace(record.name, record);
-  mpi_global.message_queue.push(message);
-}
-
-}  // namespace
-
-#if GOOGLE_CUDA
-cudaStream_t CudaStreamForMPI() { return mpi_global.stream; }
-#endif
-
-// Op to initialize MPI in the current process. The settings used in the
-// configuration are the same that must be used for all future MPI ops.
-template <typename Device>
-class MPIInitOp : public OpKernel {
- public:
-  explicit MPIInitOp(OpKernelConstruction* context) : OpKernel(context) {}
-
-  void Compute(OpKernelContext* context) override {
-    bool on_gpu = IsGPUDevice<Device>();
-    OP_REQUIRES_OK(context, InitializeMPIOnce(on_gpu));
-  }
-};
-
-REGISTER_KERNEL_BUILDER(Name("MPIInit").Device(DEVICE_CPU),
-                        MPIInitOp<CPUDevice>);
-#if GOOGLE_CUDA
-REGISTER_KERNEL_BUILDER(Name("MPIInit").Device(DEVICE_GPU),
-                        MPIInitOp<GPUDevice>);
-#endif
-
-REGISTER_OP("MPIInit").Doc(R"doc(
-Initialize MPI for the current process.
-
-If this is run on a GPU, then that GPU must be used for all future MPI
-operations. If it is run on CPU, then all future MPI operations must also
-run on CPU.
-)doc");
-
-// Op to get the current MPI Size.
-template <typename Device>
-class MPISizeOp : public OpKernel {
- public:
-  explicit MPISizeOp(OpKernelConstruction* context) : OpKernel(context) {}
-
-  void Compute(OpKernelContext* context) override {
-    OP_REQUIRES_OK(context, IsMPIInitialized());
-
-    // Write integer to output tensor
-    Tensor* output;
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(0, TensorShape({}), &output));
-
-    auto flat = output->flat<int>();
-    flat(0) = mpi_global.size;
-  }
-};
-
-REGISTER_KERNEL_BUILDER(Name("MPISize").Device(DEVICE_CPU),
-                        MPISizeOp<CPUDevice>);
-#if GOOGLE_CUDA
-REGISTER_KERNEL_BUILDER(Name("MPISize").Device(DEVICE_GPU).HostMemory("size"),
-                        MPISizeOp<GPUDevice>);
-#endif
-
-REGISTER_OP("MPISize")
-    .Output("size: int32")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      c->set_output(0, c->Scalar());
-      return Status::OK();
-    })
-    .Doc(R"doc(
-Returns the number of running MPI processes.
-
-More precisely, returns the number of MPI processes in the group associated
-with the MPI_COMM_WORLD communicator.
-
-size:   Size of the MPI group.
-)doc");
-
-// Op to get the current MPI Rank.
-template <typename Device>
-class MPIRankOp : public OpKernel {
- public:
-  explicit MPIRankOp(OpKernelConstruction* context) : OpKernel(context) {}
-
-  void Compute(OpKernelContext* context) override {
-    OP_REQUIRES_OK(context, IsMPIInitialized());
-
-    // Write integer to output tensor
-    Tensor* output;
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(0, TensorShape({}), &output));
-
-    auto flat = output->flat<int>();
-    flat(0) = mpi_global.rank;
-  }
-};
-
-REGISTER_KERNEL_BUILDER(Name("MPIRank").Device(DEVICE_CPU),
-                        MPIRankOp<CPUDevice>);
-#if GOOGLE_CUDA
-REGISTER_KERNEL_BUILDER(Name("MPIRank").Device(DEVICE_GPU).HostMemory("rank"),
-                        MPIRankOp<GPUDevice>);
-#endif
-
-REGISTER_OP("MPIRank")
-    .Output("rank: int32")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      c->set_output(0, c->Scalar());
-      return Status::OK();
-    })
-    .Doc(R"doc(
-Returns the index of the current process in the MPI group.
-
-More precisely, returns the rank of the calling process in the MPI_COMM_WORLD
-communicator.
-
-rank:   Rank of the calling process.
-)doc");
-
-// Op to get the current local MPI Rank.
-template <typename Device>
-class MPILocalRankOp : public OpKernel {
- public:
-  explicit MPILocalRankOp(OpKernelConstruction* context) : OpKernel(context) {}
-
-  void Compute(OpKernelContext* context) override {
-    OP_REQUIRES_OK(context, IsMPIInitialized());
-
-    // Write integer to output tensor
-    Tensor* output;
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(0, TensorShape({}), &output));
-
-    auto flat = output->flat<int>();
-    flat(0) = mpi_global.local_rank;
-  }
-};
-
-REGISTER_KERNEL_BUILDER(Name("MPILocalRank").Device(DEVICE_CPU),
-                        MPILocalRankOp<CPUDevice>);
-#if GOOGLE_CUDA
-REGISTER_KERNEL_BUILDER(
-    Name("MPILocalRank").Device(DEVICE_GPU).HostMemory("rank"),
-    MPILocalRankOp<GPUDevice>);
-#endif
-
-REGISTER_OP("MPILocalRank")
-    .Output("rank: int32")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      c->set_output(0, c->Scalar());
-      return Status::OK();
-    })
-    .Doc(R"doc(
-Returns the index of the current process in the node it is on.
-
-More precisely, returns the rank of the calling process in a communicator that
-only spans the MPI processes running on that node.
-
-rank:   Rank of the calling process on the node it is on.
-)doc");
-
-template <typename Device>
-class MPIAllreduceOp : public AsyncOpKernel {
- public:
-  explicit MPIAllreduceOp(OpKernelConstruction* context)
-      : AsyncOpKernel(context) {}
-
-  // Although this op is handled asynchronously, the ComputeAsync call is
-  // very inexpensive. It only sets up a CollectiveOpRecord and places it
-  // in the table for the background thread to handle. Thus, we do not need
-  // a TF pool thread to perform the op.
-  bool IsExpensive() override { return false; }
-
-  void ComputeAsync(OpKernelContext* context, DoneCallback done) override {
-    OP_REQUIRES_OK_ASYNC(context, IsMPIInitialized(), done);
-    const Tensor* input_tensor = &context->input(0);
-    Tensor* output_tensor;
-    OP_REQUIRES_OK_ASYNC(
-        context,
-        context->allocate_output(0, input_tensor->shape(), &output_tensor),
-        done);
-
-    // Record allocated on stack so op can fail without memory leak
-    CollectiveOpRecord record;
-    record.name = name();
-    record.context = context;
-    record.in_t = input_tensor;
-    record.out_t = output_tensor;
-    record.on_gpu = IsGPUDevice<Device>();
-    record.dtype = input_tensor->dtype();
-
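-    // Scratch space for one ring-allreduce segment: the ceiling division
-    // below sizes it to the largest per-rank segment (see RingAllreduce).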
-    const size_t temp_size =
-        (input_tensor->NumElements() + mpi_global.size - 1) / mpi_global.size;
-    TensorShape temp_shape;
-    temp_shape.AddDim(temp_size);
-    OP_REQUIRES_OK_ASYNC(context,
-                         context->allocate_temp(input_tensor->dtype(),
-                                                temp_shape, &record.temp_t),
-                         done);
-
-    auto allreduce_done_callback = [done, context](StatusOr<Tensor> status) {
-      context->SetStatus(status.status());
-      done();
-    };
-    record.callback = allreduce_done_callback;
-
-    auto allreduce_launch_callback = [record] {
-      EnqueueTensorCollective(record, MPIRequest::ALLREDUCE);
-    };
-
-    // If we are on a CPU, our device context will be null and we can't
-    // get a stream to enqueue this on. On a CPU this op is called when the
-    // data is already available, so we can just immediately do the
-    // allreduce; we don't have to wait for the data to get populated.
-#if GOOGLE_CUDA
-    auto device_context = context->op_device_context();
-    if (device_context == nullptr) {
-      allreduce_launch_callback();
-    } else {
-      auto stream = device_context->stream();
-      stream->ThenDoHostCallback(allreduce_launch_callback);
-    }
-#else
-    allreduce_launch_callback();
-#endif
-  }
-};
-
-REGISTER_KERNEL_BUILDER(Name("MPIAllreduce").Device(DEVICE_CPU),
-                        MPIAllreduceOp<CPUDevice>);
-#if GOOGLE_CUDA
-REGISTER_KERNEL_BUILDER(Name("MPIAllreduce").Device(DEVICE_GPU),
-                        MPIAllreduceOp<GPUDevice>);
-#endif
-
-REGISTER_OP("MPIAllreduce")
-    .Attr("T: {int32, int64, float32}")
-    .Input("tensor: T")
-    .Output("sum: T")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      c->set_output(0, c->input(0));
-      return Status::OK();
-    })
-    .Doc(R"doc(
-Perform an MPI Allreduce on a tensor. Tensors are matched across processes by
-node name, so every process participating in a reduction under a given name
-must supply a tensor of the same shape.
-
-Arguments
-    tensor:     A tensor to reduce.
-
-Output
-    sum:        A tensor with the same shape as `tensor`, summed across all
-                MPI processes.
-)doc");
-
-template <typename Device>
-class MPIAllgatherOp : public AsyncOpKernel {
- public:
-  explicit MPIAllgatherOp(OpKernelConstruction* context)
-      : AsyncOpKernel(context) {}
-
-  // Although this op is handled asynchronously, the ComputeAsync call is
-  // very inexpensive. It only sets up a CollectiveOpRecord and places it
-  // in the table for the background thread to handle. Thus, we do not need
-  // a TF pool thread to perform the op.
-  bool IsExpensive() override { return false; }
-
-  void ComputeAsync(OpKernelContext* context, DoneCallback done) override {
-    OP_REQUIRES_OK_ASYNC(context, IsMPIInitialized(), done);
-    const Tensor* input_tensor = &context->input(0);
-    const Tensor* sizing_tensor = &context->input(1);
-
-    // Record allocated on stack so op can fail without memory leak
-    CollectiveOpRecord record;
-    record.name = name();
-    record.context = context;
-    record.in_t = input_tensor;
-    record.on_gpu = IsGPUDevice<Device>();
-
-    // Construct the output size from the sizing tensor
-    size_t output_first_dim = 0;
-    if (sizing_tensor->shape().dims() == 0) {
-      // 0-dim sizing_tensor implies that the op is just gathering
-      // a single element from each rank
-      output_first_dim = mpi_global.size;
-      for (int i = 0; i < mpi_global.size; i++) {
-        record.sizes_vec.push_back(1);
-      }
-    } else {
-      // Collect the total output tensor sizing from the sizing tensor
-      // NOTE: The sizing tensor is forced to be placed on the CPU by
-      // declaring the input as HostMemory, so it is valid to read it here.
-      const int64* sizing_array =
-          (const int64*)sizing_tensor->tensor_data().data();
-      for (int i = 0; i < mpi_global.size; i++) {
-        record.sizes_vec.push_back(sizing_array[i]);
-        output_first_dim += sizing_array[i];
-      }
-    }
-
-    TensorShape output_shape;
-    output_shape.AddDim(output_first_dim);
-    for (int i = 1; i < input_tensor->shape().dims(); i++) {
-      output_shape.AddDim(input_tensor->shape().dim_size(i));
-    }
-
-    Tensor* output_tensor;
-    OP_REQUIRES_OK_ASYNC(
-        context, context->allocate_output(0, output_shape, &output_tensor),
-        done);
-
-    record.out_t = output_tensor;
-    record.dtype = input_tensor->dtype();
-
-    auto allgather_done_callback = [done, context](StatusOr<Tensor> status) {
-      context->SetStatus(status.status());
-      done();
-    };
-    record.callback = allgather_done_callback;
-
-    auto allgather_launch_callback = [record] {
-      EnqueueTensorCollective(record, MPIRequest::ALLGATHER);
-    };
-
-    // If we are on a CPU, our device context will be null and we can't
-    // get a stream to enqueue this on. On a CPU this op is called when the
-    // data is already available, so we can just immediately do the
-    // allgather; we don't have to wait for the data to get populated.
-#if GOOGLE_CUDA
-    auto device_context = context->op_device_context();
-    if (device_context == nullptr) {
-      allgather_launch_callback();
-    } else {
-      auto stream = device_context->stream();
-      stream->ThenDoHostCallback(allgather_launch_callback);
-    }
-#else
-    allgather_launch_callback();
-#endif
-  }
-};
-
-REGISTER_OP("MPIAllgather")
-    .Attr("T: {int32, int64, float32}")
-    .Attr("S: {int64}")
-    .Input("tensor: T")
-    .Input("sizes: S")
-    .Output("gathered: T")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      shape_inference::ShapeHandle output;
-      TF_RETURN_IF_ERROR(
-          c->ReplaceDim(c->input(0), 0, c->UnknownDim(), &output));
-      c->set_output(0, output);
-      return Status::OK();
-    })
-    .Doc(R"doc(
-Perform an MPI Allgather on a tensor. All processes gathering a tensor under
-the same name must supply tensors of the same rank (number of dimensions),
-with matching sizes on all but the first dimension.
-
-Arguments
-    tensor:     A tensor to gather.
-    sizes:      A tensor containing the first-dimension sizes of tensors to be
-                gathered from other ranks.
-
-Output
-    gathered:   A tensor with the same shape as `tensor` except for the first
-                dimension, which is the sum of dimensions in `sizes`.
-)doc");
-
-REGISTER_KERNEL_BUILDER(
-    Name("MPIAllgather").Device(DEVICE_CPU).HostMemory("sizes"),
-    MPIAllgatherOp<CPUDevice>);
-#if GOOGLE_CUDA
-REGISTER_KERNEL_BUILDER(
-    Name("MPIAllgather").Device(DEVICE_GPU).HostMemory("sizes"),
-    MPIAllgatherOp<GPUDevice>);
-#endif
-
-}  // namespace mpi
-}  // namespace contrib
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_USE_MPI
diff --git a/tensorflow/contrib/mpi_collectives/mpi_ops.py b/tensorflow/contrib/mpi_collectives/mpi_ops.py
deleted file mode 100644
index 81567cc..0000000
--- a/tensorflow/contrib/mpi_collectives/mpi_ops.py
+++ /dev/null
@@ -1,165 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================
-"""Inter-process communication using MPI."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf
-
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import load_library
-from tensorflow.python.framework import ops
-from tensorflow.python.platform import resource_loader
-from tensorflow.python.platform import tf_logging as logging
-
-
-def _load_library(name, op_list=None):
-  """Loads a .so file containing the specified operators.
-
-  Args:
-    name: The name of the .so file to load.
-    op_list: A list of names of operators that the library should have. If None
-        then the .so file's contents will not be verified.
-
-  Raises:
-    NameError: if one of the required ops is missing.
-  """
-  try:
-    filename = resource_loader.get_path_to_datafile(name)
-    library = load_library.load_op_library(filename)
-    for expected_op in (op_list or []):
-      for lib_op in library.OP_LIST.op:
-        if lib_op.name == expected_op:
-          break
-      else:
-        raise NameError(
-          'Could not find operator %s in dynamic library %s' %
-          (expected_op, name))
-    return library
-  except errors.NotFoundError:
-    logging.warning('%s file could not be loaded.', name)
-
-
-MPI_LIB = _load_library('mpi_collectives.so', ['MPISize', 'MPIRank',
-                                               'MPILocalRank', 'MPIAllgather',
-                                               'MPIAllreduce'])
-
-
-def size(name=None):
-  """An op which returns the number of MPI processes.
-
-  This is equivalent to running `MPI_Comm_size(MPI_COMM_WORLD, ...)` to get the
-  size of the global communicator.
-
-  Returns:
-    An integer scalar containing the number of MPI processes.
-  """
-  return MPI_LIB.mpi_size(name=name)
-
-
-ops.NotDifferentiable('MPISize')
-
-
-def rank(name=None):
-  """An op which returns the MPI rank of the calling process.
-
-  This is equivalent to running `MPI_Comm_rank(MPI_COMM_WORLD, ...)` to get the
-  rank of the current process in the global communicator.
-
-  Returns:
-    An integer scalar with the MPI rank of the calling process.
-  """
-  return MPI_LIB.mpi_rank(name=name)
-
-
-ops.NotDifferentiable('MPIRank')
-
-
-def init(name=None):
-  """An op which initializes MPI on the device on which it is run.
-
-  All future MPI ops must be run on the same device that the `init` op was run
-  on.
-  """
-  return MPI_LIB.mpi_init(name=name)
-
-
-ops.NotDifferentiable('MPIInit')
-
-
-def local_rank(name=None):
-  """An op which returns the local MPI rank of the calling process, within the
-  node that it is running on. For example, if there are seven processes running
-  on a node, their local ranks will be zero through six, inclusive.
-
-  This is equivalent to running `MPI_Comm_rank(...)` on a new communicator
-  which only includes processes on the same node.
-
-  Returns:
-    An integer scalar with the local MPI rank of the calling process.
-  """
-  return MPI_LIB.mpi_local_rank(name=name)
-
-
-ops.NotDifferentiable('MPILocalRank')
-
-
-def _allreduce(tensor, name=None):
-  """An op which sums an input tensor over all the MPI processes.
-
-  The reduction operation is keyed by the name of the op. The tensor type and
-  shape must be the same on all MPI processes for a given name. The reduction
-  will not start until all processes are ready to send and receive the tensor.
-
-  Returns:
-    A tensor of the same shape and type as `tensor`, summed across all
-    processes.
-  """
-  return MPI_LIB.mpi_allreduce(tensor, name=name)
-
-
-ops.NotDifferentiable('MPIAllreduce')
-
-
-def allgather(tensor, name=None):
-  """An op which concatenates the input tensor with the same input tensor on
-  all other MPI processes.
-
-  The concatenation is done on the first dimension, so the input tensors on the
-  different processes must have the same rank and shape, except for the first
-  dimension, which is allowed to be different.
-
-  Returns:
-    A tensor of the same type as `tensor`, concatenated on dimension zero
-    across all processes. The shape is identical to the input shape, except for
-    the first dimension, which may be greater and is the sum of all first
-    dimensions of the tensors in different MPI processes.
-  """
-  # Specify that first allgather is to collect the tensor gather sizes,
-  # indicated by passing in a scalar (0-D tensor) of value 0
-  sizes_flag = tf.constant(0, dtype=tf.int64, name="size_flag_const")
-  my_size = tf.slice(tf.shape(tensor, out_type=tf.int64), [0], [1], name="size_slice")
-  if name is None:
-    name = "allgather"
-  sizing_name = "{}_sizing".format(name)
-  sizes = MPI_LIB.mpi_allgather(my_size, sizes_flag, name=sizing_name)
-  return MPI_LIB.mpi_allgather(tensor, sizes, name=name)
-
-
-ops.NotDifferentiable('MPIAllgather')
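
A matching usage sketch for the two-phase allgather above, under the same assumptions (contrib import path, script launched under MPI with two ranks); the expected shape follows from the docstring rather than a verified run.

```python
# Usage sketch: variable first-dimension allgather across two MPI ranks.
import tensorflow as tf
from tensorflow.contrib.mpi_collectives import mpi_ops

with tf.Session() as sess:
    sess.run(mpi_ops.init())
    rank = sess.run(mpi_ops.rank())
    # Rank 0 contributes 1 row, rank 1 contributes 2 rows.
    local = tf.fill([rank + 1, 3], float(rank))
    gathered = mpi_ops.allgather(local)
    # First the 0-D "sizing" allgather exchanges row counts, then the data
    # allgather concatenates rows: shape (1 + 2, 3) == (3, 3) on both ranks.
    print(sess.run(gathered).shape)
```
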
-
-
diff --git a/tensorflow/contrib/mpi_collectives/ring.cc b/tensorflow/contrib/mpi_collectives/ring.cc
deleted file mode 100644
index d93233e..0000000
--- a/tensorflow/contrib/mpi_collectives/ring.cc
+++ /dev/null
@@ -1,80 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifdef TENSORFLOW_USE_MPI
-
-#define EIGEN_USE_THREADS
-
-#include "tensorflow/contrib/mpi_collectives/ring.h"
-
-namespace tensorflow {
-namespace contrib {
-namespace mpi {
-
-using CPUDevice = Eigen::ThreadPoolDevice;
-
-extern template MPI_Datatype MPIType<float>();
-extern template MPI_Datatype MPIType<int>();
-extern template MPI_Datatype MPIType<long long>();
-extern template DataType TensorFlowDataType<float>();
-extern template DataType TensorFlowDataType<int>();
-extern template DataType TensorFlowDataType<long long>();
-
-// Generate all necessary specializations for RingAllreduce.
-template Status RingAllreduce<CPUDevice, int>(OpKernelContext*, const Tensor*,
-                                              Tensor*, Tensor*);
-template Status RingAllreduce<CPUDevice, long long>(OpKernelContext*,
-                                                    const Tensor*, Tensor*,
-                                                    Tensor*);
-template Status RingAllreduce<CPUDevice, float>(OpKernelContext*, const Tensor*,
-                                                Tensor*, Tensor*);
-
-// Generate all necessary specializations for RingAllgather.
-template Status RingAllgather<CPUDevice, int>(OpKernelContext*, const Tensor*,
-                                              const std::vector<size_t>&,
-                                              Tensor*);
-template Status RingAllgather<CPUDevice, long long>(OpKernelContext*,
-                                                    const Tensor*,
-                                                    const std::vector<size_t>&,
-                                                    Tensor*);
-template Status RingAllgather<CPUDevice, float>(OpKernelContext*, const Tensor*,
-                                                const std::vector<size_t>&,
-                                                Tensor*);
-
-// Copy data on a CPU using a straightforward memcpy.
-template <>
-void CopyTensorData<CPUDevice>(void* dst, void* src, size_t size) {
-  std::memcpy(dst, src, size);
-};
-
-// Accumulate values on a CPU.
-#define GENERATE_ACCUMULATE(type)                                    \
-  template <>                                                        \
-  void AccumulateTensorData<CPUDevice, type>(type * dst, type * src, \
-                                             size_t size) {          \
-    for (unsigned int i = 0; i < size; i++) {                        \
-      dst[i] += src[i];                                              \
-    }                                                                \
-  };
-GENERATE_ACCUMULATE(int);
-GENERATE_ACCUMULATE(long long);
-GENERATE_ACCUMULATE(float);
-#undef GENERATE_ACCUMULATE
-
-}  // namespace mpi
-}  // namespace contrib
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_USE_MPI
diff --git a/tensorflow/contrib/mpi_collectives/ring.cu.cc b/tensorflow/contrib/mpi_collectives/ring.cu.cc
deleted file mode 100644
index 2f3eef3..0000000
--- a/tensorflow/contrib/mpi_collectives/ring.cu.cc
+++ /dev/null
@@ -1,117 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifdef TENSORFLOW_USE_MPI
-
-#if GOOGLE_CUDA
-
-#define EIGEN_USE_GPU
-
-#include "tensorflow/contrib/mpi_collectives/ring.h"
-
-namespace tensorflow {
-namespace contrib {
-namespace mpi {
-
-using CPUDevice = Eigen::ThreadPoolDevice;
-
-template <>
-MPI_Datatype MPIType<float>() {
-  return MPI_FLOAT;
-};
-template <>
-MPI_Datatype MPIType<int>() {
-  return MPI_INT;
-};
-template <>
-MPI_Datatype MPIType<long long>() {
-  return MPI_LONG_LONG;
-};
-
-template <>
-DataType TensorFlowDataType<float>() {
-  return DT_FLOAT;
-};
-template <>
-DataType TensorFlowDataType<int>() {
-  return DT_INT32;
-};
-template <>
-DataType TensorFlowDataType<long long>() {
-  return DT_INT64;
-};
-
-// Generate all necessary specializations for RingAllreduce.
-template Status RingAllreduce<GPUDevice, int>(OpKernelContext*, const Tensor*,
-                                              Tensor*, Tensor*);
-template Status RingAllreduce<GPUDevice, long long>(OpKernelContext*,
-                                                    const Tensor*, Tensor*,
-                                                    Tensor*);
-template Status RingAllreduce<GPUDevice, float>(OpKernelContext*, const Tensor*,
-                                                Tensor*, Tensor*);
-
-// Generate all necessary specializations for RingAllgather.
-template Status RingAllgather<GPUDevice, int>(OpKernelContext*, const Tensor*,
-                                              const std::vector<size_t>&,
-                                              Tensor*);
-template Status RingAllgather<GPUDevice, long long>(OpKernelContext*,
-                                                    const Tensor*,
-                                                    const std::vector<size_t>&,
-                                                    Tensor*);
-template Status RingAllgather<GPUDevice, float>(OpKernelContext*, const Tensor*,
-                                                const std::vector<size_t>&,
-                                                Tensor*);
-
-// Synchronously copy data on the GPU, using a stream different from both the
-// default stream and TensorFlow's own, so that we do not synchronize on
-// operations unrelated to the allreduce.
-template <>
-void CopyTensorData<GPUDevice>(void* dst, void* src, size_t size) {
-  auto stream = CudaStreamForMPI();
-  cudaMemcpyAsync(dst, src, size, cudaMemcpyDeviceToDevice, stream);
-  cudaStreamSynchronize(stream);
-};
-
-// Elementwise accumulation kernel for GPU.
-template <typename T>
-__global__ void elemwise_accum(T* out, const T* in, const size_t N) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
-       i += blockDim.x * gridDim.x) {
-    out[i] += in[i];
-  }
-}
-
-// Synchronously accumulate tensors on the GPU, using a stream different from
-// both the default stream and TensorFlow's own, so that we do not synchronize
-// on operations unrelated to the allreduce.
-#define GENERATE_ACCUMULATE(type)                                    \
-  template <>                                                        \
-  void AccumulateTensorData<GPUDevice, type>(type * dst, type * src, \
-                                             size_t size) {          \
-    auto stream = CudaStreamForMPI();                                \
-    elemwise_accum<type><<<32, 256, 0, stream>>>(dst, src, size);    \
-    cudaStreamSynchronize(stream);                                   \
-  };
-GENERATE_ACCUMULATE(int);
-GENERATE_ACCUMULATE(long long);
-GENERATE_ACCUMULATE(float);
-#undef GENERATE_ACCUMULATE
-
-}  // namespace mpi
-}  // namespace contrib
-}  // namespace tensorflow
-#endif  // GOOGLE_CUDA
-
-#endif  // TENSORFLOW_USE_MPI
diff --git a/tensorflow/contrib/mpi_collectives/ring.h b/tensorflow/contrib/mpi_collectives/ring.h
deleted file mode 100644
index cae57ce..0000000
--- a/tensorflow/contrib/mpi_collectives/ring.h
+++ /dev/null
@@ -1,327 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CONTRIB_MPI_H_
-#define TENSORFLOW_CONTRIB_MPI_H_
-
-#ifdef TENSORFLOW_USE_MPI
-
-#include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/shape_inference.h"
-
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "tensorflow/core/framework/tensor_types.h"
-
-#if GOOGLE_CUDA
-#include "cuda_runtime.h"
-#endif
-
-// Needed to avoid header issues with C++-supporting MPI implementations
-#define OMPI_SKIP_MPICXX
-#include "third_party/mpi/mpi.h"
-
-#define TAG_TENSOR 12
-
-namespace tensorflow {
-namespace contrib {
-namespace mpi {
-
-using CPUDevice = Eigen::ThreadPoolDevice;
-using GPUDevice = Eigen::GpuDevice;
-
-// Convert from templated types to values we can pass to MPI.
-template <typename T>
-MPI_Datatype MPIType();
-
-// Convert from templated types to TensorFlow data types.
-template <typename T>
-DataType TensorFlowDataType();
-
-#define MPI_REQUIRES_OK(MPI_STATUS)                               \
-  if ((MPI_STATUS) != MPI_SUCCESS) {                              \
-    return errors::Unknown("MPI operation failed unexpectedly."); \
-  }
-
-// Copy data from one tensor to another tensor.
-// This uses a custom CUDA stream on GPU, which is necessary to overlap the
-// backpropagation computations with the allreduce.
-template <typename Device>
-void CopyTensorData(void* destination, void* source, size_t size);
-
-// Add a tensor into another tensor, accumulating in place.
-// This uses a custom CUDA stream on GPU, which is necessary to overlap the
-// backpropagation computations with the allreduce.
-template <typename Device, typename T>
-void AccumulateTensorData(T* destination, T* source, size_t size);
-
-// We need to get the right stream for doing CUDA memory transfers and
-// operations, which is possibly different from the standard TensorFlow stream.
-#if GOOGLE_CUDA
-cudaStream_t CudaStreamForMPI();
-#endif
-
-/* Perform a ring allreduce on the data, writing the result into the
- * caller-allocated output tensor passed as the output parameter.
- *
- * Assumes that all MPI processes are doing an allreduce of the same tensor,
- * with the same dimensions.
- *
- * A ring allreduce is a bandwidth-optimal way to do an allreduce. To do the
- * allreduce, the nodes involved are arranged in a ring:
- *
- *                   .--0--.
- *                  /       \
- *                 3         1
- *                  \       /
- *                   *--2--*
- *
- *  Each node always sends to the next clockwise node in the ring, and receives
- *  from the previous one.
- *
- *  The allreduce is done in two parts: a scatter-reduce and an allgather. In
- *  the scatter reduce, a reduction is done, so that each node ends up with a
- *  chunk of the final output tensor which has contributions from all other
- *  nodes.  In the allgather, those chunks are distributed among all the nodes,
- *  so that all nodes have the entire output tensor.
- *
- *  Both of these operations are done by dividing the input tensor into N
- *  evenly sized chunks (where N is the number of nodes in the ring).
- *
- *  The scatter-reduce is done in N-1 steps. In the ith step, node j will send
- *  the (j - i)th chunk and receive the (j - i - 1)th chunk, adding it in to
- *  its existing data for that chunk. For example, in the first iteration with
- *  the ring depicted above, you will have the following transfers:
- *
- *      Segment 0:  Node 0 --> Node 1
- *      Segment 1:  Node 1 --> Node 2
- *      Segment 2:  Node 2 --> Node 3
- *      Segment 3:  Node 3 --> Node 0
- *
- *  In the second iteration, you'll have the following transfers:
- *
- *      Segment 0:  Node 1 --> Node 2
- *      Segment 1:  Node 2 --> Node 3
- *      Segment 2:  Node 3 --> Node 0
- *      Segment 3:  Node 0 --> Node 1
- *
- *  After this iteration, Node 2 has 3 of the four contributions to Segment 0.
- *  The last iteration has the following transfers:
- *
- *      Segment 0:  Node 2 --> Node 3
- *      Segment 1:  Node 3 --> Node 0
- *      Segment 2:  Node 0 --> Node 1
- *      Segment 3:  Node 1 --> Node 2
- *
- *  After this iteration, Node 3 has the fully accumulated Segment 0; Node 0
- *  has the fully accumulated Segment 1; and so on. The scatter-reduce is
- *  complete.
- *
- *  Next, the allgather distributes these fully accumulated chunks across all
- * nodes. Communication proceeds in the same ring, once again in N-1 steps. At
- * the ith step, node j will send chunk (j - i + 1) and receive chunk (j - i).
- * For example, at the first iteration, the following transfers will occur:
- *
- *      Segment 0:  Node 3 --> Node 0
- *      Segment 1:  Node 0 --> Node 1
- *      Segment 2:  Node 1 --> Node 2
- *      Segment 3:  Node 2 --> Node 3
- *
- * After the first iteration, Node 0 will have a fully accumulated Segment 0
- * (from Node 3) and Segment 1. In the next iteration, Node 0 will send its
- * just-received Segment 0 onward to Node 1, and receive Segment 3 from Node 3.
- * After this has continued for N - 1 iterations, all nodes will have the
- * fully accumulated tensor.
- *
- * Each node will do (N-1) sends for the scatter-reduce and (N-1) sends for the
- * allgather. Each send will contain K / N bytes, if there are K bytes in the
- * original tensor on every node. Thus, each node sends and receives 2K(N - 1)/N
- * bytes of data, and the performance of the allreduce (assuming no latency in
- * connections) is constrained by the slowest interconnect between the nodes.
- *
- */
-template <typename Device, typename T>
-Status RingAllreduce(OpKernelContext* context, const Tensor* input,
-                     Tensor* temp, Tensor* output) {
-  // Acquire MPI size and rank
-  int n, r;
-  MPI_REQUIRES_OK(MPI_Comm_size(MPI_COMM_WORLD, &n));
-  MPI_REQUIRES_OK(MPI_Comm_rank(MPI_COMM_WORLD, &r));
-
-  T* buffer = (T*)output->tensor_data().data();
-
-  CopyTensorData<Device>((void*)buffer, (void*)input->tensor_data().data(),
-                         output->tensor_data().size());
-
-  // Calculate segment sizes and segment ends
-  const size_t elements_to_reduce = input->NumElements();
-  const size_t segment_size = elements_to_reduce / n;
-  std::vector<size_t> segment_sizes(n, segment_size);
-
-  const size_t residual = elements_to_reduce % n;
-  for (size_t i = 0; i < residual; ++i) {
-    segment_sizes[i]++;
-  }
-
-  std::vector<size_t> segment_starts(n);
-  segment_starts[0] = 0;
-  for (size_t i = 1; i < segment_starts.size(); ++i) {
-    segment_starts[i] = segment_starts[i - 1] + segment_sizes[i - 1];
-  }
-
-  assert(segment_starts[n - 1] + segment_sizes[n - 1] == elements_to_reduce);
-
-  T* segment_recv = (T*)temp->tensor_data().data();
-
-  // Receive from your left neighbor with wrap-around
-  const size_t recv_from = ((r - 1) + n) % n;
-
-  // Send to your right neighbor with wrap-around
-  const size_t send_to = (r + 1) % n;
-
-  MPI_Status recv_status;
-  MPI_Request recv_req;
-
-  // Now start the ring. At every step, each rank sends to and receives from
-  // its neighbors, iterating through segments with wraparound, and reduces
-  // locally. At the i'th iteration, rank r sends segment (r-i) and receives
-  // segment (r-i-1).
-  for (int i = 0; i < n - 1; i++) {
-    const size_t send_seg_id = ((r - i) + n) % n;
-    const size_t recv_seg_id = ((r - i - 1) + n) % n;
-
-    T* segment_send = &(buffer[segment_starts[send_seg_id]]);
-
-    MPI_REQUIRES_OK(MPI_Irecv(segment_recv, segment_sizes[recv_seg_id],
-                              MPIType<T>(), recv_from, TAG_TENSOR,
-                              MPI_COMM_WORLD, &recv_req));
-
-    MPI_REQUIRES_OK(MPI_Send(segment_send, segment_sizes[send_seg_id],
-                             MPIType<T>(), send_to, TAG_TENSOR,
-                             MPI_COMM_WORLD));
-
-    T* segment_update = &(buffer[segment_starts[recv_seg_id]]);
-
-    // Wait for recv to complete before reduction
-    MPI_REQUIRES_OK(MPI_Wait(&recv_req, &recv_status));
-
-    const size_t recv_seg_size = segment_sizes[recv_seg_id];
-    AccumulateTensorData<Device, T>(segment_update, segment_recv,
-                                    recv_seg_size);
-  }
-
-  // Now start the pipelined ring allgather. At every step, each rank sends
-  // to and receives from its neighbors, iterating through segments with
-  // wraparound. At the i'th iteration, rank r sends segment (r-i+1) and
-  // receives segment (r-i).
-  for (size_t i = 0; i < n - 1; ++i) {
-    const size_t send_seg_id = ((r - i + 1) + n) % n;
-    const size_t recv_seg_id = ((r - i) + n) % n;
-
-    // Segment to send - at every iteration we send segment (r-i+1)
-    T* segment_send = &(buffer[segment_starts[send_seg_id]]);
-
-    // Segment to recv - at every iteration we receive segment (r-i)
-    T* segment_recv = &(buffer[segment_starts[recv_seg_id]]);
-
-    MPI_REQUIRES_OK(MPI_Sendrecv(
-        segment_send, segment_sizes[send_seg_id], MPIType<T>(), send_to,
-        TAG_TENSOR, segment_recv, segment_sizes[recv_seg_id], MPIType<T>(),
-        recv_from, TAG_TENSOR, MPI_COMM_WORLD, &recv_status));
-  }
-
-  return Status::OK();
-}
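
To make the schedule above concrete, here is a standalone Python simulation (not part of this codebase) of both phases; it asserts that every rank ends with the full sum. The bandwidth claim follows directly: each rank sends 2(N-1) segments of roughly K/N bytes, i.e. 2K(N-1)/N bytes in total.

```python
# Standalone simulation of the ring allreduce schedule documented above.
import numpy as np

def simulate_ring_allreduce(inputs):
    n = len(inputs)
    k = len(inputs[0])
    buffers = [x.astype(np.float64).copy() for x in inputs]
    sizes = [k // n + (1 if i < k % n else 0) for i in range(n)]
    starts = np.cumsum([0] + sizes[:-1])

    def seg(buf, s):  # view of segment s within a rank's buffer
        return buf[starts[s]:starts[s] + sizes[s]]

    # Scatter-reduce: at step i, rank r sends segment (r - i) to rank r + 1
    # and accumulates segment (r - i - 1) received from rank r - 1.
    for i in range(n - 1):
        sent = [seg(buffers[r], (r - i) % n).copy() for r in range(n)]
        for r in range(n):
            seg(buffers[r], (r - i - 1) % n)[:] += sent[(r - 1) % n]

    # Allgather: at step i, rank r forwards segment (r - i + 1) and
    # overwrites segment (r - i) with the fully reduced copy it receives.
    for i in range(n - 1):
        sent = [seg(buffers[r], (r - i + 1) % n).copy() for r in range(n)]
        for r in range(n):
            seg(buffers[r], (r - i) % n)[:] = sent[(r - 1) % n]
    return buffers

inputs = [np.arange(10) * (r + 1) for r in range(4)]
assert all(np.array_equal(b, sum(inputs))
           for b in simulate_ring_allreduce(inputs))
```
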
-
-// Perform a ring allgather on a Tensor. Other ranks may allgather with a
-// tensor which differs in the first dimension only; all other dimensions must
-// be the same.
-//
-// For more information on the ring allgather, read the documentation for the
-// ring allreduce, which includes a ring allgather.
-template <typename Device, typename T>
-Status RingAllgather(OpKernelContext* context, const Tensor* input,
-                     const std::vector<size_t>& sizes, Tensor* output) {
-  // Acquire MPI size and rank
-  int n, r;
-  MPI_REQUIRES_OK(MPI_Comm_size(MPI_COMM_WORLD, &n));
-  MPI_REQUIRES_OK(MPI_Comm_rank(MPI_COMM_WORLD, &r));
-
-  assert(sizes.size() == n);
-  assert(input->dim_size(0) == sizes[r]);
-
-  // Compute the number of elements in every "row". We can't precompute the
-  // number of elements in each chunk, because chunks are variable length.
-  size_t elements_per_row = 1;
-  for (int i = 1; i < input->shape().dims(); i++) {
-    elements_per_row *= input->dim_size(i);
-  }
-
-  // Copy data from input tensor to correct place in output tensor.
-  std::vector<size_t> segment_starts(n);
-  segment_starts[0] = 0;
-  for (int i = 1; i < n; i++) {
-    segment_starts[i] = segment_starts[i - 1] + elements_per_row * sizes[i - 1];
-  }
-  size_t offset = segment_starts[r];
-
-  // Copy data to the right offset for this rank.
-  T* buffer = (T*)output->tensor_data().data();
-  CopyTensorData<Device>((void*)(buffer + offset),
-                         (void*)input->tensor_data().data(),
-                         elements_per_row * sizes[r] * sizeof(T));
-
-  // Receive from your left neighbor with wrap-around
-  const size_t recv_from = ((r - 1) + n) % n;
-
-  // Send to your right neighbor with wrap-around
-  const size_t send_to = (r + 1) % n;
-
-  // Perform a ring allgather. At every step, each rank sends to and receives
-  // from its neighbors, iterating through segments with wraparound.
-  // At the i'th iteration, rank r sends segment (r-i) and receives segment
-  // (r-i-1).
-  MPI_Status recv_status;
-  for (size_t i = 0; i < n - 1; ++i) {
-    const size_t send_seg_id = ((r - i) + n) % n;
-    const size_t recv_seg_id = ((r - i - 1) + n) % n;
-
-    // Segment to send - at every iteration we send segment (r-i)
-    size_t offset_send = segment_starts[send_seg_id];
-    size_t rows_send = sizes[send_seg_id];
-    T* segment_send = &(buffer[offset_send]);
-
-    // Segment to recv - at every iteration we receive segment (r-1-i)
-    size_t offset_recv = segment_starts[recv_seg_id];
-    size_t rows_recv = sizes[recv_seg_id];
-    T* segment_recv = &(buffer[offset_recv]);
-
-    MPI_REQUIRES_OK(MPI_Sendrecv(
-        segment_send, elements_per_row * rows_send, MPIType<T>(), send_to,
-        TAG_TENSOR, segment_recv, elements_per_row * rows_recv, MPIType<T>(),
-        recv_from, TAG_TENSOR, MPI_COMM_WORLD, &recv_status));
-  }
-
-  return Status::OK();
-}
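
A small standalone sketch of the offset arithmetic above, with hypothetical numbers:

```python
# Element offsets for a variable-length ring allgather: each rank's rows
# land at starts[r], where sizes holds per-rank first-dimension sizes.
sizes = [1, 3, 2]        # rows contributed by ranks 0, 1, 2
elements_per_row = 4     # product of the non-leading dimensions
starts = [0]
for s in sizes[:-1]:
    starts.append(starts[-1] + elements_per_row * s)
print(starts)            # [0, 4, 16]: rank r copies its rows to starts[r]
```
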
-
-}  // namespace mpi
-}  // namespace contrib
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_USE_MPI
-
-#undef TENSORFLOW_CONTRIB_MPI_H_
-#endif  // TENSORFLOW_CONTRIB_MPI_H_
diff --git a/tensorflow/contrib/summary/summary_test_internal.py b/tensorflow/contrib/summary/summary_test_internal.py
deleted file mode 100644
index 80f60ae..0000000
--- a/tensorflow/contrib/summary/summary_test_internal.py
+++ /dev/null
@@ -1,59 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Internal helpers for tests in this directory."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import functools
-import os
-import sqlite3
-
-from tensorflow.contrib.summary import summary_ops
-from tensorflow.python.framework import test_util
-
-
-class SummaryDbTest(test_util.TensorFlowTestCase):
-  """Helper for summary database testing."""
-
-  def setUp(self):
-    super(SummaryDbTest, self).setUp()
-    self.db_path = os.path.join(self.get_temp_dir(), 'DbTest.sqlite')
-    if os.path.exists(self.db_path):
-      os.unlink(self.db_path)
-    self.db = sqlite3.connect(self.db_path)
-    self.create_db_writer = functools.partial(
-        summary_ops.create_db_writer,
-        db_uri=self.db_path,
-        experiment_name='experiment',
-        run_name='run',
-        user_name='user')
-
-  def tearDown(self):
-    self.db.close()
-    super(SummaryDbTest, self).tearDown()
-
-
-def get_one(db, q, *p):
-  return db.execute(q, p).fetchone()[0]
-
-
-def get_all(db, q, *p):
-  return unroll(db.execute(q, p).fetchall())
-
-
-def unroll(list_of_tuples):
-  return sum(list_of_tuples, ())
diff --git a/tensorflow/contrib/training/python/training/sgdr_learning_rate_decay.py b/tensorflow/contrib/training/python/training/sgdr_learning_rate_decay.py
deleted file mode 100644
index ed0f398..0000000
--- a/tensorflow/contrib/training/python/training/sgdr_learning_rate_decay.py
+++ /dev/null
@@ -1,187 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-"""SGDR learning rate decay function."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import math
-
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import math_ops, control_flow_ops
-
-
-def sgdr_decay(learning_rate, global_step, initial_period_steps,
-               t_mul=2.0, m_mul=1.0, name=None):
-  """Implements Stochastic Gradient Descent with Warm Restarts (SGDR).
-
-  As described in "SGDR: Stochastic Gradient Descent
-  with Warm Restarts" by Ilya Loshchilov & Frank Hutter, Proceedings of
-  ICLR'2017, available at https://arxiv.org/pdf/1608.03983.pdf
-
-  The learning rate decreases according to cosine annealing:
-
-  ```python
-  learning_rate * 0.5 * (1 + cos(x_val * pi)) # for x_val defined in [0, 1]
-  ```
-
-  Thus, at the beginning (when the restart index i = 0),
-  the learning rate decreases for `initial_period_steps` steps from the initial
-  learning rate `learning_rate` (when `x_val=0`, we get `cos(0)=1`) to
-  0 (when `x_val=1`, we get `cos(pi)=-1`).
-
-  The decrease within the i-th period takes `t_i` steps,
-  where `t_0` = `initial_period_steps` is the user-defined number of batch
-  iterations (not epochs as in the paper) to be performed before the first
-  restart is launched.
-
-  Then, we perform the first restart (i=1) by setting the learning rate to
-  `learning_rate*(m_mul^i)`, where `m_mul in [0,1]` (set to 1 by default).
-  The i-th restart runs for `t_i=t_0*(t_mul^i)` steps, i.e., every new
-  restart runs `t_mul` times longer than the previous one.
-
-  Importantly, when one has no access to a validation set, SGDR suggests
-  reporting the best expected / recommended solution in the following way:
-  within the initial run (i=0), every new solution represents SGDR's
-  recommended solution, whereas for i>0 the recommended solution is the one
-  obtained at the end of each restart.
-
-  Note that the minimum learning rate is set to 0 for simplicity;
-  you can adjust the code to deal with any positive minimum learning rate
-  as defined in the paper.
-
-  `initial_period_steps` is the duration of the first period measured in terms
-  of number of minibatch updates. If one wants to use epochs, one should compute
-  the number of updates required for an epoch.
-
-  For example, assume the following parameters and intention:
-      Minibatch size: 100
-      Training dataset size: 10000
-      If the user wants the first decay period to span across 5 epochs, then
-      `initial_period_steps` = 5 * 10000/100 = 500
-
-      Train for 10000 batch iterations with the initial learning rate set to
-      0.1, then restart to run 2 times longer, i.e., for 20000 batch iterations
-      and with the initial learning rate 0.05, then restart again and again,
-      doubling the length of each new period and halving its initial
-      learning rate.
-
-  To accomplish the above, one would write:
-
-  ```python
-  ...
-  global_step = tf.Variable(0, trainable=False)
-  starter_learning_rate = 0.1
-  learning_rate = sgdr_decay(starter_learning_rate, global_step,
-                             initial_period_steps=10000, t_mul=2, m_mul=0.5)
-  # Passing global_step to minimize() will increment it at each step.
-  learning_step = (
-      tf.train.GradientDescentOptimizer(learning_rate)
-      .minimize(...my loss..., global_step=global_step)
-  )
-
-  # Step  | 0   | 1000  | 5000 | 9000  | 9999 | 10000 | 11000  |
-  # LR    | 0.1 | 0.097 | 0.05 | 0.002 | 0.00 | 0.05  | 0.0496 |
-
-  # Step  | 20000 | 29000  | 29999 | 30000 |
-  # LR    | 0.025 | 0.0003 | 0.00  | 0.025 |
-  ```
-
-  Args:
-    learning_rate: A scalar `float32` or `float64` `Tensor` or a
-      Python number.  The initial learning rate.
-    global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
-      Global step to use for the decay computation.  Must not be negative.
-    initial_period_steps: Duration of the first period measured as the number
-      of minibatch updates; if one wants to use epochs, one should compute
-      the number of updates required for an epoch.
-    t_mul: A scalar `float32` or `float64` `Tensor` or a Python number.
-      Must be positive.
-      Used to derive the number of iterations in the i-th period:
-      `initial_period_steps * (t_mul^i)`. Defaults to 2.0.
-    m_mul: A scalar `float32` or `float64` `Tensor` or a Python number.
-      Must be positive.
-      Used to derive the initial learning rate of the i-th period:
-      `learning_rate * (m_mul^i)`. Defaults to 1.0.
-
-  Returns:
-    A scalar `Tensor` of the same type as `learning_rate`.
-    The learning rate for a provided global_step.
-  Raises:
-    ValueError: if `global_step` is not supplied.
-  """
-
-  if global_step is None:
-    raise ValueError("global_step is required for sgdr_decay.")
-  with ops.name_scope(name, "SGDRDecay",
-                      [learning_rate, global_step,
-                       initial_period_steps, t_mul, m_mul]) as name:
-    learning_rate = ops.convert_to_tensor(learning_rate,
-                                          name="initial_learning_rate")
-    dtype = learning_rate.dtype
-    global_step = math_ops.cast(global_step, dtype)
-    t_0 = math_ops.cast(initial_period_steps, dtype)
-    t_mul = math_ops.cast(t_mul, dtype)
-    m_mul = math_ops.cast(m_mul, dtype)
-
-    c_one = math_ops.cast(constant_op.constant(1.0), dtype)
-    c_half = math_ops.cast(constant_op.constant(0.5), dtype)
-    c_pi = math_ops.cast(constant_op.constant(math.pi), dtype)
-
-    # Find normalized value of the current step
-    x_val = math_ops.div(global_step, t_0)
-
-    def compute_step(x_val, geometric=False):
-      if geometric:
-        # Consider geometric series where t_mul != 1
-        # 1 + t_mul + t_mul^2 ... = (1 - t_mul^i_restart) / (1 - t_mul)
-
-        # First find how many restarts were performed for a given x_val
-        # Find maximal integer i_restart value for which this equation holds
-        # x_val >= (1 - t_mul^i_restart) / (1 - t_mul)
-        # x_val * (1 - t_mul) <= (1 - t_mul^i_restart)
-        # t_mul^i_restart <= (1 - x_val * (1 - t_mul))
-
-        # TensorFlow only provides the natural logarithm, so solve via log_e:
-        # i_restart <= log(1 - x_val * (1 - t_mul)) / log(t_mul)
-        # Find how many restarts were performed
-
-        i_restart = math_ops.floor(
-            math_ops.log(c_one - x_val * (c_one - t_mul)) / math_ops.log(t_mul))
-        # Compute the sum of all restarts before the current one
-        sum_r = (c_one - t_mul ** i_restart) / (c_one - t_mul)
-        # Compute our position within the current restart
-        x_val = (x_val - sum_r) / t_mul ** i_restart
-
-      else:
-        # Find how many restarts were performed
-        i_restart = math_ops.floor(x_val)
-        # Compute our position within the current restart
-        x_val = x_val - i_restart
-      return i_restart, x_val
-
-    i_restart, x_val = control_flow_ops.cond(
-        math_ops.equal(t_mul, c_one),
-        lambda: compute_step(x_val, geometric=False),
-        lambda: compute_step(x_val, geometric=True))
-
-    # If m_mul < 1, then the initial learning rate of every new restart will be
-    # smaller, i.e., reduced by a factor of m_mul ** i_restart at the
-    # i_restart-th restart.
-    m_fac = learning_rate * (m_mul ** i_restart)
-
-  return math_ops.multiply(c_half * m_fac,
-                           (math_ops.cos(x_val * c_pi) + c_one), name=name)
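
For intuition, a pure-Python rendering of the same schedule (a sketch that mirrors the graph code above; not a drop-in replacement):

```python
# Plain-Python sketch of the SGDR schedule computed by sgdr_decay above.
import math

def sgdr_lr(step, lr, t_0, t_mul=2.0, m_mul=1.0):
    x = step / float(t_0)
    if t_mul == 1.0:
        i_restart = math.floor(x)
        x -= i_restart
    else:
        # Invert the geometric series 1 + t_mul + ... + t_mul^(i-1).
        i_restart = math.floor(
            math.log(1 - x * (1 - t_mul)) / math.log(t_mul))
        sum_r = (1 - t_mul ** i_restart) / (1 - t_mul)
        x = (x - sum_r) / t_mul ** i_restart
    return 0.5 * lr * (m_mul ** i_restart) * (1 + math.cos(x * math.pi))

# Reproduces the table in the docstring above:
print(sgdr_lr(0, 0.1, 10000, t_mul=2, m_mul=0.5))      # 0.1
print(sgdr_lr(10000, 0.1, 10000, t_mul=2, m_mul=0.5))  # 0.05
print(sgdr_lr(30000, 0.1, 10000, t_mul=2, m_mul=0.5))  # 0.025
```
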
diff --git a/tensorflow/contrib/training/python/training/sgdr_learning_rate_decay_test.py b/tensorflow/contrib/training/python/training/sgdr_learning_rate_decay_test.py
deleted file mode 100644
index 4a46e9a..0000000
--- a/tensorflow/contrib/training/python/training/sgdr_learning_rate_decay_test.py
+++ /dev/null
@@ -1,145 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-"""Functional test for sgdr learning rate decay."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import math
-
-from sgdr_learning_rate_decay import sgdr_decay
-from tensorflow.python.platform import googletest
-from tensorflow.python.framework import test_util
-from tensorflow.python.framework import dtypes
-from tensorflow import placeholder
-
-
-class SGDRDecayTest(test_util.TensorFlowTestCase):
-  """Unit tests for SGDR learning rate decay."""
-
-  def get_original_values(self, lr, t_e, mult_factor, iter_per_epoch, epochs):
-    """Get an array with learning rate values from the consecutive steps using
-    the original implementation
-    (https://github.com/loshchil/SGDR/blob/master/SGDR_WRNs.py)."""
-    t0 = math.pi / 2.0
-    tt = 0
-    te_next = t_e
-
-    lr_values = []
-    sh_lr = lr
-    for epoch in range(epochs):
-      for _ in range(iter_per_epoch):
-        # In the original approach training function is executed here
-        lr_values.append(sh_lr)
-        dt = 2.0 * math.pi / float(2.0 * t_e)
-        tt = tt + float(dt) / iter_per_epoch
-        if tt >= math.pi:
-          tt = tt - math.pi
-        cur_t = t0 + tt
-        new_lr = lr * (1.0 + math.sin(cur_t)) / 2.0  # lr_min = 0, lr_max = lr
-        sh_lr = new_lr
-      if (epoch + 1) == te_next:  # time to restart
-        sh_lr = lr
-        tt = 0                # by setting to 0 we set lr to lr_max, see above
-        t_e = t_e * mult_factor  # change the period of restarts
-        te_next = te_next + t_e  # note the next restart's epoch
-
-    return lr_values
-
-  def get_sgdr_values(self, lr, initial_period_steps, t_mul, iters):
-    """Get an array with learning rate values from the consecutive steps
-    using current tensorflow implementation."""
-    with self.test_session():
-      step = placeholder(dtypes.int32)
-
-      decay = sgdr_decay(lr, step, initial_period_steps, t_mul)
-      lr_values = []
-      for i in range(iters):
-        lr_values.append(decay.eval(feed_dict={step: i}))
-
-      return lr_values
-
-  def testCompareToOriginal(self):
-    """Compare values generated by tensorflow implementation to the values
-    generated by the original implementation
-    (https://github.com/loshchil/SGDR/blob/master/SGDR_WRNs.py)."""
-    with self.test_session():
-      lr = 10.0
-      init_steps = 2
-      t_mul = 3
-      iters = 10
-      epochs = 50
-
-      org_lr = self.get_original_values(lr, init_steps, t_mul, iters, epochs)
-      sgdr_lr = self.get_sgdr_values(lr, init_steps*iters, t_mul, iters*epochs)
-
-      for org, sgdr in zip(org_lr, sgdr_lr):
-        self.assertAllClose(org, sgdr)
-
-  def testMDecay(self):
-    """Test m_mul argument. Check values for learning rate at the beginning
-    of the first, second, third and fourth period. """
-    with self.test_session():
-      step = placeholder(dtypes.int32)
-
-      lr = 0.1
-      t_e = 10
-      t_mul = 3
-      m_mul = 0.9
-
-      decay = sgdr_decay(lr, step, t_e, t_mul, m_mul)
-
-      test_step = 0
-      self.assertAllClose(decay.eval(feed_dict={step: test_step}),
-                          lr)
-
-      test_step = t_e
-      self.assertAllClose(decay.eval(feed_dict={step: test_step}),
-                          lr * m_mul)
-
-      test_step = t_e + t_e*t_mul
-      self.assertAllClose(decay.eval(feed_dict={step: test_step}),
-                          lr * m_mul**2)
-
-      test_step = t_e + t_e*t_mul + t_e * (t_mul**2)
-      self.assertAllClose(decay.eval(feed_dict={step: test_step}),
-                          lr * (m_mul**3))
-
-  def testCos(self):
-    """Check learning rate values at the beginning, in the middle
-    and at the end of the period."""
-    with self.test_session():
-      step = placeholder(dtypes.int32)
-      lr = 0.2
-      t_e = 1000
-      t_mul = 1
-
-      decay = sgdr_decay(lr, step, t_e, t_mul)
-
-      test_step = 0
-      self.assertAllClose(decay.eval(feed_dict={step: test_step}), lr)
-
-      test_step = t_e//2
-      self.assertAllClose(decay.eval(feed_dict={step: test_step}), lr/2)
-
-      test_step = t_e
-      self.assertAllClose(decay.eval(feed_dict={step: test_step}), lr)
-
-      test_step = t_e*3//2
-      self.assertAllClose(decay.eval(feed_dict={step: test_step}), lr/2)
-
-if __name__ == "__main__":
-  googletest.main()
diff --git a/tensorflow/core/api_def/base_api/api_def_GatherNd.pbtxt b/tensorflow/core/api_def/base_api/api_def_GatherNd.pbtxt
index c7f8b6c..6cd76ff 100644
--- a/tensorflow/core/api_def/base_api/api_def_GatherNd.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_GatherNd.pbtxt
@@ -43,6 +43,10 @@
 
     indices.shape[:-1] + params.shape[indices.shape[-1]:]
 
+Note that on CPU, if an out of bound index is found, an error is returned.
+On GPU, if an out of bound index is found, a 0 is stored in the
+corresponding output value.
+
 Some examples below.
 
 Simple indexing into a matrix:
diff --git a/tensorflow/core/api_def/base_api/api_def_GatherV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_GatherV2.pbtxt
index c020176..162ef2b 100644
--- a/tensorflow/core/api_def/base_api/api_def_GatherV2.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_GatherV2.pbtxt
@@ -50,5 +50,9 @@
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
 <img style="width:100%" src="https://www.tensorflow.org/images/Gather.png" alt>
 </div>
+
+Note that on CPU, if an out of bound index is found, an error is returned.
+On GPU, if an out of bound index is found, a 0 is stored in the
+corresponding output value.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt
index 2373254..4cb8c06 100644
--- a/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt
@@ -98,5 +98,8 @@
      [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],
      [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
      [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]]
+
+Note that on CPU, if an out of bound index is found, an error is returned.
+On GPU, if an out of bound index is found, the index is ignored.
 END
 }
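
A hedged illustration of the CPU/GPU difference these notes document; the specific error class is an assumption based on TensorFlow's usual out-of-range handling, and the GPU behavior is described in a comment only:

```python
# Sketch: out-of-bound index handling differs between CPU and GPU gathers.
import tensorflow as tf

params = tf.constant([1.0, 2.0, 3.0])
indices = tf.constant([0, 5])  # index 5 is out of bounds

with tf.Session() as sess:
    try:
        # On CPU an out-of-bound index raises an error.
        print(sess.run(tf.gather(params, indices)))
    except tf.errors.InvalidArgumentError as e:
        print("CPU gather failed:", str(e))
    # On a GPU device the same gather would instead write 0 for index 5,
    # producing [1., 0.] per the documentation added above.
```
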
diff --git a/tensorflow/core/framework/numeric_types.h b/tensorflow/core/framework/numeric_types.h
index 650aa42..8514d7c 100644
--- a/tensorflow/core/framework/numeric_types.h
+++ b/tensorflow/core/framework/numeric_types.h
@@ -25,7 +25,6 @@
 #include "third_party/eigen3/unsupported/Eigen/CXX11/FixedPoint"
 // clang-format on
 
-#include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc
index 0ffdc42..89b23f2 100644
--- a/tensorflow/core/graph/mkl_layout_pass.cc
+++ b/tensorflow/core/graph/mkl_layout_pass.cc
@@ -2507,36 +2507,42 @@
     rinfo_.push_back({csinfo_.max_pool_grad,
                       mkl_op_registry::GetMklOpName(csinfo_.max_pool_grad),
                       CopyAttrsPooling, AlwaysRewrite});
+    /*
     rinfo_.push_back({csinfo_.maximum,
                       mkl_op_registry::GetMklOpName(csinfo_.maximum),
                       CopyAttrsDataType, AlwaysRewrite});
     rinfo_.push_back({csinfo_.mul,
                       mkl_op_registry::GetMklOpName(csinfo_.mul),
                       CopyAttrsDataType, AlwaysRewrite});
+    */
     rinfo_.push_back({csinfo_.relu,
                       mkl_op_registry::GetMklOpName(csinfo_.relu),
                       CopyAttrsDataType, AlwaysRewrite});
     rinfo_.push_back({csinfo_.relu_grad,
                       mkl_op_registry::GetMklOpName(csinfo_.relu_grad),
                       CopyAttrsDataType, AlwaysRewrite});
+    /*
     rinfo_.push_back({csinfo_.tanh,
                       mkl_op_registry::GetMklOpName(csinfo_.tanh),
                       CopyAttrsDataType, AlwaysRewrite});
     rinfo_.push_back({csinfo_.tanh_grad,
                       mkl_op_registry::GetMklOpName(csinfo_.tanh_grad),
                       CopyAttrsDataType, AlwaysRewrite});
+    */
     rinfo_.push_back({csinfo_.reshape,
                       mkl_op_registry::GetMklOpName(csinfo_.reshape),
                       CopyAttrsReshape, AlwaysRewrite});
     rinfo_.push_back({csinfo_.softmax,
                       mkl_op_registry::GetMklOpName(csinfo_.softmax),
                       CopyAttrsDataType, AlwaysRewrite});
+    /*
     rinfo_.push_back({csinfo_.squared_difference,
                       mkl_op_registry::GetMklOpName(csinfo_.squared_difference),
                       CopyAttrsDataType, AlwaysRewrite});
     rinfo_.push_back({csinfo_.sub,
                       mkl_op_registry::GetMklOpName(csinfo_.sub),
                       CopyAttrsDataType, AlwaysRewrite});
+    */
 
     // Add info about which ops to add workspace edge to and the slots.
     wsinfo_.push_back({csinfo_.lrn, csinfo_.lrn_grad, 0, 2, 1, 3});
diff --git a/tensorflow/core/kernels/constant_op_gpu.cu.cc b/tensorflow/core/kernels/constant_op_gpu.cu.cc
deleted file mode 100644
index 49beb49..0000000
--- a/tensorflow/core/kernels/constant_op_gpu.cu.cc
+++ /dev/null
@@ -1,115 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#if GOOGLE_CUDA
-
-#define EIGEN_USE_GPU
-
-#include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/framework/tensor_types.h"
-#include "tensorflow/core/kernels/fill_functor.h"
-#include "tensorflow/core/platform/types.h"
-
-namespace Eigen {
-namespace internal {
-
-template <typename T>
-struct scalar_const_op {
-  typedef typename packet_traits<T>::type Packet;
-
-  const T* val;
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  scalar_const_op(const scalar_const_op& x)
-      : val(x.val) {}
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_const_op(const T* v) : val(v) {}
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()() const {
-    return *val;
-  }
-
-  template <typename PacketType = Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packetOp() const {
-    return internal::pset1<PacketType>(*val);
-  }
-};
-
-template <typename T>
-struct functor_traits<scalar_const_op<T> > {
-  enum {
-    Cost = 1,
-    PacketAccess = packet_traits<T>::Vectorizable,
-    IsRepeatable = true
-  };
-};
-
-}  // end namespace internal
-}  // end namespace Eigen
-
-namespace tensorflow {
-
-namespace functor {
-
-typedef Eigen::GpuDevice GPUDevice;
-
-// Partial specialization FillFunctor<Device=GPUDevice, T>
-template <typename T>
-struct FillFunctor<GPUDevice, T> {
-  void operator()(const GPUDevice& d, typename TTypes<T>::Flat out,
-                  typename TTypes<T>::ConstScalar in) {
-    Eigen::internal::scalar_const_op<T> f(in.data());
-    To32Bit(out).device(d) = To32Bit(out).nullaryExpr(f);
-  }
-};
-
-#define DEFINE_FILL_GPU(T) template struct FillFunctor<GPUDevice, T>;
-TF_CALL_REAL_NUMBER_TYPES(DEFINE_FILL_GPU);
-TF_CALL_bfloat16(DEFINE_FILL_GPU);
-TF_CALL_bool(DEFINE_FILL_GPU);
-#undef DEFINE_FILL_GPU
-
-// Partial specialization of FillFunctor<Device=GPUDevice, T>.
-template <typename T>
-struct SetZeroFunctor<GPUDevice, T> {
-  void operator()(const GPUDevice& d, typename TTypes<T>::Flat out) {
-    To32Bit(out).device(d) = To32Bit(out).constant(T(0));
-  }
-};
-
-#define DEFINE_SETZERO_GPU(T) template struct SetZeroFunctor<GPUDevice, T>;
-TF_CALL_NUMBER_TYPES(DEFINE_SETZERO_GPU);
-TF_CALL_bfloat16(DEFINE_SETZERO_GPU);
-TF_CALL_bool(DEFINE_SETZERO_GPU);
-#undef DEFINE_SETZERO_GPU
-
-// Partial specialization of FillFunctor<Device=GPUDevice, T>.
-template <typename T>
-struct SetOneFunctor<GPUDevice, T> {
-  void operator()(const GPUDevice& d, typename TTypes<T>::Flat out) {
-    To32Bit(out).device(d) = To32Bit(out).constant(T(1));
-  }
-};
-
-#define DEFINE_SETONE_GPU(T) template struct SetOneFunctor<GPUDevice, T>;
-TF_CALL_NUMBER_TYPES(DEFINE_SETONE_GPU);
-TF_CALL_bfloat16(DEFINE_SETONE_GPU);
-TF_CALL_bool(DEFINE_SETONE_GPU);
-#undef DEFINE_SETONE_GPU
-
-}  // end namespace functor
-}  // end namespace tensorflow
-
-#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc b/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
index 172deea..2a46494 100644
--- a/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
+++ b/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
@@ -541,6 +541,7 @@
                                                 int TileShortSide,
                                                 int size_of_t, Op op) {
   // clang-format off
+
   return (size_of_t == 16 && ((TileLongSide == 32   && op(TileShortSide, 4))  ||
                              (TileLongSide == 64   && op(TileShortSide, 4))  ||
                              (TileLongSide == 128  && op(TileShortSide, 4))  ||
@@ -568,6 +569,7 @@
                              (TileLongSide == 256  && op(TileShortSide, 8))  ||
                              (TileLongSide == 512  && op(TileShortSide, 4))  ||
                              (TileLongSide == 1024 && op(TileShortSide, 2))));
+
   // clang-format on
 }
 
diff --git a/tensorflow/core/kernels/mkl_concat_op.cc b/tensorflow/core/kernels/mkl_concat_op.cc
index d0175df..8277179 100644
--- a/tensorflow/core/kernels/mkl_concat_op.cc
+++ b/tensorflow/core/kernels/mkl_concat_op.cc
@@ -650,10 +650,6 @@
       // format and avoid calling eigen version.
       if (!are_all_tf_inputs && !are_all_mkl_inputs) invoke_eigen = true;
 
-      // Temporary fallback to Eigen until MKLDNN Concat performance
-      // is improved. To be removed.
-      invoke_eigen = true;
-
       // Call Eigen library
       if (invoke_eigen) {
         TensorShapeList tf_input_shapes;
@@ -694,7 +690,7 @@
           // It does not matter what data format we use here (NHWC or NCHW).
           // We just need to ensure that output of Concat uses same data format
           // as input.
-                  memory::desc(src_dims, MklDnnType<T>(), memory::format::nhwc);
+                  memory::desc(src_dims, MklDnnType<T>(), memory::format::nchw);
 
         srcs[k].SetUsrMem(src_md, &input_tensors[k]);
         auto src_mpd = srcs[k].GetUsrMemPrimDesc();
@@ -720,7 +716,7 @@
       } else {
         // Again, format does not matter here. We just need to make it same as
         // input format.
-        dst_md = memory::desc(dst_dims, MklDnnType<T>(), memory::format::nhwc);
+        dst_md = memory::desc(dst_dims, MklDnnType<T>(), memory::format::nchw);
       }
 
       std::vector<primitive::at> inputs;
diff --git a/tensorflow/core/kernels/mkl_tfconv_op.cc b/tensorflow/core/kernels/mkl_tfconv_op.cc
deleted file mode 100644
index b48c735..0000000
--- a/tensorflow/core/kernels/mkl_tfconv_op.cc
+++ /dev/null
@@ -1,124 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifdef INTEL_MKL
-
-#include <algorithm>
-#include <vector>
-#include "tensorflow/core/framework/numeric_op.h"
-#include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/kernels/ops_util.h"
-#include "tensorflow/core/platform/cpu_info.h"
-#include "tensorflow/core/platform/macros.h"
-#include "tensorflow/core/util/tensor_format.h"
-
-#include "tensorflow/core/util/mkl_util.h"
-#include "mkl_dnn.h"
-#include "mkl_dnn_types.h"
-
-namespace tensorflow {
-typedef Eigen::ThreadPoolDevice CPUDevice;
-
-///////////////////////////////////////////////////////////
-//               Op kernel
-///////////////////////////////////////////////////////////
-
-template <typename Device, typename T>
-class MklToTfOp : public OpKernel {
- public:
-  explicit MklToTfOp(OpKernelConstruction* context) : OpKernel(context) {
-    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format_str));
-    OP_REQUIRES_OK(context, context->GetAttr("T", &op_data_type));
-    has_avx512f_ = port::TestCPUFeature(port::CPUFeature::AVX512F);
-  }
-
-  void Compute(OpKernelContext* context) override {
-    // Check that input tensor is in MKL format.
-    const Tensor& input_tensor = MklGetInput(context, 0);
-    MklShape input_shape;
-    GetMklShape(context, 0, &input_shape);
-
-    // if input is already in Tf format, then just copy input tensor to output.
-    if (!input_shape.IsMklTensor()) {
-      context->set_output(0, input_tensor);
-      VLOG(1) << "MKLToTFConversion: No conversion needed, "
-              << "copying input to output";
-      return;
-    }
-
-    // Check that input data type is same as operator data type and that it is
-    // same as output data type.
-    DataType input_data_type = input_type(0);
-    DataType output_data_type = output_type(0);
-    CHECK_EQ(op_data_type, input_data_type);
-    CHECK_EQ(op_data_type, output_data_type);
-
-    TensorShape output_shape;
-    size_t ndims = input_shape.GetDimension();
-    size_t* in_sizes = new size_t[ndims];
-    for (size_t i = 0; i < ndims; i++) {
-      // Outermost to innermost dimension
-      output_shape.AddDim(input_shape.GetSizes()[input_shape.tf_dim_idx(i)]);
-      in_sizes[i] = input_shape.GetSizes()[i];
-    }
-
-    // Allocate output tensor.
-    Tensor* output_tensor = NULL;
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(0, output_shape, &output_tensor));
-
-    dnnLayout_t output_layout =
-        static_cast<dnnLayout_t>(input_shape.GetTfLayout());
-    // Execute DNNConversion.
-    void* input_buffer =
-        static_cast<void*>(const_cast<T*>(input_tensor.flat<T>().data()));
-    delete[] in_sizes;
-    void* output_buffer =
-        static_cast<void*>(const_cast<T*>(output_tensor->flat<T>().data()));
-    input_shape.GetConvertedFlatData(output_layout, input_buffer,
-                                     output_buffer);
-    VLOG(1) << "MKLToTFConversion complete successfully.";
-  }
-
- private:
-  /// Data format of the operation
-  string data_format_str;
-
-  /// Data type of the operation
-  DataType op_data_type;
-
-  /// CPUIDInfo
-  bool has_avx512f_ = false;
-};
-
-///////////////////////////////////////////////////////////
-//               Register kernel
-///////////////////////////////////////////////////////////
-
-#define REGISTER_CPU(T)                                             \
-  REGISTER_KERNEL_BUILDER(Name("_MklToTf")                          \
-                              .Device(DEVICE_CPU)                   \
-                              .TypeConstraint<T>("T")               \
-                              .Label(mkl_op_registry::kMklOpLabel), \
-                          MklToTfOp<CPUDevice, T>);
-
-TF_CALL_float(REGISTER_CPU);
-#undef REGISTER_CPU
-}  // namespace tensorflow
-#endif /* INTEL_MKL */
diff --git a/tensorflow/core/kernels/record_input_op.cc b/tensorflow/core/kernels/record_input_op.cc
index 0c05349..841f9dc 100644
--- a/tensorflow/core/kernels/record_input_op.cc
+++ b/tensorflow/core/kernels/record_input_op.cc
@@ -38,6 +38,7 @@
     GETATTR(int64, batch_size);
     GETATTR(string, compression_type);
 #undef GETATTR
+
     OP_REQUIRES_OK(ctx, ctx->GetAttr("compression_type", &compression_type));
 
     RecordYielder::Options yopts;
diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc
index c7d9b97..9211a13 100644
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@@ -1560,6 +1560,10 @@
 <img style="width:100%" src="https://www.tensorflow.org/images/Gather.png" alt>
 </div>
 
+Note that on CPU, if an out-of-bounds index is found, an error is returned.
+On GPU, if an out-of-bounds index is found, a 0 is stored in the
+corresponding output value.
+
 params: The tensor from which to gather values. Must be at least rank
   `axis + 1`.
 indices: Index tensor. Must be in range `[0, params.shape[axis])`.
@@ -1629,6 +1633,10 @@
 
     indices.shape[:-1] + params.shape[indices.shape[-1]:]
 
+Note that on CPU, if an out-of-bounds index is found, an error is returned.
+On GPU, if an out-of-bounds index is found, a 0 is stored in the
+corresponding output value.
+
 Some examples below.
 
 Simple indexing into a matrix:
@@ -5413,6 +5421,9 @@
      [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
      [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]]
 
+Note that on CPU, if an out-of-bounds index is found, an error is returned.
+On GPU, if an out-of-bounds index is found, the index is ignored.
+
 indices: Index tensor.
 updates: Updates to scatter into output.
 shape: 1-D. The shape of the resulting tensor.
diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl
index 942bca6..6d83f8b 100644
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@@ -67,16 +67,14 @@
       pxd_srcs.append(src)
 
   # Invoke cython to produce the shared object libraries.
-  cpp_outs = [src.split(".")[0] + ".cpp" for src in pyx_srcs]
-  native.genrule(
-      name = name + "_cython_translation",
-      srcs = pyx_srcs,
-      outs = cpp_outs,
-      cmd = ("PYTHONHASHSEED=0 $(location @cython//:cython_binary) --cplus $(SRCS)"
-             # Rename outputs to expected location.
-             + """ && python -c 'import shutil, sys; n = len(sys.argv); [shutil.copyfile(src.split(".")[0] + ".cpp", dst) for src, dst in zip(sys.argv[1:], sys.argv[1+n//2:])]' $(SRCS) $(OUTS)"""),
-      tools = ["@cython//:cython_binary"] + pxd_srcs,
-  )
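+  # One genrule per .pyx file: cython's --output-file writes each .cpp to
+  # the expected location directly, so no post-hoc renaming is needed.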
+  for filename in pyx_srcs:
+    native.genrule(
+        name = filename + "_cython_translation",
+        srcs = [filename],
+        outs = [filename.split(".")[0] + ".cpp"],
+        cmd = "PYTHONHASHSEED=0 $(location @cython//:cython_binary) --cplus $(SRCS) --output-file $(OUTS)",
+        tools = ["@cython//:cython_binary"] + pxd_srcs,
+    )
 
   shared_objects = []
   for src in pyx_srcs:
diff --git a/tensorflow/core/platform/vmodule_benchmark_test.cc b/tensorflow/core/platform/vmodule_benchmark_test.cc
deleted file mode 100644
index 0f9e75b..0000000
--- a/tensorflow/core/platform/vmodule_benchmark_test.cc
+++ /dev/null
@@ -1,28 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/test_benchmark.h"
-
-namespace tensorflow {
-
-static void BM_DisabledVlog(int iters) {
-  for (int i = 0; i < iters; ++i) {
-    VLOG(1) << "Testing VLOG(1)!";
-  }
-}
-BENCHMARK(BM_DisabledVlog);
-
-}  // namespace tensorflow
diff --git a/tensorflow/core/platform/vmodule_test.cc b/tensorflow/core/platform/vmodule_test.cc
deleted file mode 100644
index 47b4b2e..0000000
--- a/tensorflow/core/platform/vmodule_test.cc
+++ /dev/null
@@ -1,117 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Test that popens a child process with the VLOG-ing environment variable set
-// for the logging framework, and observes VLOG_IS_ON and VLOG macro output.
-
-#include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/platform.h"
-#include "tensorflow/core/platform/test.h"
-
-#include <string.h>
-
-namespace tensorflow {
-namespace {
-
-int RealMain(const char* argv0, bool do_vlog) {
-  if (do_vlog) {
-#if !defined(PLATFORM_GOOGLE)
-    // Note, we only test this when !defined(PLATFORM_GOOGLE) because
-    // VmoduleActivated doesn't exist in that implementation.
-    //
-    // Also, we call this internal API to simulate what would happen if
-    // differently-named translation units attempted to VLOG, so we don't need
-    // to create dummy translation unit files.
-    bool ok = internal::LogMessage::VmoduleActivated("vmodule_test.cc", 7) &&
-              internal::LogMessage::VmoduleActivated("shoobadooba.h", 3);
-    if (!ok) {
-      fprintf(stderr, "vmodule activated levels not as expected.\n");
-      return EXIT_FAILURE;
-    }
-#endif
-
-    // Print info on which VLOG levels are activated.
-    fprintf(stderr, "VLOG_IS_ON(8)? %d\n", VLOG_IS_ON(8));
-    fprintf(stderr, "VLOG_IS_ON(7)? %d\n", VLOG_IS_ON(7));
-    fprintf(stderr, "VLOG_IS_ON(6)? %d\n", VLOG_IS_ON(6));
-    // Do some VLOG-ing.
-    VLOG(8) << "VLOG(8)";
-    VLOG(7) << "VLOG(7)";
-    VLOG(6) << "VLOG(6)";
-    LOG(INFO) << "INFO";
-    return EXIT_SUCCESS;
-  }
-
-  // Popen the child process.
-  std::string command = std::string(argv0);
-#if defined(PLATFORM_GOOGLE)
-  command = command + " do_vlog --vmodule=vmodule_test=7 --alsologtostderr";
-#else
-  command =
-      "TF_CPP_VMODULE=vmodule_test=7,shoobadooba=3 " + command + " do_vlog";
-#endif
-  command += " 2>&1";
-  fprintf(stderr, "Running: \"%s\"\n", command.c_str());
-  FILE* f = popen(command.c_str(), "r");
-  if (f == nullptr) {
-    fprintf(stderr, "Failed to popen child: %s\n", strerror(errno));
-    return EXIT_FAILURE;
-  }
-
-  // Read data from the child's stdout.
-  constexpr int kBufferSizeBytes = 4096;
-  char buffer[kBufferSizeBytes];
-  size_t result = fread(buffer, sizeof(buffer[0]), kBufferSizeBytes - 1, f);
-  if (result == 0) {
-    fprintf(stderr, "Failed to read from child stdout: %zu %s\n", result,
-            strerror(errno));
-    return EXIT_FAILURE;
-  }
-  buffer[result] = '\0';
-  int status = pclose(f);
-  if (status == -1) {
-    fprintf(stderr, "Failed to close popen child: %s\n", strerror(errno));
-    return EXIT_FAILURE;
-  }
-
-  // Check output is as expected.
-  const char kExpected[] =
-      "VLOG_IS_ON(8)? 0\nVLOG_IS_ON(7)? 1\nVLOG_IS_ON(6)? 1\n";
-  if (strstr(buffer, kExpected) == nullptr) {
-    fprintf(stderr, "error: unexpected output from child: \"%.*s\"\n",
-            kBufferSizeBytes, buffer);
-    return EXIT_FAILURE;
-  }
-  bool ok = strstr(buffer, "VLOG(7)\n") != nullptr &&
-            strstr(buffer, "VLOG(6)\n") != nullptr &&
-            strstr(buffer, "VLOG(8)\n") == nullptr;
-  if (!ok) {
-    fprintf(stderr, "error: VLOG output not as expected: \"%.*s\"\n",
-            kBufferSizeBytes, buffer);
-    return EXIT_FAILURE;
-  }
-
-  // Success!
-  return EXIT_SUCCESS;
-}
-
-}  // namespace
-}  // namespace tensorflow
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  bool do_vlog = argc >= 2 && strcmp(argv[1], "do_vlog") == 0;
-  return tensorflow::RealMain(argv[0], do_vlog);
-}
diff --git a/tensorflow/core/profiler/g3doc/advise.md b/tensorflow/core/profiler/g3doc/advise.md
index d0de831..379c3f1 100644
--- a/tensorflow/core/profiler/g3doc/advise.md
+++ b/tensorflow/core/profiler/g3doc/advise.md
@@ -1,6 +1,6 @@
 ## Auto Detect and Advise
 
-tfprof analyzes profiles and generates advises for common issues.
+tfprof analyzes profiles and generates advice for common issues.
 
 ### Run Advise.
 
diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index c037a9b..3baab75 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -19,12 +19,12 @@
 // TensorFlow uses semantic versioning, see http://semver.org/.
 
 #define TF_MAJOR_VERSION 1
-#define TF_MINOR_VERSION 4
+#define TF_MINOR_VERSION 5
 #define TF_PATCH_VERSION 0
 
 // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
 // "-beta", "-rc", "-rc.1")
-#define TF_VERSION_SUFFIX ""
+#define TF_VERSION_SUFFIX "-rc0"
 
 #define TF_STR_HELPER(x) #x
 #define TF_STR(x) TF_STR_HELPER(x)
diff --git a/tensorflow/docs_src/api_guides/cc/guide.md b/tensorflow/docs_src/api_guides/cc/guide.md
index 81fb1e1..4e51ada 100644
--- a/tensorflow/docs_src/api_guides/cc/guide.md
+++ b/tensorflow/docs_src/api_guides/cc/guide.md
@@ -1,6 +1,6 @@
 # C++ API
 
-Note: By default [tensorflow.org](http://tensorflow.org) shows docs for the
+Note: By default [tensorflow.org](https://www.tensorflow.org) shows docs for the
 most recent stable version. The instructions in this doc require building from
 source. You will probably want to build from the `master` version of tensorflow.
 You should, as a result, be sure you are following the
diff --git a/tensorflow/docs_src/extend/adding_an_op.md b/tensorflow/docs_src/extend/adding_an_op.md
index c52279b..15075e1 100644
--- a/tensorflow/docs_src/extend/adding_an_op.md
+++ b/tensorflow/docs_src/extend/adding_an_op.md
@@ -1,6 +1,6 @@
 # Adding a New Op
 
-Note: By default [tensorflow.org](http://tensorflow.org) shows docs for the
+Note: By default [tensorflow.org](https://www.tensorflow.org) shows docs for the
 most recent stable version. The instructions in this doc require building from
 source. You will probably want to build from the `master` version of tensorflow.
 You should, as a result, be sure you are following the
diff --git a/tensorflow/docs_src/install/index.md b/tensorflow/docs_src/install/index.md
index c4fc882..3c84886 100644
--- a/tensorflow/docs_src/install/index.md
+++ b/tensorflow/docs_src/install/index.md
@@ -4,7 +4,7 @@
 operating systems:
 
   * MacOS X 10.11 (El Capitan) or later.
-  * Ubuntu 14.04 or later
+  * Ubuntu 16.04 or later
   * Windows 7 or later.
 
 Although you might be able to install TensorFlow on other laptop or desktop
diff --git a/tensorflow/docs_src/install/install_c.md b/tensorflow/docs_src/install/install_c.md
index df622c6..d79cd14 100644
--- a/tensorflow/docs_src/install/install_c.md
+++ b/tensorflow/docs_src/install/install_c.md
@@ -38,7 +38,7 @@
          OS="linux" # Change to "darwin" for macOS
          TARGET_DIRECTORY="/usr/local"
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.4.0.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.5.0-rc0.tar.gz" |
            sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_go.md b/tensorflow/docs_src/install/install_go.md
index 8b3da49..49f5350 100644
--- a/tensorflow/docs_src/install/install_go.md
+++ b/tensorflow/docs_src/install/install_go.md
@@ -38,7 +38,7 @@
          TF_TYPE="cpu" # Change to "gpu" for GPU support
          TARGET_DIRECTORY='/usr/local'
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.4.0.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.5.0-rc0.tar.gz" |
          sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_java.md b/tensorflow/docs_src/install/install_java.md
index d189fa4..47b1251 100644
--- a/tensorflow/docs_src/install/install_java.md
+++ b/tensorflow/docs_src/install/install_java.md
@@ -17,7 +17,7 @@
 (and we only support) these instructions on machines meeting the
 following requirements:
 
-  * Ubuntu 14.04 or higher; 64-bit, x86
+  * Ubuntu 16.04 or higher; 64-bit, x86
   * macOS X 10.11 (El Capitan) or higher
   * Windows 7 or higher; 64-bit, x86
 
@@ -36,7 +36,7 @@
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>tensorflow</artifactId>
-  <version>1.4.0</version>
+  <version>1.5.0-rc0</version>
 </dependency>
 ```
 
@@ -65,7 +65,7 @@
                <dependency>
                  <groupId>org.tensorflow</groupId>
                  <artifactId>tensorflow</artifactId>
-                 <version>1.4.0</version>
+                 <version>1.5.0-rc0</version>
                </dependency>
              </dependencies>
          </project>
@@ -147,7 +147,7 @@
 Take the following steps to install TensorFlow for Java on Linux or macOS:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.4.0.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.5.0-rc0.jar),
      which is the TensorFlow Java Archive (JAR).
 
   2. Decide whether you will run TensorFlow for Java on CPU(s) only or with
@@ -166,7 +166,7 @@
          OS=$(uname -s | tr '[:upper:]' '[:lower:]')
          mkdir -p ./jni
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.4.0.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.5.0-rc0.tar.gz" |
            tar -xz -C ./jni
 
 ### Install on Windows
@@ -174,10 +174,10 @@
 Take the following steps to install TensorFlow for Java on Windows:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.4.0.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.5.0-rc0.jar),
      which is the TensorFlow Java Archive (JAR).
   2. Download the following Java Native Interface (JNI) file appropriate for
-     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.4.0.zip).
+     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.5.0-rc0.zip).
   3. Extract this .zip file.
 
 
@@ -225,7 +225,7 @@
 downloaded `.jar` in your `classpath` by using the `-cp` compilation flag
 as follows:
 
-<pre><b>javac -cp libtensorflow-1.4.0.jar HelloTF.java</b></pre>
+<pre><b>javac -cp libtensorflow-1.5.0-rc0.jar HelloTF.java</b></pre>
 
 
 ### Running
@@ -239,11 +239,11 @@
 For example, the following command line executes the `HelloTF` program on Linux
 and macOS X:
 
-<pre><b>java -cp libtensorflow-1.4.0.jar:. -Djava.library.path=./jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.5.0-rc0.jar:. -Djava.library.path=./jni HelloTF</b></pre>
 
 And the following command line executes the `HelloTF` program on Windows:
 
-<pre><b>java -cp libtensorflow-1.4.0.jar;. -Djava.library.path=jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.5.0-rc0.jar;. -Djava.library.path=jni HelloTF</b></pre>
 
 If the program prints <tt>Hello from <i>version</i></tt>, you've successfully
 installed TensorFlow for Java and are ready to use the API.  If the program
diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md
index e3d5b80..275ff8c 100644
--- a/tensorflow/docs_src/install/install_linux.md
+++ b/tensorflow/docs_src/install/install_linux.md
@@ -6,7 +6,7 @@
 following requirements:
 
   * 64-bit desktops or laptops
-  * Ubuntu 14.04 or higher
+  * Ubuntu 16.04 or higher
 
 
 ## Determine which TensorFlow to install
@@ -188,7 +188,7 @@
      Virtualenv environment:
 
      <pre>(tensorflow)$ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0-cp34-cp34m-linux_x86_64.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc0-cp34-cp34m-linux_x86_64.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common_installation_problems).
@@ -293,7 +293,7 @@
 
      <pre>
      $ <b>sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0-cp34-cp34m-linux_x86_64.whl</b>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc0-cp34-cp34m-linux_x86_64.whl</b>
      </pre>
 
      If this step fails, see
@@ -480,7 +480,7 @@
 
      <pre>
      (tensorflow)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0-cp34-cp34m-linux_x86_64.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc0-cp34-cp34m-linux_x86_64.whl</b></pre>
 
 
 <a name="ValidateYourInstallation"></a>
@@ -648,14 +648,14 @@
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc0-cp27-none-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.4.0-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0rc0-cp27-none-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -667,14 +667,14 @@
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc0-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.4.0-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0rc0-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -686,14 +686,14 @@
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc0-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.4.0-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0rc0-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
@@ -705,14 +705,14 @@
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc0-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.4.0-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0rc0-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
diff --git a/tensorflow/docs_src/install/install_mac.md b/tensorflow/docs_src/install/install_mac.md
index d4ab547..926ceae 100644
--- a/tensorflow/docs_src/install/install_mac.md
+++ b/tensorflow/docs_src/install/install_mac.md
@@ -115,7 +115,7 @@
      TensorFlow in the active Virtualenv is as follows:
 
      <pre> $ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.4.0-py2-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0rc0-py2-none-any.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common-installation-problems).
@@ -238,7 +238,7 @@
      issue the following command:
 
      <pre> $ <b>sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.4.0-py2-none-any.whl</b> </pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0rc0-py2-none-any.whl</b> </pre>
 
      If the preceding command fails, see
      [installation problems](#common-installation-problems).
@@ -347,7 +347,7 @@
      TensorFlow for Python 2.7:
 
      <pre> (<i>targetDirectory</i>)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.4.0-py2-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0rc0-py2-none-any.whl</b></pre>
 
 
 <a name="ValidateYourInstallation"></a>
@@ -520,7 +520,7 @@
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.4.0-py2-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0rc0-py2-none-any.whl
 </pre>
 
 
@@ -528,5 +528,5 @@
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.4.0-py3-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0rc0-py3-none-any.whl
 </pre>
diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md
index e453bd6..90e93f5 100644
--- a/tensorflow/docs_src/install/install_sources.md
+++ b/tensorflow/docs_src/install/install_sources.md
@@ -25,8 +25,10 @@
 following:
 
 *   [Bazel on Windows](https://bazel.build/versions/master/docs/windows.html)
-*   [TensorFlow CMake build](https://github.com/tensorflow/tensorflow/tree/r0.12/tensorflow/contrib/cmake)
+*   [TensorFlow CMake build](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/cmake)
 
+Note: Starting with the 1.6 release, our prebuilt binaries will use AVX
+instructions. Older CPUs may not be able to execute these binaries.
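+
+One illustrative (Linux-only) way to check whether your CPU supports AVX
+is to look for the `avx` flag in `/proc/cpuinfo`:
+
+<pre>
+$ <b>grep -c avx /proc/cpuinfo</b>
+</pre>
+
+A nonzero count indicates AVX support.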
 
 ## Determine which TensorFlow to install
 
@@ -359,10 +361,10 @@
 The filename of the `.whl` file depends on your platform.
 For example, the following command will install the pip package
 
-for TensorFlow 1.4.0 on Linux:
+for TensorFlow 1.5.0rc0 on Linux:
 
 <pre>
-$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.4.0-py2-none-any.whl</b>
+$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.5.0rc0-py2-none-any.whl</b>
 </pre>
 
 ## Validate your installation
diff --git a/tensorflow/docs_src/tutorials/deep_cnn.md b/tensorflow/docs_src/tutorials/deep_cnn.md
index 3692a02..67975402 100644
--- a/tensorflow/docs_src/tutorials/deep_cnn.md
+++ b/tensorflow/docs_src/tutorials/deep_cnn.md
@@ -83,21 +83,21 @@
 ## Code Organization
 
 The code for this tutorial resides in
-[`models/tutorials/image/cifar10/`](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10/).
+[`models/tutorials/image/cifar10/`](https://www.tensorflow.org/code/tensorflow_models/tutorials/image/cifar10/).
 
 File | Purpose
 --- | ---
-[`cifar10_input.py`](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10/cifar10_input.py) | Reads the native CIFAR-10 binary file format.
-[`cifar10.py`](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10/cifar10.py) | Builds the CIFAR-10 model.
-[`cifar10_train.py`](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10/cifar10_train.py) | Trains a CIFAR-10 model on a CPU or GPU.
-[`cifar10_multi_gpu_train.py`](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10/cifar10_multi_gpu_train.py) | Trains a CIFAR-10 model on multiple GPUs.
-[`cifar10_eval.py`](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10/cifar10_eval.py) | Evaluates the predictive performance of a CIFAR-10 model.
+[`cifar10_input.py`](https://www.tensorflow.org/code/tensorflow_models/tutorials/image/cifar10/cifar10_input.py) | Reads the native CIFAR-10 binary file format.
+[`cifar10.py`](https://www.tensorflow.org/code/tensorflow_models/tutorials/image/cifar10/cifar10.py) | Builds the CIFAR-10 model.
+[`cifar10_train.py`](https://www.tensorflow.org/code/tensorflow_models/tutorials/image/cifar10/cifar10_train.py) | Trains a CIFAR-10 model on a CPU or GPU.
+[`cifar10_multi_gpu_train.py`](https://www.tensorflow.org/code/tensorflow_models/tutorials/image/cifar10/cifar10_multi_gpu_train.py) | Trains a CIFAR-10 model on multiple GPUs.
+[`cifar10_eval.py`](https://www.tensorflow.org/code/tensorflow_models/tutorials/image/cifar10/cifar10_eval.py) | Evaluates the predictive performance of a CIFAR-10 model.
 
 
 ## CIFAR-10 Model
 
 The CIFAR-10 network is largely contained in
-[`cifar10.py`](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10/cifar10.py).
+[`cifar10.py`](https://www.tensorflow.org/code/tensorflow_models/tutorials/image/cifar10/cifar10.py).
 The complete training
 graph contains roughly 765 operations. We find that we can make the code most
 reusable by constructing the graph with the following modules:
diff --git a/tensorflow/docs_src/tutorials/image_retraining.md b/tensorflow/docs_src/tutorials/image_retraining.md
index 52e6980..df15bc0 100644
--- a/tensorflow/docs_src/tutorials/image_retraining.md
+++ b/tensorflow/docs_src/tutorials/image_retraining.md
@@ -390,7 +390,7 @@
 python tensorflow/examples/label_image/label_image.py \
 --graph=/tmp/output_graph.pb --labels=/tmp/output_labels.txt \
 --input_layer=input \
---output_layer=final_result:0 \
+--output_layer=final_result \
 --input_height=224 --input_width=224 \
 --input_mean=128 --input_std=128 \
 --image=$HOME/flower_photos/daisy/21652746_cc379e0eea_m.jpg
diff --git a/tensorflow/docs_src/tutorials/word2vec.md b/tensorflow/docs_src/tutorials/word2vec.md
index 0a1c41c..3fe7352 100644
--- a/tensorflow/docs_src/tutorials/word2vec.md
+++ b/tensorflow/docs_src/tutorials/word2vec.md
@@ -23,7 +23,7 @@
 This basic example contains the code needed to download some data, train on it a
 bit and visualize the result. Once you get comfortable with reading and running
 the basic version, you can graduate to
-[models/tutorials/embedding/word2vec.py](https://github.com/tensorflow/models/tree/master/tutorials/embedding/word2vec.py)
+[models/tutorials/embedding/word2vec.py](https://www.tensorflow.org/code/tensorflow_models/tutorials/embedding/word2vec.py)
 which is a more serious implementation that showcases some more advanced
 TensorFlow principles about how to efficiently use threads to move data into a
 text model, how to checkpoint during training, etc.
@@ -341,7 +341,7 @@
 Et voila! As expected, words that are similar end up clustering nearby each
 other. For a more heavyweight implementation of word2vec that showcases more of
 the advanced features of TensorFlow, see the implementation in
-[models/tutorials/embedding/word2vec.py](https://github.com/tensorflow/models/tree/master/tutorials/embedding/word2vec.py).
+[models/tutorials/embedding/word2vec.py](https://www.tensorflow.org/code/tensorflow_models/tutorials/embedding/word2vec.py).
 
 ## Evaluating Embeddings: Analogical Reasoning
 
@@ -357,7 +357,7 @@
 
 To see how we do this evaluation, have a look at the `build_eval_graph()` and
 `eval()` functions in
-[models/tutorials/embedding/word2vec.py](https://github.com/tensorflow/models/tree/master/tutorials/embedding/word2vec.py).
+[models/tutorials/embedding/word2vec.py](https://www.tensorflow.org/code/tensorflow_models/tutorials/embedding/word2vec.py).
 
 The choice of hyperparameters can strongly influence the accuracy on this task.
 To achieve state-of-the-art performance on this task requires training over a
@@ -385,13 +385,13 @@
 custom data reader for your problem, as described in
 @{$new_data_formats$New Data Formats}.  For the case of Skip-Gram
 modeling, we've actually already done this for you as an example in
-[models/tutorials/embedding/word2vec.py](https://github.com/tensorflow/models/tree/master/tutorials/embedding/word2vec.py).
+[models/tutorials/embedding/word2vec.py](https://www.tensorflow.org/code/tensorflow_models/tutorials/embedding/word2vec.py).
 
 If your model is no longer I/O bound but you want still more performance, you
 can take things further by writing your own TensorFlow Ops, as described in
 @{$adding_an_op$Adding a New Op}.  Again we've provided an
 example of this for the Skip-Gram case
-[models/tutorials/embedding/word2vec_optimized.py](https://github.com/tensorflow/models/tree/master/tutorials/embedding/word2vec_optimized.py).
+[models/tutorials/embedding/word2vec_optimized.py](https://www.tensorflow.org/code/tensorflow_models/tutorials/embedding/word2vec_optimized.py).
 Feel free to benchmark these against each other to measure performance
 improvements at each stage.
 
diff --git a/tensorflow/java/src/main/java/org/tensorflow/Input.java b/tensorflow/java/src/main/java/org/tensorflow/Input.java
deleted file mode 100644
index 13bc463..0000000
--- a/tensorflow/java/src/main/java/org/tensorflow/Input.java
+++ /dev/null
@@ -1,48 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-package org.tensorflow;
-
-/**
- * Interface implemented by operands of a TensorFlow operation.
- *
- * <p>Example usage:
- *
- * <pre>{@code
- * // The "decodeJpeg" operation can be used as input to the "cast" operation
- * Input decodeJpeg = ops.image().decodeJpeg(...);
- * ops.math().cast(decodeJpeg, DataType.FLOAT);
- *
- * // The output "y" of the "unique" operation can be used as input to the "cast" operation
- * Output y = ops.array().unique(...).y();
- * ops.math().cast(y, DataType.FLOAT);
- *
- * // The "split" operation can be used as input list to the "concat" operation
- * Iterable<? extends Input> split = ops.array().split(...);
- * ops.array().concat(0, split);
- * }</pre>
- */
-public interface Input<T> {
-
-  /**
-   * Returns the symbolic handle of a tensor.
-   *
-   * <p>Inputs to TensorFlow operations are outputs of another TensorFlow operation. This method is
-   * used to obtain a symbolic handle that represents the computation of the input.
-   *
-   * @see OperationBuilder#addInput(Output)
-   */
-  Output<T> asOutput();
-}
diff --git a/tensorflow/java/src/main/java/org/tensorflow/types/TFBool.java b/tensorflow/java/src/main/java/org/tensorflow/types/TFBool.java
deleted file mode 100644
index ab34f6a..0000000
--- a/tensorflow/java/src/main/java/org/tensorflow/types/TFBool.java
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// GENERATED FILE. To update, edit tftypes.pl instead.
-
-package org.tensorflow.types;
-
-import org.tensorflow.DataType;
-
-/** Represents a boolean. */
-public class TFBool implements TFType {
-  private TFBool() {}
-  static {
-    Types.typeCodes.put(TFBool.class, DataType.BOOL);
-  }
-  static {
-    Types.scalars.put(TFBool.class, false);
-  }
-}
diff --git a/tensorflow/java/src/main/java/org/tensorflow/types/TFDouble.java b/tensorflow/java/src/main/java/org/tensorflow/types/TFDouble.java
deleted file mode 100644
index 49e5d9f..0000000
--- a/tensorflow/java/src/main/java/org/tensorflow/types/TFDouble.java
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// GENERATED FILE. To update, edit tftypes.pl instead.
-
-package org.tensorflow.types;
-
-import org.tensorflow.DataType;
-
-/** Represents a 64-bit double precision floating point number. */
-public class TFDouble implements TFType {
-  private TFDouble() {}
-  static {
-    Types.typeCodes.put(TFDouble.class, DataType.DOUBLE);
-  }
-  static {
-    Types.scalars.put(TFDouble.class, 0.0);
-  }
-}
diff --git a/tensorflow/java/src/main/java/org/tensorflow/types/TFFloat.java b/tensorflow/java/src/main/java/org/tensorflow/types/TFFloat.java
deleted file mode 100644
index 8426ee4..0000000
--- a/tensorflow/java/src/main/java/org/tensorflow/types/TFFloat.java
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// GENERATED FILE. To update, edit tftypes.pl instead.
-
-package org.tensorflow.types;
-
-import org.tensorflow.DataType;
-
-/** Represents a 32-bit single precision floating point number. */
-public class TFFloat implements TFType {
-  private TFFloat() {}
-  static {
-    Types.typeCodes.put(TFFloat.class, DataType.FLOAT);
-  }
-  static {
-    Types.scalars.put(TFFloat.class, 0f);
-  }
-}
diff --git a/tensorflow/java/src/main/java/org/tensorflow/types/TFInt32.java b/tensorflow/java/src/main/java/org/tensorflow/types/TFInt32.java
deleted file mode 100644
index 3947b6a..0000000
--- a/tensorflow/java/src/main/java/org/tensorflow/types/TFInt32.java
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// GENERATED FILE. To update, edit tftypes.pl instead.
-
-package org.tensorflow.types;
-
-import org.tensorflow.DataType;
-
-/** Represents a 32-bit signed integer. */
-public class TFInt32 implements TFType {
-  private TFInt32() {}
-  static {
-    Types.typeCodes.put(TFInt32.class, DataType.INT32);
-  }
-  static {
-    Types.scalars.put(TFInt32.class, 0);
-  }
-}
diff --git a/tensorflow/java/src/main/java/org/tensorflow/types/TFInt64.java b/tensorflow/java/src/main/java/org/tensorflow/types/TFInt64.java
deleted file mode 100644
index ccdded8..0000000
--- a/tensorflow/java/src/main/java/org/tensorflow/types/TFInt64.java
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// GENERATED FILE. To update, edit tftypes.pl instead.
-
-package org.tensorflow.types;
-
-import org.tensorflow.DataType;
-
-/** Represents a 64-bit signed integer. */
-public class TFInt64 implements TFType {
-  private TFInt64() {}
-  static {
-    Types.typeCodes.put(TFInt64.class, DataType.INT64);
-  }
-  static {
-    Types.scalars.put(TFInt64.class, 0L);
-  }
-}
diff --git a/tensorflow/java/src/main/java/org/tensorflow/types/TFString.java b/tensorflow/java/src/main/java/org/tensorflow/types/TFString.java
deleted file mode 100644
index e7327e8..0000000
--- a/tensorflow/java/src/main/java/org/tensorflow/types/TFString.java
+++ /dev/null
@@ -1,27 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// GENERATED FILE. To update, edit tftypes.pl instead.
-
-package org.tensorflow.types;
-
-import org.tensorflow.DataType;
-
-/** Represents an arbitrary sequence of bytes. */
-public class TFString implements TFType {
-  private TFString() {}
-  static {
-    Types.typeCodes.put(TFString.class, DataType.STRING);
-  }
-}
diff --git a/tensorflow/java/src/main/java/org/tensorflow/types/TFType.java b/tensorflow/java/src/main/java/org/tensorflow/types/TFType.java
deleted file mode 100644
index 562953a..0000000
--- a/tensorflow/java/src/main/java/org/tensorflow/types/TFType.java
+++ /dev/null
@@ -1,20 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-package org.tensorflow.types;
-
-/**
- * A marker interface for classes representing TensorFlow types.
- */
-public interface TFType {}
diff --git a/tensorflow/java/src/main/java/org/tensorflow/types/TFUInt8.java b/tensorflow/java/src/main/java/org/tensorflow/types/TFUInt8.java
deleted file mode 100644
index d7305ca..0000000
--- a/tensorflow/java/src/main/java/org/tensorflow/types/TFUInt8.java
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// GENERATED FILE. To update, edit tftypes.pl instead.
-
-package org.tensorflow.types;
-
-import org.tensorflow.DataType;
-
-/** Represents an 8-bit unsigned integer. */
-public class TFUInt8 implements TFType {
-  private TFUInt8() {}
-  static {
-    Types.typeCodes.put(TFUInt8.class, DataType.UINT8);
-  }
-  static {
-    Types.scalars.put(TFUInt8.class, (byte)0);
-  }
-}
diff --git a/tensorflow/java/src/main/java/org/tensorflow/types/Types.java b/tensorflow/java/src/main/java/org/tensorflow/types/Types.java
deleted file mode 100644
index 976cd9f..0000000
--- a/tensorflow/java/src/main/java/org/tensorflow/types/Types.java
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-package org.tensorflow.types;
-
-import java.util.HashMap;
-import java.util.Map;
-import org.tensorflow.DataType;
-
-/**
- * Utility class for managing the representation of TensorFlow types as Java
- * types. For each TensorFlow type (e.g., int32), there is a corresponding Java
- * type (e.g., TFInt32) that represents it at compile time and a corresponding
- * class object (e.g., TFInt32.class) that represents it at run time. There is
- * also an enumeration value in DataType that can be used to represent the
- * type, though that should rarely be required.
- */
-public class Types {
-
-  private Types() {} // not instantiable
-
-  static final Map<Class<?>, DataType> typeCodes = new HashMap<>();
-
-  /** Returns the DataType value corresponding to a TensorFlow type class. */
-  public static DataType dataType(Class<? extends TFType> c) {
-    DataType dtype = typeCodes.get(c);
-    if (dtype == null) {
-      throw new IllegalArgumentException("" + c + " is not a TensorFlow type.");
-    }
-    return dtype;
-  }
-
-  static final Map<Class<?>, Object> scalars = new HashMap<>();
-
-  /** Returns the zero value of type described by {@code c}, or null if
-   *  the type (e.g., string) is not numeric and therefore has no zero value.
-   */
-  public static Object zeroValue(Class<? extends TFType> c) {
-    return scalars.get(c);
-  }
-}
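The deleted `Types.java` documented a simple registry technique: a compile-time marker class per TensorFlow type, plus runtime maps from that class to its `DataType` code and zero value. A rough Python analogue of that pattern, with hypothetical names, just to make the mechanism concrete:

```python
# Sketch of the registry pattern the deleted Types.java implemented.
TYPE_CODES = {}   # marker class -> DataType code
SCALARS = {}      # marker class -> zero value, if the type is numeric

class TFUInt8(object):
    """Marker standing in for the deleted Java TFUInt8 class."""

TYPE_CODES[TFUInt8] = "UINT8"
SCALARS[TFUInt8] = 0

def data_type(cls):
    try:
        return TYPE_CODES[cls]
    except KeyError:
        raise ValueError("%r is not a TensorFlow type." % cls)

print(data_type(TFUInt8))  # UINT8
```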
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index c62ff10..97467c5 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -1238,6 +1238,7 @@
     srcs = ["framework/dtypes_test.py"],
     main = "framework/dtypes_test.py",
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     deps = [
         ":framework_for_generated_wrappers",
         ":framework_test_lib",
@@ -3506,6 +3507,7 @@
     size = "small",
     srcs = ["lib/core/bfloat16_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     deps = [
         ":client_testlib",
         ":lib",
diff --git a/tensorflow/python/debug/wrappers/dumping_wrapper.py b/tensorflow/python/debug/wrappers/dumping_wrapper.py
index 962318e..3fac2e5 100644
--- a/tensorflow/python/debug/wrappers/dumping_wrapper.py
+++ b/tensorflow/python/debug/wrappers/dumping_wrapper.py
@@ -73,6 +73,7 @@
         self, sess, watch_fn=watch_fn, thread_name_filter=thread_name_filter,
         pass_through_operrors=pass_through_operrors)
 
+    session_root = os.path.expanduser(session_root)
     if gfile.Exists(session_root):
       if not gfile.IsDirectory(session_root):
         raise ValueError(
diff --git a/tensorflow/python/debug/wrappers/local_cli_wrapper.py b/tensorflow/python/debug/wrappers/local_cli_wrapper.py
index c46a4e7..1465cb7 100644
--- a/tensorflow/python/debug/wrappers/local_cli_wrapper.py
+++ b/tensorflow/python/debug/wrappers/local_cli_wrapper.py
@@ -82,6 +82,7 @@
     if not dump_root:
       self._dump_root = tempfile.mktemp(prefix=_DUMP_ROOT_PREFIX)
     else:
+      dump_root = os.path.expanduser(dump_root)
       if os.path.isfile(dump_root):
         raise ValueError("dump_root path points to a file: %s" % dump_root)
       elif os.path.isdir(dump_root) and os.listdir(dump_root):
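Both debugger wrappers now expand `~` in user-supplied paths before any filesystem checks. A minimal sketch of why this matters, with a hypothetical path:

```python
import os

# gfile/os checks do not expand '~' themselves: a literal '~/tfdbg' would be
# treated as a relative directory named '~'. Expand it explicitly first.
raw = "~/tfdbg_dumps"             # hypothetical user input
expanded = os.path.expanduser(raw)
print(expanded)                   # e.g. /home/user/tfdbg_dumps on Linux
```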
diff --git a/tensorflow/python/eager/backprop.py b/tensorflow/python/eager/backprop.py
index a06feb1..048dc92 100644
--- a/tensorflow/python/eager/backprop.py
+++ b/tensorflow/python/eager/backprop.py
@@ -550,7 +550,7 @@
 
 
 def val_and_grad_function(f, params=None):
-  """Returns a function that computes f and is derivative w.r.t. params.
+  """Returns a function that computes f and its derivative w.r.t. params.
 
   Example:
   ```python
diff --git a/tensorflow/python/estimator/training_test.py b/tensorflow/python/estimator/training_test.py
index 2d3f5d6..4f7da84 100644
--- a/tensorflow/python/estimator/training_test.py
+++ b/tensorflow/python/estimator/training_test.py
@@ -326,7 +326,7 @@
       mock_executor.assert_called_with(estimator=mock_est,
                                        train_spec=mock_train_spec,
                                        eval_spec=mock_eval_spec)
-      mock_executor_instance.run.assert_called()
+      self.assertTrue(mock_executor_instance.run.called)
 
   def test_error_out_if_evaluator_task_id_is_non_zero(self):
     tf_config = {
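The mock assertion is rewritten because `assert_called()` only exists on newer versions of `mock`; on older versions, accessing an unknown attribute on a `Mock` silently creates a child mock, so the call passes vacuously instead of asserting. Checking the boolean `called` flag is portable; a small sketch:

```python
from unittest import mock  # or `import mock` on Python 2

m = mock.Mock()
assert not m.called   # the `called` flag exists on every mock version
m()
assert m.called
# By contrast, on old mock versions a call like m.assert_callled() (note the
# typo) would silently return a new child Mock instead of failing the test.
```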
diff --git a/tensorflow/python/ops/gradient_checker.py b/tensorflow/python/ops/gradient_checker.py
index 1ff1968..65cc6ff 100644
--- a/tensorflow/python/ops/gradient_checker.py
+++ b/tensorflow/python/ops/gradient_checker.py
@@ -181,7 +181,7 @@
 
 
 def _compute_dx_and_dy(x, y, y_shape):
-  """Returns a node to compute gradient of x wrt y."""
+  """Returns a node to compute gradient of y wrt x."""
   # We make up a dy so that we can compute the gradients. We don't really use
   # the value of dy -- we will always feed it. We need to add an identity node
   # so that we can always feed it properly. Otherwise, for the Add operation,
@@ -189,7 +189,7 @@
   with x.graph.as_default():
     dy_orig = constant_op.constant(1.0, shape=y_shape, dtype=y.dtype)
     dy = array_ops.identity(dy_orig)
-  # We compute the gradients for x wrt. y
+  # We compute the gradients for y wrt. x
   grads = gradients.gradients(y, x, dy)
   assert len(grads) == 1
   return grads[0], dy_orig
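The corrected docstring matches what the code computes: `tf.gradients(y, x, dy)` differentiates `y` with respect to `x`, weighted by the fed upstream gradient `dy`. A minimal TF 1.x illustration:

```python
import tensorflow as tf

x = tf.constant([1.0, 2.0, 3.0])
y = x * x
dy = tf.ones_like(y)                        # the made-up upstream gradient
grad = tf.gradients(y, [x], grad_ys=dy)[0]  # gradient of y w.r.t. x, times dy
with tf.Session() as sess:
    print(sess.run(grad))                   # [2. 4. 6.]
```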
diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py
index 7f494db..9bebffd 100644
--- a/tensorflow/python/ops/image_ops_impl.py
+++ b/tensorflow/python/ops/image_ops_impl.py
@@ -999,8 +999,8 @@
 
   Args:
     image : A Tensor.
-    gamma : A scalar. Non negative real number.
-    gain  : A scalar. The constant multiplier.
+    gamma : A scalar or tensor. Non-negative real number.
+    gain  : A scalar or tensor. The constant multiplier.
 
   Returns:
     A Tensor. Gamma corrected output image.
@@ -1019,17 +1019,20 @@
   """
 
   with ops.op_scope([image, gamma, gain], None, 'adjust_gamma'):
-    # Convert pixel value to DT_FLOAT for computing adjusted image
+    # Convert pixel value to DT_FLOAT for computing adjusted image.
     img = ops.convert_to_tensor(image, name='img', dtype=dtypes.float32)
-    # Keep image dtype for computing the scale of corresponding dtype
+    # Keep image dtype for computing the scale of corresponding dtype.
     image = ops.convert_to_tensor(image, name='image')
 
-    if gamma < 0:
-      raise ValueError('Gamma should be a non-negative real number')
-    # scale = max(dtype) - min(dtype)
+    assert_op = _assert(gamma >= 0, ValueError,
+                        'Gamma should be a non-negative real number.')
+    if assert_op:
+      gamma = control_flow_ops.with_dependencies(assert_op, gamma)
+
+    # scale = max(dtype) - min(dtype).
     scale = constant_op.constant(image.dtype.limits[1] - image.dtype.limits[0],
                                  dtype=dtypes.float32)
-    # According to the definition of gamma correction
+    # According to the definition of gamma correction.
     adjusted_img = (img / scale) ** gamma * scale * gain
 
     return adjusted_img
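Since `gamma` may now be a tensor whose value is unknown when the graph is built, the eager Python check becomes a graph-level assertion that fires at run time. A sketch of the same pattern using the public `tf.Assert` API, with a hypothetical placeholder setup:

```python
import tensorflow as tf

gamma_ph = tf.placeholder(tf.float32, shape=[])
assert_op = tf.Assert(tf.greater_equal(gamma_ph, 0.0),
                      ['Gamma should be a non-negative real number.'])
# The assertion only executes if the result depends on it, so thread it in
# as a control dependency before gamma is used.
with tf.control_dependencies([assert_op]):
    gamma = tf.identity(gamma_ph)
adjusted = tf.constant(0.5) ** gamma
with tf.Session() as sess:
    print(sess.run(adjusted, feed_dict={gamma_ph: 2.0}))  # 0.25
    sess.run(adjusted, feed_dict={gamma_ph: -1.0})  # InvalidArgumentError
```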
diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py
index 3d73b77..3a49d41 100644
--- a/tensorflow/python/ops/image_ops_test.py
+++ b/tensorflow/python/ops/image_ops_test.py
@@ -189,6 +189,44 @@
 
       self.assertAllClose(y_tf, y_np, 1e-6)
 
+  def test_adjust_gamma_less_zero(self):
+    """White image should be returned for gamma equal to zero"""
+    with self.test_session():
+      x_data = np.random.uniform(0, 255, (8, 8))
+      x_np = np.array(x_data, dtype=np.float32)
+
+      x = constant_op.constant(x_np, shape=x_np.shape)
+
+      err_msg = 'Gamma should be a non-negative real number.'
+
+      try:
+        image_ops.adjust_gamma(x, gamma=-1)
+      except Exception as e:
+        if err_msg not in str(e):
+          raise
+      else:
+        raise AssertionError("Exception not raised: %s" % err_msg)
+
+  def test_adjust_gamma_less_zero_tensor(self):
+    """White image should be returned for gamma equal to zero"""
+    with self.test_session():
+      x_data = np.random.uniform(0, 255, (8, 8))
+      x_np = np.array(x_data, dtype=np.float32)
+
+      x = constant_op.constant(x_np, shape=x_np.shape)
+      y = constant_op.constant(-1.0, dtype=dtypes.float32)
+
+      image = image_ops.adjust_gamma(x, gamma=y)
+
+      err_msg = 'Gamma should be a non-negative real number.'
+      try:
+        image.eval()
+      except Exception as e:
+        if err_msg not in str(e):
+          raise
+      else:
+        raise AssertionError("Exception not raised: %s" % err_msg)
+
   def test_adjust_gamma_zero(self):
     """White image should be returned for gamma equal to zero"""
     with self.test_session():
diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py
index 60a32b1..879c206 100644
--- a/tensorflow/python/ops/resource_variable_ops.py
+++ b/tensorflow/python/ops/resource_variable_ops.py
@@ -886,11 +886,6 @@
   # Build appropriately shaped IndexedSlices
   handle = op.inputs[0]
   indices = op.inputs[1]
-  if context.in_graph_mode():
-    # Walk graph back until the original handle is found.
-    # TODO(apassos): implement this for EAGER mode.
-    while handle.op.type != "VarHandleOp":
-      handle = handle.op.inputs[0]
   params_shape = gen_resource_variable_ops.variable_shape(handle)
   size = array_ops.expand_dims(array_ops.size(indices), 0)
   values_shape = array_ops.concat([size, params_shape[1:]], 0)
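With the graph walk gone, the gather gradient reads the parameter shape straight off the resource handle and builds the `IndexedSlices` values shape as `[num_indices] + params_shape[1:]`. The shape arithmetic, sketched with ordinary ops and hypothetical sizes:

```python
import tensorflow as tf

params = tf.zeros([10, 3, 4])               # hypothetical variable value
indices = tf.constant([2, 5, 7])
size = tf.expand_dims(tf.size(indices), 0)  # -> [3]
values_shape = tf.concat([size, tf.shape(params)[1:]], 0)
with tf.Session() as sess:
    print(sess.run(values_shape))           # [3 3 4]
```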
diff --git a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
index 6e846ef..bfaa044 100755
--- a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
@@ -47,8 +47,6 @@
 
 ./configure
 make altinstall
-pip3.6 -V
-which pip3.6
 ln -s /usr/local/bin/pip3.6 /usr/local/bin/pip3
 
 pip3 install --upgrade virtualenv
@@ -73,7 +71,7 @@
 
 pip3 install scipy==0.18.1
 
-pip3 install scikit-learn==0.18.1
+pip3 install scikit-learn==0.19.1
 
 # pandas required by `inflow`
 pip3 install pandas==0.19.2
diff --git a/tensorflow/tools/compatibility/ast_edits.py b/tensorflow/tools/compatibility/ast_edits.py
deleted file mode 100644
index e7e4c91..0000000
--- a/tensorflow/tools/compatibility/ast_edits.py
+++ /dev/null
@@ -1,497 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Upgrader for Python scripts according to an API change specification."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import ast
-import collections
-import os
-import shutil
-import sys
-import tempfile
-import traceback
-
-
-class APIChangeSpec(object):
-  """This class defines the transformations that need to happen.
-
-  This class must provide the following fields:
-
-  * `function_keyword_renames`: maps function names to a map of old -> new
-    argument names
-  * `function_renames`: maps function names to new function names
-  * `change_to_function`: a set of function names that have changed (for
-    notifications)
-  * `function_reorders`: maps functions whose argument order has changed to the
-    list of arguments in the new order
-  * `function_handle`: maps function names to custom handlers for the function
-
-  For an example, see `TFAPIChangeSpec`.
-  """
-
-
-class _FileEditTuple(collections.namedtuple(
-    "_FileEditTuple", ["comment", "line", "start", "old", "new"])):
-  """Each edit that is recorded by a _FileEditRecorder.
-
-  Fields:
-    comment: A description of the edit and why it was made.
-    line: The line number in the file where the edit occurs (1-indexed).
-    start: The column offset in the line where the edit begins (0-indexed).
-    old: text string to remove (this must match what was in file).
-    new: text string to add in place of `old`.
-  """
-
-  __slots__ = ()
-
-
-class _FileEditRecorder(object):
-  """Record changes that need to be done to the file."""
-
-  def __init__(self, filename):
-    # all edits are lists of chars
-    self._filename = filename
-
-    self._line_to_edit = collections.defaultdict(list)
-    self._errors = []
-
-  def process(self, text):
-    """Process a list of strings, each corresponding to the recorded changes.
-
-    Args:
-      text: A list of lines of text (assumed to contain newlines)
-    Returns:
-      A tuple of (modified text, a report describing the edits, errors).
-    Raises:
-      ValueError: if substitution source location does not have expected text.
-    """
-
-    change_report = ""
-
-    # Iterate over each line.
-    for line, edits in self._line_to_edit.items():
-      offset = 0
-      # Sort by column so that edits are processed in order, making the
-      # indexing adjustments cumulative for changes that alter the string
-      # length.
-      edits.sort(key=lambda x: x.start)
-
-      # Extract each line to a list of characters, because mutable lists
-      # are editable, unlike immutable strings.
-      char_array = list(text[line - 1])
-
-      # Record a description of the change
-      change_report += "%r Line %d\n" % (self._filename, line)
-      change_report += "-" * 80 + "\n\n"
-      for e in edits:
-        change_report += "%s\n" % e.comment
-      change_report += "\n    Old: %s" % (text[line - 1])
-
-      # Make underscore buffers for underlining where in the line the edit was
-      change_list = [" "] * len(text[line - 1])
-      change_list_new = [" "] * len(text[line - 1])
-
-      # Iterate for each edit
-      for e in edits:
-        # Create effective start, end by accounting for change in length due
-        # to previous edits
-        start_eff = e.start + offset
-        end_eff = start_eff + len(e.old)
-
-        # Make sure the edit is changing what it should be changing
-        old_actual = "".join(char_array[start_eff:end_eff])
-        if old_actual != e.old:
-          raise ValueError("Expected text %r but got %r" %
-                           ("".join(e.old), "".join(old_actual)))
-        # Make the edit
-        char_array[start_eff:end_eff] = list(e.new)
-
-        # Create the underline highlighting of the before and after
-        change_list[e.start:e.start + len(e.old)] = "~" * len(e.old)
-        change_list_new[start_eff:end_eff] = "~" * len(e.new)
-
-        # Keep track of how to generate effective ranges
-        offset += len(e.new) - len(e.old)
-
-      # Finish the report comment
-      change_report += "         %s\n" % "".join(change_list)
-      text[line - 1] = "".join(char_array)
-      change_report += "    New: %s" % (text[line - 1])
-      change_report += "         %s\n\n" % "".join(change_list_new)
-    return "".join(text), change_report, self._errors
-
-  def add(self, comment, line, start, old, new, error=None):
-    """Add a new change that is needed.
-
-    Args:
-      comment: A description of what was changed
-      line: Line number (1 indexed)
-      start: Column offset (0 indexed)
-      old: old text
-      new: new text
-      error: this "edit" is something that cannot be fixed automatically
-    Returns:
-      None
-    """
-
-    self._line_to_edit[line].append(
-        _FileEditTuple(comment, line, start, old, new))
-    if error:
-      self._errors.append("%s:%d: %s" % (self._filename, line, error))
-
-
-class _ASTCallVisitor(ast.NodeVisitor):
-  """AST Visitor that processes function calls.
-
-  Updates function calls from old API version to new API version using a given
-  change spec.
-  """
-
-  def __init__(self, filename, lines, api_change_spec):
-    self._filename = filename
-    self._file_edit = _FileEditRecorder(filename)
-    self._lines = lines
-    self._api_change_spec = api_change_spec
-
-  def process(self, lines):
-    return self._file_edit.process(lines)
-
-  def generic_visit(self, node):
-    ast.NodeVisitor.generic_visit(self, node)
-
-  def _rename_functions(self, node, full_name):
-    function_renames = self._api_change_spec.function_renames
-    try:
-      new_name = function_renames[full_name]
-      self._file_edit.add("Renamed function %r to %r" % (full_name,
-                                                         new_name),
-                          node.lineno, node.col_offset, full_name, new_name)
-    except KeyError:
-      pass
-
-  def _get_attribute_full_path(self, node):
-    """Traverse an attribute to generate a full name e.g. tf.foo.bar.
-
-    Args:
-      node: A Node of type Attribute.
-
-    Returns:
-      a '.'-delimited full-name or None if the tree was not a simple form.
-      i.e. `(foo()+b).bar` returns None, while `a.b.c` returns "a.b.c".
-    """
-    curr = node
-    items = []
-    while not isinstance(curr, ast.Name):
-      if not isinstance(curr, ast.Attribute):
-        return None
-      items.append(curr.attr)
-      curr = curr.value
-    items.append(curr.id)
-    return ".".join(reversed(items))
-
-  def _find_true_position(self, node):
-    """Return correct line number and column offset for a given node.
-
-    This is necessary mainly because ast.ListComp reports the location of
-    the first token after the opening '[' of the list comprehension.
-
-    Args:
-      node: Node for which we wish to know the lineno and col_offset
-    """
-    import re
-    find_open = re.compile("^\s*(\\[).*$")
-    find_string_chars = re.compile("['\"]")
-
-    if isinstance(node, ast.ListComp):
-      # Strangely, ast.ListComp returns the col_offset of the first token
-      # after the '[' token which appears to be a bug. Workaround by
-      # explicitly finding the real start of the list comprehension.
-      line = node.lineno
-      col = node.col_offset
-      # loop over lines
-      while 1:
-        # Reverse the preceding text and regex-search it for whitespace.
-        text = self._lines[line-1]
-        reversed_preceding_text = text[:col][::-1]
-        # First find if a [ can be found with only whitespace between it and
-        # col.
-        m = find_open.match(reversed_preceding_text)
-        if m:
-          new_col_offset = col - m.start(1) - 1
-          return line, new_col_offset
-        else:
-          if (reversed_preceding_text=="" or
-             reversed_preceding_text.isspace()):
-            line = line - 1
-            prev_line = self._lines[line - 1]
-            # TODO(aselle):
-            # this is poor comment detection, but it is good enough for
-            # cases where the comment does not contain string literal starting/
-            # ending characters. If ast gave us start and end locations of the
-            # ast nodes rather than just start, we could use string literal
-            # node ranges to filter out spurious #'s that appear in string
-            # literals.
-            comment_start = prev_line.find("#")
-            if comment_start ==  -1:
-              col = len(prev_line) -1
-            elif find_string_chars.search(prev_line[comment_start:]) is None:
-              col = comment_start
-            else:
-              return None, None
-          else:
-            return None, None
-    # Most other nodes return proper locations (`with` notably does not),
-    # but a `with` statement cannot appear inside an argument anyway.
-    return node.lineno, node.col_offset
-
-
-  def visit_Call(self, node):  # pylint: disable=invalid-name
-    """Handle visiting a call node in the AST.
-
-    Args:
-      node: Current Node
-    """
-
-
-    # Find a simple attribute name path e.g. "tf.foo.bar"
-    full_name = self._get_attribute_full_path(node.func)
-
-    # Make sure the func is marked as being part of a call
-    node.func.is_function_for_call = True
-
-    if full_name:
-      # Call special handlers
-      function_handles = self._api_change_spec.function_handle
-      if full_name in function_handles:
-        function_handles[full_name](self._file_edit, node)
-
-      # Examine any non-keyword argument and make it into a keyword argument
-      # if reordering required.
-      function_reorders = self._api_change_spec.function_reorders
-      function_keyword_renames = (
-          self._api_change_spec.function_keyword_renames)
-
-      if full_name in function_reorders:
-        reordered = function_reorders[full_name]
-        for idx, arg in enumerate(node.args):
-          lineno, col_offset = self._find_true_position(arg)
-          if lineno is None or col_offset is None:
-            self._file_edit.add(
-                "Failed to add keyword %r to reordered function %r"
-                % (reordered[idx], full_name), arg.lineno, arg.col_offset,
-                "", "",
-                error="A necessary keyword argument failed to be inserted.")
-          else:
-            keyword_arg = reordered[idx]
-            if (full_name in function_keyword_renames and
-                keyword_arg in function_keyword_renames[full_name]):
-              keyword_arg = function_keyword_renames[full_name][keyword_arg]
-            self._file_edit.add("Added keyword %r to reordered function %r"
-                                % (reordered[idx], full_name), lineno,
-                                col_offset, "", keyword_arg + "=")
-
-      # Examine each keyword argument and convert it to the final renamed form
-      renamed_keywords = ({} if full_name not in function_keyword_renames else
-                          function_keyword_renames[full_name])
-      for keyword in node.keywords:
-        argkey = keyword.arg
-        argval = keyword.value
-
-        if argkey in renamed_keywords:
-          argval_lineno, argval_col_offset = self._find_true_position(argval)
-          if argval_lineno is not None and argval_col_offset is not None:
-            # TODO(aselle): We should scan backward to find the start of the
-            # keyword key. Unfortunately ast does not give you the location of
-            # keyword keys, so we are forced to infer it from the keyword arg
-            # value.
-            key_start = argval_col_offset - len(argkey) - 1
-            key_end = key_start + len(argkey) + 1
-            if (self._lines[argval_lineno - 1][key_start:key_end] ==
-                argkey + "="):
-              self._file_edit.add("Renamed keyword argument from %r to %r" %
-                                  (argkey, renamed_keywords[argkey]),
-                                  argval_lineno,
-                                  argval_col_offset - len(argkey) - 1,
-                                  argkey + "=", renamed_keywords[argkey] + "=")
-              continue
-          self._file_edit.add(
-              "Failed to rename keyword argument from %r to %r" %
-              (argkey, renamed_keywords[argkey]),
-              argval.lineno,
-              argval.col_offset - len(argkey) - 1,
-              "", "",
-              error="Failed to find keyword lexographically. Fix manually.")
-
-    ast.NodeVisitor.generic_visit(self, node)
-
-  def visit_Attribute(self, node):  # pylint: disable=invalid-name
-    """Handle bare Attributes i.e. [tf.foo, tf.bar].
-
-    Args:
-      node: Node that is of type ast.Attribute
-    """
-    full_name = self._get_attribute_full_path(node)
-    if full_name:
-      self._rename_functions(node, full_name)
-    if full_name in self._api_change_spec.change_to_function:
-      if not hasattr(node, "is_function_for_call"):
-        new_text = full_name + "()"
-        self._file_edit.add("Changed %r to %r"%(full_name, new_text),
-                            node.lineno, node.col_offset, full_name, new_text)
-
-    ast.NodeVisitor.generic_visit(self, node)
-
-
-class ASTCodeUpgrader(object):
-  """Handles upgrading a set of Python files using a given API change spec."""
-
-  def __init__(self, api_change_spec):
-    if not isinstance(api_change_spec, APIChangeSpec):
-      raise TypeError("Must pass APIChangeSpec to ASTCodeUpgrader, got %s" %
-                      type(api_change_spec))
-    self._api_change_spec = api_change_spec
-
-  def process_file(self, in_filename, out_filename):
-    """Process the given python file for incompatible changes.
-
-    Args:
-      in_filename: filename to parse
-      out_filename: output file to write to
-    Returns:
-      A tuple representing number of files processed, log of actions, errors
-    """
-
-    # Write to a temporary file, just in case we are doing an in-place
-    # modification.
-    with open(in_filename, "r") as in_file, \
-        tempfile.NamedTemporaryFile("w", delete=False) as temp_file:
-      ret = self.process_opened_file(
-          in_filename, in_file, out_filename, temp_file)
-
-    shutil.move(temp_file.name, out_filename)
-    return ret
-
-  # Broad exceptions are required here because ast throws whatever it wants.
-  # pylint: disable=broad-except
-  def process_opened_file(self, in_filename, in_file, out_filename, out_file):
-    """Process the given python file for incompatible changes.
-
-    This function is split out to facilitate StringIO testing from
-    tf_upgrade_test.py.
-
-    Args:
-      in_filename: filename to parse
-      in_file: opened file (or StringIO)
-      out_filename: output file to write to
-      out_file: opened file (or StringIO)
-    Returns:
-      A tuple representing number of files processed, log of actions, errors
-    """
-    process_errors = []
-    text = "-" * 80 + "\n"
-    text += "Processing file %r\n outputting to %r\n" % (in_filename,
-                                                         out_filename)
-    text += "-" * 80 + "\n\n"
-
-    parsed_ast = None
-    lines = in_file.readlines()
-    try:
-      parsed_ast = ast.parse("".join(lines))
-    except Exception:
-      text += "Failed to parse %r\n\n" % in_filename
-      text += traceback.format_exc()
-    if parsed_ast:
-      visitor = _ASTCallVisitor(in_filename, lines, self._api_change_spec)
-      visitor.visit(parsed_ast)
-      out_text, new_text, process_errors = visitor.process(lines)
-      text += new_text
-      if out_file:
-        out_file.write(out_text)
-    text += "\n"
-    return 1, text, process_errors
-  # pylint: enable=broad-except
-
-  def process_tree(self, root_directory, output_root_directory,
-                   copy_other_files):
-    """Processes upgrades on an entire tree of python files in place.
-
-    Note that only Python files are processed. If you have custom code in
-    other languages, you will need to upgrade it manually.
-
-    Args:
-      root_directory: Directory to walk and process.
-      output_root_directory: Directory to use as base.
-      copy_other_files: Copy files that are not touched by this converter.
-
-    Returns:
-      A tuple of the number of files processed, the report string for all
-      files, and a list of errors.
-    """
-
-    # make sure output directory doesn't exist
-    if output_root_directory and os.path.exists(output_root_directory):
-      print("Output directory %r must not already exist." % (
-          output_root_directory))
-      sys.exit(1)
-
-    # make sure output directory does not overlap with root_directory
-    norm_root = os.path.split(os.path.normpath(root_directory))
-    norm_output = os.path.split(os.path.normpath(output_root_directory))
-    if norm_root == norm_output:
-      print("Output directory %r same as input directory %r" % (
-          root_directory, output_root_directory))
-      sys.exit(1)
-
-    # Collect the list of files to process up front, to correctly handle the
-    # case where the output directory is a subdirectory of the input directory.
-    files_to_process = []
-    files_to_copy = []
-    for dir_name, _, file_list in os.walk(root_directory):
-      py_files = [f for f in file_list if f.endswith(".py")]
-      copy_files = [f for f in file_list if not f.endswith(".py")]
-      for filename in py_files:
-        fullpath = os.path.join(dir_name, filename)
-        fullpath_output = os.path.join(
-            output_root_directory, os.path.relpath(fullpath, root_directory))
-        files_to_process.append((fullpath, fullpath_output))
-      if copy_other_files:
-        for filename in copy_files:
-          fullpath = os.path.join(dir_name, filename)
-          fullpath_output = os.path.join(
-              output_root_directory, os.path.relpath(fullpath, root_directory))
-          files_to_copy.append((fullpath, fullpath_output))
-
-    file_count = 0
-    tree_errors = []
-    report = ""
-    report += ("=" * 80) + "\n"
-    report += "Input tree: %r\n" % root_directory
-    report += ("=" * 80) + "\n"
-
-    for input_path, output_path in files_to_process:
-      output_directory = os.path.dirname(output_path)
-      if not os.path.isdir(output_directory):
-        os.makedirs(output_directory)
-      file_count += 1
-      _, l_report, l_errors = self.process_file(input_path, output_path)
-      tree_errors += l_errors
-      report += l_report
-    for input_path, output_path in files_to_copy:
-      output_directory = os.path.dirname(output_path)
-      if not os.path.isdir(output_directory):
-        os.makedirs(output_directory)
-      shutil.copy(input_path, output_path)
-    return file_count, report, tree_errors
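The deleted upgrader's core technique is an `ast.NodeVisitor` that reconstructs dotted names such as `tf.foo.bar` from `Attribute` chains and records an edit per call site. A self-contained sketch of that traversal (not the deleted class itself):

```python
import ast

SOURCE = "tf.mul(a, b)\nresult = tf.sub(x, y)\n"

class CallFinder(ast.NodeVisitor):
    """Collects the dotted name and position of every function call."""

    def __init__(self):
        self.calls = []

    def _full_name(self, node):
        # Walk an Attribute chain (tf.foo.bar) down to its root Name.
        parts = []
        while isinstance(node, ast.Attribute):
            parts.append(node.attr)
            node = node.value
        if isinstance(node, ast.Name):
            parts.append(node.id)
            return ".".join(reversed(parts))
        return None  # not a simple dotted form, e.g. (f() + g).bar

    def visit_Call(self, node):
        name = self._full_name(node.func)
        if name:
            self.calls.append((name, node.lineno, node.col_offset))
        self.generic_visit(node)

finder = CallFinder()
finder.visit(ast.parse(SOURCE))
print(finder.calls)  # [('tf.mul', 1, 0), ('tf.sub', 2, 9)]
```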
diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel
index cd22f18..5dc4a05 100644
--- a/tensorflow/tools/docker/Dockerfile.devel
+++ b/tensorflow/tools/docker/Dockerfile.devel
@@ -70,7 +70,7 @@
 
 # Download and build TensorFlow.
 WORKDIR /tensorflow
-RUN git clone --branch=r1.4 --depth=1 https://github.com/tensorflow/tensorflow.git .
+RUN git clone --branch=r1.5 --depth=1 https://github.com/tensorflow/tensorflow.git .
 
 # TODO(craigcitro): Don't install the pip package, since it makes it
 # more difficult to experiment with local changes. Instead, just add
diff --git a/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl b/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl
index 8180e5e..3c15fc9 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl
+++ b/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl
@@ -3,7 +3,7 @@
 LABEL maintainer="Clayne Robison<clayne.b.robison@intel.com>"
 
 # These arguments are parameterized. Use --build-args to override.
-ARG TF_BRANCH=r1.4
+ARG TF_BRANCH=r1.5
 ARG WHL_DIR=/whl
 
 RUN apt-get update && apt-get install -y --no-install-recommends \
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index d0c540ae..07ffd38 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -79,7 +79,7 @@
 
 # Download and build TensorFlow.
 WORKDIR /tensorflow
-RUN git clone --branch=r1.4 --depth=1 https://github.com/tensorflow/tensorflow.git .
+RUN git clone --branch=r1.5 --depth=1 https://github.com/tensorflow/tensorflow.git .
 
 # Configure the build for our CUDA configuration.
 ENV CI_BUILD_PYTHON python
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu-cuda9-cudnn7 b/tensorflow/tools/docker/Dockerfile.devel-gpu-cuda9-cudnn7
deleted file mode 100644
index 3bedc8c..0000000
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu-cuda9-cudnn7
+++ /dev/null
@@ -1,115 +0,0 @@
-FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
-
-LABEL maintainer="Gunhan Gulsoy <gunan@google.com>"
-
-# It is possible to override these for releases.
-ARG TF_BRANCH=master
-ARG BAZEL_VERSION=0.5.4
-ARG TF_AVAILABLE_CPUS=32
-
-RUN apt-get update && apt-get install -y --no-install-recommends \
-        build-essential \
-        curl \
-        git \
-        golang \
-        libcurl3-dev \
-        libfreetype6-dev \
-        libpng12-dev \
-        libzmq3-dev \
-        pkg-config \
-        python-dev \
-        python-pip \
-        rsync \
-        software-properties-common \
-        unzip \
-        zip \
-        zlib1g-dev \
-        openjdk-8-jdk \
-        openjdk-8-jre-headless \
-        wget \
-        && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
-RUN pip --no-cache-dir install --upgrade \
-        pip setuptools
-
-RUN pip --no-cache-dir install \
-        ipykernel \
-        jupyter \
-        matplotlib \
-        numpy \
-        scipy \
-        sklearn \
-        pandas \
-        wheel \
-        && \
-    python -m ipykernel.kernelspec
-
-# Set up our notebook config.
-COPY jupyter_notebook_config.py /root/.jupyter/
-
-# Jupyter has issues with being run directly:
-#   https://github.com/ipython/ipython/issues/7062
-# We just add a little wrapper script.
-COPY run_jupyter.sh /
-
-# Set up Bazel.
-
-# Running bazel inside a `docker build` command causes trouble, cf:
-#   https://github.com/bazelbuild/bazel/issues/134
-# The easiest solution is to set up a bazelrc file forcing --batch.
-RUN echo "startup --batch" >>/etc/bazel.bazelrc
-# Similarly, we need to workaround sandboxing issues:
-#   https://github.com/bazelbuild/bazel/issues/418
-RUN echo "build --spawn_strategy=standalone --genrule_strategy=standalone" \
-    >>/etc/bazel.bazelrc
-WORKDIR /
-RUN mkdir /bazel && \
-    cd /bazel && \
-    wget --quiet https://github.com/bazelbuild/bazel/releases/download/$BAZEL_VERSION/bazel-$BAZEL_VERSION-installer-linux-x86_64.sh && \
-    wget --quiet https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE && \
-    chmod +x bazel-*.sh && \
-    ./bazel-$BAZEL_VERSION-installer-linux-x86_64.sh && \
-    rm -f /bazel/bazel-$BAZEL_VERSION-installer-linux-x86_64.sh
-
-# Download and build TensorFlow.
-WORKDIR /
-RUN git clone https://github.com/tensorflow/tensorflow.git && \
-    cd tensorflow && \
-    git checkout ${TF_BRANCH}
-WORKDIR /tensorflow
-
-# Configure the build for our CUDA configuration.
-ENV CI_BUILD_PYTHON=python \
-    LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:${LD_LIBRARY_PATH} \
-    CUDNN_INSTALL_PATH=/usr/lib/x86_64-linux-gnu \
-    PYTHON_BIN_PATH=/usr/bin/python \
-    PYTHON_LIB_PATH=/usr/local/lib/python2.7/dist-packages \
-    TF_NEED_CUDA=1 \
-    TF_CUDA_VERSION=9.0 \
-    TF_CUDA_COMPUTE_CAPABILITIES=3.0,3.5,5.2,6.0,6.1,7.0 \
-    TF_CUDNN_VERSION=7
-RUN ./configure
-
-# Build and Install TensorFlow.
-RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 && \
-    LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:${LD_LIBRARY_PATH} \
-    bazel build -c opt \
-                --config=cuda \
-                --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0" \
-                --jobs=${TF_AVAILABLE_CPUS} \
-                tensorflow/tools/pip_package:build_pip_package && \
-    mkdir /pip_pkg && \
-    bazel-bin/tensorflow/tools/pip_package/build_pip_package /pip_pkg && \
-    pip --no-cache-dir install --upgrade /pip_pkg/tensorflow-*.whl && \
-    rm -rf /pip_pkg && \
-    rm -rf /root/.cache
-# Clean up pip wheel and Bazel cache when done.
-
-WORKDIR /root
-
-# TensorBoard
-EXPOSE 6006
-# IPython
-EXPOSE 8888
diff --git a/tensorflow/tools/git/gen_git_source.py b/tensorflow/tools/git/gen_git_source.py
index f2845c8..3630dbd 100755
--- a/tensorflow/tools/git/gen_git_source.py
+++ b/tensorflow/tools/git/gen_git_source.py
@@ -16,7 +16,10 @@
 """Help include git hash in tensorflow bazel build.
 
 This creates symlinks from the internal git repository directory so
-that the build system can see changes in the version state.
+that the build system can see changes in the version state. We also
+record which branch git was on, so that when the branch changes we can
+detect that the ref file is stale and suggest that users run
+./configure again.
 
 NOTE: this script is only used in opensource.
 
@@ -218,14 +221,13 @@
   if not data["git"]:
     git_version = b"unknown"
   else:
-    old_branch = data["branch"]		
+    old_branch = data["branch"]
     new_branch = parse_branch_ref(head_symlink)
     if new_branch != old_branch:
-      print("Warning, run ./configure again, to get __git_version__ to record "
-            "correct version")
-      git_version = get_git_version(data["path"])+'-inconsistent-git-version'
-    else:
-      git_version = get_git_version(data["path"])
+      raise RuntimeError(
+          "Run ./configure again, branch was '%s' but is now '%s'" %
+          (old_branch, new_branch))
+    git_version = get_git_version(data["path"])
   write_version_info(dest_file, git_version)
 
 
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index 72116f7..c32461d 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -154,6 +154,7 @@
             "//tensorflow:tensorflow_py",
             "//tensorflow/contrib/boosted_trees:boosted_trees_pip",
             "//tensorflow/contrib/cluster_resolver:cluster_resolver_pip",
+            "//tensorflow/contrib/data/python/kernel_tests:dataset_serialization_test",
             "//tensorflow/contrib/data/python/ops:prefetching_py",
             "//tensorflow/contrib/eager/python/examples:examples_pip",
             "//tensorflow/contrib/eager/python:checkpointable",
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index e03faee..2e31d6e 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -29,7 +29,7 @@
 # This version string is semver compatible, but incompatible with pip.
 # For pip, we will remove all '-' characters from this string, and use the
 # result for pip.
-_VERSION = '1.4.0'
+_VERSION = '1.5.0-rc0'
 
 REQUIRED_PACKAGES = [
     'absl-py >= 0.1.6',
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 4f50d9d..6ad42d9 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -97,11 +97,11 @@
   tf_http_archive(
       name = "eigen_archive",
       urls = [
-          "https://mirror.bazel.build/bitbucket.org/eigen/eigen/get/c2947c341c68.tar.gz",
-          "https://bitbucket.org/eigen/eigen/get/c2947c341c68.tar.gz",
+          "https://mirror.bazel.build/bitbucket.org/eigen/eigen/get/034b6c3e1017.tar.gz",
+          "https://bitbucket.org/eigen/eigen/get/034b6c3e1017.tar.gz",
       ],
-      sha256 = "f21f8ab8a8dbcb91cd0deeade19a043f47708d0da7a4000164cdf203b4a71e34",
-      strip_prefix = "eigen-eigen-c2947c341c68",
+      sha256 = "0a8ac1e83ef9c26c0e362bd7968650b710ce54e2d883f0df84e5e45a3abe842a",
+      strip_prefix = "eigen-eigen-034b6c3e1017",
       build_file = str(Label("//third_party:eigen.BUILD")),
   )
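Pinned archives are verified by checksum, so bumping the Eigen commit means updating both the URL and the `sha256` together. The hash can be reproduced by hand; a sketch using the URL from this hunk:

```python
import hashlib
import urllib.request

URL = "https://bitbucket.org/eigen/eigen/get/034b6c3e1017.tar.gz"
EXPECTED = "0a8ac1e83ef9c26c0e362bd7968650b710ce54e2d883f0df84e5e45a3abe842a"

data = urllib.request.urlopen(URL).read()
digest = hashlib.sha256(data).hexdigest()
# tf_http_archive refuses the download when the digest does not match.
assert digest == EXPECTED, "unexpected digest: %s" % digest
```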
 
diff --git a/third_party/boringssl/add_boringssl_s390x.patch b/third_party/boringssl/add_boringssl_s390x.patch
deleted file mode 100644
index 8b42d10..0000000
--- a/third_party/boringssl/add_boringssl_s390x.patch
+++ /dev/null
@@ -1,133 +0,0 @@
-diff --git a/src/include/openssl/base.h b/src/include/openssl/base.h
-index 7a3adfb..88012ad 100644
---- a/src/include/openssl/base.h
-+++ b/src/include/openssl/base.h
-@@ -94,6 +94,8 @@ extern "C" {
- #define OPENSSL_PNACL
- #elif defined(__myriad2__)
- #define OPENSSL_32_BIT
-+#elif defined(__s390x__)
-+#define OPENSSL_64_BIT
- #else
- #error "Unknown target CPU"
- #endif
-diff --git a/BUILD b/BUILD
-index 6b645e61..c90b7beb 100644
---- a/BUILD
-+++ b/BUILD
-@@ -40,29 +40,46 @@ config_setting(
-     values = {"cpu": "darwin"},
- )
- 
--boringssl_copts = [
--    # Assembler option --noexecstack adds .note.GNU-stack to each object to
--    # ensure that binaries can be built with non-executable stack.
--    "-Wa,--noexecstack",
--
--    # This is needed on Linux systems (at least) to get rwlock in pthread.
--    "-D_XOPEN_SOURCE=700",
--
--    # This list of warnings should match those in the top-level CMakeLists.txt.
--    "-Wall",
--    "-Werror",
--    "-Wformat=2",
--    "-Wsign-compare",
--    "-Wmissing-field-initializers",
--    "-Wwrite-strings",
--    "-Wshadow",
--    "-fno-common",
--
--    # Modern build environments should be able to set this to use atomic
--    # operations for reference counting rather than locks. However, it's
--    # known not to work on some Android builds.
--    # "-DOPENSSL_C11_ATOMIC",
--] + select({
-+config_setting(
-+    name = "windows",
-+    values = {"cpu": "x64_windows"},
-+    visibility = ["//visibility:public"],
-+)
-+
-+config_setting(
-+    name = "windows_msvc",
-+    values = {"cpu": "x64_windows_msvc"},
-+    visibility = ["//visibility:public"],
-+)
-+
-+boringssl_copts = select({
-+    ":windows": [
-+        "-DWIN32_LEAN_AND_MEAN",
-+    ],
-+    "//conditions:default": [
-+        # Assembler option --noexecstack adds .note.GNU-stack to each object to
-+        # ensure that binaries can be built with non-executable stack.
-+        "-Wa,--noexecstack",
-+
-+        # This is needed on Linux systems (at least) to get rwlock in pthread.
-+        "-D_XOPEN_SOURCE=700",
-+
-+        # This list of warnings should match those in the top-level CMakeLists.txt.
-+        "-Wall",
-+        "-Werror",
-+        "-Wformat=2",
-+        "-Wsign-compare",
-+        "-Wmissing-field-initializers",
-+        "-Wwrite-strings",
-+        "-Wshadow",
-+        "-fno-common",
-+
-+        # Modern build environments should be able to set this to use atomic
-+        # operations for reference counting rather than locks. However, it's
-+        # known not to work on some Android builds.
-+        # "-DOPENSSL_C11_ATOMIC",
-+    ],
-+}) + select({
-     ":linux_x86_64": [],
-     ":mac_x86_64": [],
-     "//conditions:default": ["-DOPENSSL_NO_ASM"],
-@@ -75,18 +92,26 @@ crypto_sources_asm = select({
- })
- 
- # For C targets only (not C++), compile with C11 support.
--boringssl_copts_c11 = boringssl_copts + [
--    "-std=c11",
--    "-Wmissing-prototypes",
--    "-Wold-style-definition",
--    "-Wstrict-prototypes",
--]
-+boringssl_copts_c11 = boringssl_copts + select({
-+    ":windows": [],
-+    ":windows_msvc": [],
-+    "//conditions:default": [
-+        "-std=c11",
-+        "-Wmissing-prototypes",
-+        "-Wold-style-definition",
-+        "-Wstrict-prototypes",
-+    ],
-+})
- 
- # For C targets only (not C++), compile with C11 support.
--boringssl_copts_cxx = boringssl_copts + [
--    "-std=c++11",
--    "-Wmissing-declarations",
--]
-+boringssl_copts_cxx = boringssl_copts + select({
-+    ":windows": [],
-+    ":windows_msvc": [],
-+    "//conditions:default": [
-+        "-std=c++11",
-+        "-Wmissing-declarations",
-+    ],
-+})
- 
- cc_library(
-     name = "crypto",
-@@ -96,6 +121,8 @@ cc_library(
-     includes = ["src/include"],
-     linkopts = select({
-         ":mac_x86_64": [],
-+        ":windows": [],
-+        ":windows_msvc": [],
-         "//conditions:default": ["-lpthread"],
-     }),
-     visibility = ["//visibility:public"],
diff --git a/third_party/git/git_configure.bzl b/third_party/git/git_configure.bzl
index bd197bf..47e2125 100644
--- a/third_party/git/git_configure.bzl
+++ b/third_party/git/git_configure.bzl
@@ -1,4 +1,31 @@
-"""Repository rule for Git autoconfiguration."""
+"""Repository rule for Git autoconfiguration.
+
+`git_configure` depends on the following environment variables:
+
+  * `PYTHON_BIN_PATH`: location of python binary.
+"""
+
+_PYTHON_BIN_PATH = "PYTHON_BIN_PATH"
+
+def _fail(msg):
+  """Output failure message when auto configuration fails."""
+  red = "\033[0;31m"
+  no_color = "\033[0m"
+  fail("%sGit Configuration Error:%s %s\n" % (red, no_color, msg))
+
+def _get_python_bin(repository_ctx):
+  """Gets the python bin path."""
+  python_bin = repository_ctx.os.environ.get(_PYTHON_BIN_PATH)
+  if python_bin != None:
+    return python_bin
+  python_bin_path = repository_ctx.which("python")
+  if python_bin_path != None:
+    return str(python_bin_path)
+  _fail("Cannot find python in PATH, please make sure " +
+        "python is installed and add its directory in PATH, or --define " +
+        "%s='/something/else'.\nPATH=%s" % (
+            _PYTHON_BIN_PATH, repository_ctx.os.environ.get("PATH", "")))
+
 
 def _git_conf_impl(repository_ctx):
   repository_ctx.template(
@@ -11,10 +38,18 @@
       Label("@org_tensorflow//tensorflow/tools/git:gen_git_source.py"))
   generated_files_path = repository_ctx.path("gen")
 
-  repository_ctx.execute([
+  result = repository_ctx.execute([
+      _get_python_bin(repository_ctx),
       python_script_path, "--configure", tensorflow_root_path,
       "--gen_root_path", generated_files_path], quiet=False)
 
+  if result.return_code != 0:
+    _fail(result.stderr)
+
+
 git_configure = repository_rule(
     implementation = _git_conf_impl,
+    environ = [
+        _PYTHON_BIN_PATH,
+    ],
 )
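The resolution order the rule now follows (explicit `PYTHON_BIN_PATH`, then a `PATH` lookup, then a hard failure) is a common autoconfiguration pattern; roughly, in plain Python:

```python
import os
import shutil

def get_python_bin():
    """Resolve python: env var first, then PATH, else fail loudly."""
    python_bin = os.environ.get("PYTHON_BIN_PATH")
    if python_bin:
        return python_bin
    found = shutil.which("python")  # analogous to repository_ctx.which
    if found:
        return found
    raise RuntimeError("Cannot find python; set PYTHON_BIN_PATH or fix PATH.")

print(get_python_bin())
```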
diff --git a/third_party/nanopb.BUILD b/third_party/nanopb.BUILD
deleted file mode 100644
index d218669..0000000
--- a/third_party/nanopb.BUILD
+++ /dev/null
@@ -1,23 +0,0 @@
-# Description:
-#   Nanopb, a tiny ANSI C protobuf implementation for use on embedded devices.
-
-licenses(["notice"])  # zlib license
-
-exports_files(["LICENSE.txt"])
-
-cc_library(
-    name = "nanopb",
-    srcs = [
-        "pb_common.c",
-        "pb_decode.c",
-        "pb_encode.c",
-    ],
-    hdrs = [
-        "pb.h",
-        "pb_common.h",
-        "pb_decode.h",
-        "pb_encode.h",
-    ],
-    includes = ["."],
-    visibility = ["//visibility:public"],
-)
diff --git a/third_party/repo.bzl b/third_party/repo.bzl
index c29fef9..11e9c84 100644
--- a/third_party/repo.bzl
+++ b/third_party/repo.bzl
@@ -22,6 +22,14 @@
 def _is_windows(ctx):
   return ctx.os.name.lower().find("windows") != -1
 
+def _wrap_bash_cmd(ctx, cmd):
+  if _is_windows(ctx):
+    bazel_sh = _get_env_var(ctx, "BAZEL_SH")
+    if not bazel_sh:
+      fail("BAZEL_SH environment variable is not set")
+    cmd = [bazel_sh, "-c", " ".join(cmd)]
+  return cmd
+
 def _get_env_var(ctx, name):
   if name in ctx.os.environ:
     return ctx.os.environ[name]
@@ -46,12 +54,8 @@
   # Don't check patch on Windows, because patch is only available under bash.
   if not _is_windows(ctx) and not ctx.which("patch"):
     fail("patch command is not found, please install it")
-  cmd = ["patch", "-p1", "-d", ctx.path("."), "-i", ctx.path(patch_file)]
-  if _is_windows(ctx):
-    bazel_sh = _get_env_var(ctx, "BAZEL_SH")
-    if not bazel_sh:
-      fail("BAZEL_SH environment variable is not set")
-    cmd = [bazel_sh, "-c", " ".join(cmd)]
+  cmd = _wrap_bash_cmd(
+    ctx, ["patch", "-p1", "-d", ctx.path("."), "-i", ctx.path(patch_file)])
   _execute_and_check_ret_code(ctx, cmd)
 
 def _apply_delete(ctx, paths):
@@ -60,8 +64,8 @@
       fail("refusing to rm -rf path starting with '/': " + path)
     if ".." in path:
       fail("refusing to rm -rf path containing '..': " + path)
-  _execute_and_check_ret_code(
-      ctx, ["rm", "-rf"] + [ctx.path(path) for path in paths])
+  cmd = _wrap_bash_cmd(ctx, ["rm", "-rf"] + [ctx.path(path) for path in paths])
+  _execute_and_check_ret_code(ctx, cmd)
 
 def _tf_http_archive(ctx):
   if ("mirror.bazel.build" not in ctx.attr.urls[0] or