Merge pull request #21486 from weidankong:elastic_average

PiperOrigin-RevId: 209198285
diff --git a/README.md b/README.md
index 82de010..669ff5b 100644
--- a/README.md
+++ b/README.md
@@ -22,6 +22,8 @@
 networks research.  The system is general enough to be applicable in a wide
 variety of other domains, as well.
 
+TensorFlow provides stable Python and C APIs, as well as APIs without backwards compatibility guarantees for other languages such as C++, Go, Java, JavaScript and Swift.
+
 Keep up to date with release announcements and security updates by
 subscribing to
 [announce@tensorflow.org](https://groups.google.com/a/tensorflow.org/forum/#!forum/announce).
@@ -81,13 +83,13 @@
 
 | Build Type      | Status | Artifacts |
 | ---             | ---    | ---       |
-| **Linux CPU**   | ![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-cc.png) | [pypi](https://pypi.org/project/tf-nightly/) |
-| **Linux GPU**   | ![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-gpu-py3.png) | [pypi](https://pypi.org/project/tf-nightly-gpu/) |
-| **Linux XLA**   | ![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-xla.png) | TBA |
-| **MacOS**       | ![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/macos-py2-cc.png) | [pypi](https://pypi.org/project/tf-nightly/) |
-| **Windows CPU** | ![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-cpu.png) | [pypi](https://pypi.org/project/tf-nightly/) |
-| **Windows GPU** | ![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-gpu.png) | [pypi](https://pypi.org/project/tf-nightly-gpu/) |
-| **Android**     | ![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/android.png) | [![Download](https://api.bintray.com/packages/google/tensorflow/tensorflow/images/download.svg)](https://bintray.com/google/tensorflow/tensorflow/_latestVersion) |
+| **Linux CPU**   | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-cc.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-cc.html) | [pypi](https://pypi.org/project/tf-nightly/) |
+| **Linux GPU**   | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-gpu-py3.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-gpu-py3.html) | [pypi](https://pypi.org/project/tf-nightly-gpu/) |
+| **Linux XLA**   | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-xla.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-xla.html) | TBA |
+| **MacOS**       | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/macos-py2-cc.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/macos-py2-cc.html) | [pypi](https://pypi.org/project/tf-nightly/) |
+| **Windows CPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-cpu.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-cpu.html) | [pypi](https://pypi.org/project/tf-nightly/) |
+| **Windows GPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-gpu.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-gpu.html) | [pypi](https://pypi.org/project/tf-nightly-gpu/) |
+| **Android**     | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/android.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/android.html) | [![Download](https://api.bintray.com/packages/google/tensorflow/tensorflow/images/download.svg)](https://bintray.com/google/tensorflow/tensorflow/_latestVersion) |
 
 
 ### Community Supported Builds
@@ -97,17 +99,20 @@
 | **IBM s390x**       | [![Build Status](http://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_CI/badge/icon)](http://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_CI/) | TBA |
 | **IBM ppc64le CPU** | [![Build Status](http://powerci.osuosl.org/job/TensorFlow_Ubuntu_16.04_CPU/badge/icon)](http://powerci.osuosl.org/job/TensorFlow_Ubuntu_16.04_CPU/) | TBA |
 | **IBM ppc64le GPU** | [![Build Status](http://powerci.osuosl.org/job/TensorFlow_Ubuntu_16.04_PPC64LE_GPU/badge/icon)](http://powerci.osuosl.org/job/TensorFlow_Ubuntu_16.04_PPC64LE_GPU/) | TBA |
-| **Linux CPU with Intel® MKL-DNN®** | [![Build Status](https://tensorflow-ci.intel.com/job/tensorflow-mkl-linux-cpu/badge/icon)](https://tensorflow-ci.intel.com/job/tensorflow-mkl-linux-cpu/) | TBA |
+| **Linux CPU with Intel® MKL-DNN** Nightly | [![Build Status](https://tensorflow-ci.intel.com/job/tensorflow-mkl-linux-cpu/badge/icon)](https://tensorflow-ci.intel.com/job/tensorflow-mkl-linux-cpu/) | [Nightly](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-whl-nightly/) |
+| **Linux CPU with Intel® MKL-DNN** Python 2.7<br> **Linux CPU with Intel® MKL-DNN** Python 3.5<br>  **Linux CPU with Intel® MKL-DNN** Python 3.6| ![Build Status](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-release-whl/badge/icon)|[1.9.0 py2.7](https://storage.googleapis.com/intel-optimized-tensorflow/tensorflow-1.9.0-cp27-cp27mu-linux_x86_64.whl)<br>[1.9.0 py3.5](https://storage.googleapis.com/intel-optimized-tensorflow/tensorflow-1.9.0-cp35-cp35m-linux_x86_64.whl)<br>[1.9.0 py3.6](https://storage.googleapis.com/intel-optimized-tensorflow/tensorflow-1.9.0-cp36-cp36m-linux_x86_64.whl) |
 
 
 ## For more information
-
+* [TensorFlow Blog](https://medium.com/tensorflow)
+* [TensorFlow Course at Stanford](https://web.stanford.edu/class/cs20si)
+* [TensorFlow Model Zoo](https://github.com/tensorflow/models)
+* [TensorFlow MOOC on Udacity](https://www.udacity.com/course/deep-learning--ud730)
+* [TensorFlow Roadmap](https://www.tensorflow.org/community/roadmap)
+* [TensorFlow Twitter](https://twitter.com/tensorflow)
 * [TensorFlow Website](https://www.tensorflow.org)
 * [TensorFlow White Papers](https://www.tensorflow.org/about/bib)
 * [TensorFlow YouTube Channel](https://www.youtube.com/channel/UC0rqucBdTuFTjJiefW5t-IQ)
-* [TensorFlow Model Zoo](https://github.com/tensorflow/models)
-* [TensorFlow MOOC on Udacity](https://www.udacity.com/course/deep-learning--ud730)
-* [TensorFlow Course at Stanford](https://web.stanford.edu/class/cs20si)
 
 Learn more about the TensorFlow community at the [community page of tensorflow.org](https://www.tensorflow.org/community) for a few ways to participate.
 
diff --git a/RELEASE.md b/RELEASE.md
index 078aafd..763ef3b 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -3,7 +3,7 @@
 ## Major Features And Improvements
 
 * The `tf.lite` runtime now supports `complex64`.
-* Initial Bigtable integration for `tf.data`.
+* Initial [Google Cloud Bigtable integration](https://github.com/tensorflow/tensorflow/tree/r1.10/tensorflow/contrib/bigtable) for `tf.data`.
 * Improved local run behavior in `tf.estimator.train_and_evaluate` which does not reload checkpoints for evaluation.
 * `RunConfig` now sets device_filters to restrict how workers and PS can communicate. This can speed up training and ensure clean shutdowns in some situations. But if you have jobs that require communication between workers, you will have to set custom session_options in your `RunConfig`.
 * Moved Distributions and Bijectors from `tf.contrib.distributions` to [Tensorflow Probability (TFP)](https://github.com/tensorflow/probability). `tf.contrib.distributions` is now deprecated and will be removed by the end of 2018.
@@ -19,7 +19,7 @@
 * `tf.data`:
   * `tf.contrib.data.group_by_reducer()` is now available via the public API.
   * `tf.contrib.data.choose_from_datasets()` is now available via the public API.
-  * Adding `drop_remainder` argument to `tf.data.Dataset.batch()` and `tf.data.Dataset.padded_batch()`, deprecating tf.contrib.data.batch_and_drop_remainder()` and `tf.contrib.data.padded_batch_and_drop_remainder()`.
+  * Adding `drop_remainder` argument to `tf.data.Dataset.batch()` and `tf.data.Dataset.padded_batch()`, deprecating `tf.contrib.data.batch_and_drop_remainder()` and `tf.contrib.data.padded_batch_and_drop_remainder()`.
 * `tf.estimator`:
   * `Estimator`s now use custom savers included in `EstimatorSpec` scaffolds for saving SavedModels during export.
   * `EstimatorSpec` will now add a default prediction output for export if no `export_output` is provided, eliminating the need to explicitly include a `PredictOutput` object in the `model_fn` for simple use-cases.
diff --git a/configure.py b/configure.py
index f97bf8a..79293d1 100644
--- a/configure.py
+++ b/configure.py
@@ -839,15 +839,16 @@
       cuda_toolkit_path = cygpath(cuda_toolkit_path)
 
     if is_windows():
-      cuda_rt_lib_path = 'lib/x64/cudart.lib'
+      cuda_rt_lib_paths = ['lib/x64/cudart.lib']
     elif is_linux():
-      cuda_rt_lib_path = 'lib64/libcudart.so.%s' % tf_cuda_version
+      cuda_rt_lib_paths = ['%s/libcudart.so.%s' % (x, tf_cuda_version)
+                           for x in ['lib64', 'lib/x86_64-linux-gnu']]
     elif is_macos():
-      cuda_rt_lib_path = 'lib/libcudart.%s.dylib' % tf_cuda_version
+      cuda_rt_lib_paths = ['lib/libcudart.%s.dylib' % tf_cuda_version]
 
-    cuda_toolkit_path_full = os.path.join(cuda_toolkit_path, cuda_rt_lib_path)
-    if os.path.exists(cuda_toolkit_path_full):
-      break
+    cuda_toolkit_paths_full = [os.path.join(cuda_toolkit_path, x) for x in cuda_rt_lib_paths]
+    if any([os.path.exists(x) for x in cuda_toolkit_paths_full]):
+      break
 
     # Reset and retry
     print('Invalid path to CUDA %s toolkit. %s cannot be found' %
@@ -1398,10 +1399,6 @@
   write_to_bazelrc('build --define grpc_no_ares=true')
 
 
-def set_build_strip_flag():
-  write_to_bazelrc('build --strip=always')
-
-
 def set_windows_build_flags(environ_cp):
   """Set Windows specific build options."""
   # The non-monolithic build is not supported yet
@@ -1558,7 +1555,6 @@
 
   set_grpc_build_flags()
   set_cc_opt_flags(environ_cp)
-  set_build_strip_flag()
   if is_windows():
     set_windows_build_flags(environ_cp)
 
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index e13a5cf..94e059b 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -124,12 +124,6 @@
 )
 
 config_setting(
-    name = "windows_msvc",
-    values = {"cpu": "x64_windows_msvc"},
-    visibility = ["//visibility:public"],
-)
-
-config_setting(
     name = "no_tensorflow_py_deps",
     define_values = {"no_tensorflow_py_deps": "true"},
     visibility = ["//visibility:public"],
@@ -387,6 +381,7 @@
     define_values = {
         "dynamic_loaded_kernels": "true",
     },
+    visibility = ["//visibility:public"],
 )
 
 config_setting(
@@ -429,12 +424,12 @@
 
 load(
     "//third_party/mkl:build_defs.bzl",
-    "if_mkl",
+    "if_mkl_ml",
 )
 
 filegroup(
     name = "intel_binary_blob",
-    data = if_mkl(
+    data = if_mkl_ml(
         [
             "//third_party/mkl:intel_binary_blob",
         ],
@@ -487,7 +482,6 @@
     linkopts = select({
         "//tensorflow:darwin": [],
         "//tensorflow:windows": [],
-        "//tensorflow:windows_msvc": [],
         "//conditions:default": [
             "-Wl,--version-script",  #  This line must be directly followed by the version_script.lds file
             "$(location //tensorflow:tf_framework_version_script.lds)",
@@ -529,7 +523,6 @@
             "-Wl,-install_name,@rpath/libtensorflow.so",
         ],
         "//tensorflow:windows": [],
-        "//tensorflow:windows_msvc": [],
         "//conditions:default": [
             "-z defs",
             "-Wl,--version-script",  #  This line must be directly followed by the version_script.lds file
@@ -554,7 +547,6 @@
             "$(location //tensorflow:tf_exported_symbols.lds)",
         ],
         "//tensorflow:windows": [],
-        "//tensorflow:windows_msvc": [],
         "//conditions:default": [
             "-z defs",
             "-Wl,--version-script",  #  This line must be directly followed by the version_script.lds file
@@ -584,6 +576,7 @@
 gen_api_init_files(
     name = "tensorflow_python_api_gen",
     srcs = ["api_template.__init__.py"],
+    api_version = 1,
     root_init_template = "api_template.__init__.py",
 )
 
diff --git a/tensorflow/c/c_api_function_test.cc b/tensorflow/c/c_api_function_test.cc
index bb9433c..73fe737 100644
--- a/tensorflow/c/c_api_function_test.cc
+++ b/tensorflow/c/c_api_function_test.cc
@@ -1619,5 +1619,66 @@
   TF_DeleteFunction(func1);
 }
 
+// This test only works when the TF build includes XLA compiler. One way to set
+// this up is via bazel build option "--define with_xla_support=true".
+//
+// FIXME: generalize the macro name TENSORFLOW_EAGER_USE_XLA to
+// something like TENSORFLOW_CAPI_USE_XLA.
+#ifdef TENSORFLOW_EAGER_USE_XLA
+TEST_F(CApiFunctionTest, StatelessIf_XLA) {
+  TF_Function* func;
+  const std::string funcName = "BranchFunc";
+  DefineFunction(funcName.c_str(), &func);
+  TF_GraphCopyFunction(host_graph_, func, nullptr, s_);
+  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+
+  TF_Operation* feed = Placeholder(host_graph_, s_);
+  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+
+  TF_Operation* true_cond = ScalarConst(true, host_graph_, s_);
+  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+
+  TF_OperationDescription* desc =
+      TF_NewOperation(host_graph_, "StatelessIf", "IfNode");
+  TF_AddInput(desc, {true_cond, 0});
+  TF_Output inputs[] = {{feed, 0}};
+  TF_AddInputList(desc, inputs, TF_ARRAYSIZE(inputs));
+  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+  TF_SetAttrType(desc, "Tcond", TF_BOOL);
+  TF_DataType inputType = TF_INT32;
+  TF_SetAttrTypeList(desc, "Tin", &inputType, 1);
+  TF_SetAttrTypeList(desc, "Tout", &inputType, 1);
+  TF_SetAttrFuncName(desc, "then_branch", funcName.data(), funcName.size());
+  TF_SetAttrFuncName(desc, "else_branch", funcName.data(), funcName.size());
+  TF_SetDevice(desc, "/device:XLA_CPU:0");
+  auto op = TF_FinishOperation(desc, s_);
+  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+  ASSERT_NE(op, nullptr);
+
+  // Create a session for this graph.
+  CSession csession(host_graph_, s_, /*use_XLA*/ true);
+  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+
+  // Run the graph.
+  csession.SetInputs({{feed, Int32Tensor(17)}});
+  csession.SetOutputs({op});
+  csession.Run(s_);
+  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+  TF_Tensor* out = csession.output_tensor(0);
+  ASSERT_TRUE(out != nullptr);
+  EXPECT_EQ(TF_INT32, TF_TensorType(out));
+  EXPECT_EQ(0, TF_NumDims(out));  // scalar
+  ASSERT_EQ(sizeof(int32), TF_TensorByteSize(out));
+  int32* output_contents = static_cast<int32*>(TF_TensorData(out));
+  EXPECT_EQ(-17, *output_contents);
+
+  // Clean up
+  csession.CloseAndDelete(s_);
+  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+
+  TF_DeleteFunction(func);
+}
+#endif  // TENSORFLOW_EAGER_USE_XLA
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/c/c_test_util.cc b/tensorflow/c/c_test_util.cc
index 24eb6c0..f15d9ee 100644
--- a/tensorflow/c/c_test_util.cc
+++ b/tensorflow/c/c_test_util.cc
@@ -26,6 +26,10 @@
 using tensorflow::GraphDef;
 using tensorflow::NodeDef;
 
+static void BoolDeallocator(void* data, size_t, void* arg) {
+  delete[] static_cast<bool*>(data);
+}
+
 static void Int32Deallocator(void* data, size_t, void* arg) {
   delete[] static_cast<int32_t*>(data);
 }
@@ -38,6 +42,14 @@
   delete[] static_cast<float*>(data);
 }
 
+TF_Tensor* BoolTensor(bool v) {
+  const int num_bytes = sizeof(bool);
+  bool* values = new bool[1];
+  values[0] = v;
+  return TF_NewTensor(TF_BOOL, nullptr, 0, values, num_bytes, &BoolDeallocator,
+                      nullptr);
+}
+
 TF_Tensor* Int8Tensor(const int64_t* dims, int num_dims, const char* values) {
   int64_t num_values = 1;
   for (int i = 0; i < num_dims; ++i) {
@@ -131,6 +143,12 @@
   return op;
 }
 
+TF_Operation* ScalarConst(bool v, TF_Graph* graph, TF_Status* s,
+                          const char* name) {
+  unique_tensor_ptr tensor(BoolTensor(v), TF_DeleteTensor);
+  return Const(tensor.get(), graph, s, name);
+}
+
 TF_Operation* ScalarConst(int32_t v, TF_Graph* graph, TF_Status* s,
                           const char* name) {
   unique_tensor_ptr tensor(Int32Tensor(v), TF_DeleteTensor);
diff --git a/tensorflow/c/c_test_util.h b/tensorflow/c/c_test_util.h
index 38313d6..7eeb1ee 100644
--- a/tensorflow/c/c_test_util.h
+++ b/tensorflow/c/c_test_util.h
@@ -31,6 +31,8 @@
 typedef std::unique_ptr<TF_Tensor, decltype(&TF_DeleteTensor)>
     unique_tensor_ptr;
 
+TF_Tensor* BoolTensor(bool v);
+
 // Create a tensor with values of type TF_INT8 provided by `values`.
 TF_Tensor* Int8Tensor(const int64_t* dims, int num_dims, const char* values);
 
@@ -55,6 +57,9 @@
 TF_Operation* Const(TF_Tensor* t, TF_Graph* graph, TF_Status* s,
                     const char* name = "const");
 
+TF_Operation* ScalarConst(bool v, TF_Graph* graph, TF_Status* s,
+                          const char* name = "scalar");
+
 TF_Operation* ScalarConst(int32_t v, TF_Graph* graph, TF_Status* s,
                           const char* name = "scalar");
 
diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc
index a0a4444..dfb1c9a 100644
--- a/tensorflow/c/eager/c_api.cc
+++ b/tensorflow/c/eager/c_api.cc
@@ -110,7 +110,7 @@
 
 tensorflow::Status CreateRemoteContexts(
     const std::vector<string>& remote_workers, int64 rendezvous_id,
-    const tensorflow::ServerDef& server_def,
+    int keep_alive_secs, const tensorflow::ServerDef& server_def,
     tensorflow::eager::EagerClientCache* remote_eager_workers, bool async,
     tensorflow::gtl::FlatMap<string, tensorflow::uint64>* remote_contexts) {
   for (int i = 0; i < remote_workers.size(); i++) {
@@ -129,6 +129,7 @@
     request.mutable_server_def()->set_job_name(parsed_name.job);
     request.mutable_server_def()->set_task_index(parsed_name.task);
     request.set_async(async);
+    request.set_keep_alive_secs(keep_alive_secs);
     auto* eager_client = remote_eager_workers->GetClient(remote_worker);
     if (eager_client == nullptr) {
       return tensorflow::errors::Internal(
@@ -151,7 +152,8 @@
 }
 
 tensorflow::Status UpdateTFE_ContextWithServerDef(
-    const tensorflow::ServerDef& server_def, TFE_Context* ctx) {
+    int keep_alive_secs, const tensorflow::ServerDef& server_def,
+    TFE_Context* ctx) {
   // We don't use the TF_RETURN_IF_ERROR macro directly since that destroys the
   // server object (which currently CHECK-fails) and we miss the error, instead,
   // we log the error, and then return to allow the user to see the error
@@ -202,8 +204,8 @@
   // Initialize remote eager workers.
   tensorflow::gtl::FlatMap<string, tensorflow::uint64> remote_contexts;
   LOG_AND_RETURN_IF_ERROR(CreateRemoteContexts(
-      remote_workers, rendezvous_id, server_def, remote_eager_workers.get(),
-      ctx->context.Async(), &remote_contexts));
+      remote_workers, rendezvous_id, keep_alive_secs, server_def,
+      remote_eager_workers.get(), ctx->context.Async(), &remote_contexts));
 
   tensorflow::RemoteRendezvous* r =
       grpc_server->worker_env()->rendezvous_mgr->Find(rendezvous_id);
@@ -222,9 +224,10 @@
 
   auto* device_mgr = grpc_server->worker_env()->device_mgr;
 
-  ctx->context.InitializeRemote(
-      std::move(server), std::move(remote_eager_workers),
-      std::move(remote_device_mgr), remote_contexts, r, device_mgr);
+  ctx->context.InitializeRemote(std::move(server),
+                                std::move(remote_eager_workers),
+                                std::move(remote_device_mgr), remote_contexts,
+                                r, device_mgr, keep_alive_secs);
 
   return tensorflow::Status::OK();
 #undef LOG_AND_RETURN_IF_ERROR
@@ -288,6 +291,7 @@
 
 // Set server_def on the context, possibly updating it.
 TF_CAPI_EXPORT extern void TFE_ContextSetServerDef(TFE_Context* ctx,
+                                                   int keep_alive_secs,
                                                    const void* proto,
                                                    size_t proto_len,
                                                    TF_Status* status) {
@@ -297,7 +301,8 @@
         "Invalid tensorflow.ServerDef protocol buffer");
     return;
   }
-  status->status = UpdateTFE_ContextWithServerDef(server_def, ctx);
+  status->status =
+      UpdateTFE_ContextWithServerDef(keep_alive_secs, server_def, ctx);
 }
 
 void TFE_ContextSetThreadLocalDevicePlacementPolicy(
@@ -719,6 +724,10 @@
 }
 }  // namespace
 
+void TFE_ContextStartStep(TFE_Context* ctx) { ctx->context.StartStep(); }
+
+void TFE_ContextEndStep(TFE_Context* ctx) { ctx->context.EndStep(); }
+
 namespace tensorflow {
 void SetOpAttrValueScalar(TFE_Context* ctx, TFE_Op* op,
                           const tensorflow::AttrValue& default_value,
diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h
index 25cf7ad..a0ebc6f 100644
--- a/tensorflow/c/eager/c_api.h
+++ b/tensorflow/c/eager/c_api.h
@@ -124,6 +124,7 @@
 // If the following is set, all servers identified by the
 // ServerDef must be up when the context is created.
 TF_CAPI_EXPORT extern void TFE_ContextSetServerDef(TFE_Context* ctx,
+                                                   int keep_alive_secs,
                                                    const void* proto,
                                                    size_t proto_len,
                                                    TF_Status* status);
@@ -380,6 +381,16 @@
                                                         TF_Buffer* buf,
                                                         TF_Status* status);
 
+// Some TF ops need a step container to be set to limit the lifetime of some
+// resources (mostly TensorArray and Stack, used in while loop gradients in
+// graph mode). Calling this on a context tells it to start a step.
+TF_CAPI_EXPORT extern void TFE_ContextStartStep(TFE_Context* ctx);
+
+// Ends a step. When there is no active step (that is, every started step has
+// been ended) step containers will be cleared. Note: it is not safe to call
+// TFE_ContextEndStep while ops which rely on the step container may be running.
+TF_CAPI_EXPORT extern void TFE_ContextEndStep(TFE_Context* ctx);
+
 #ifdef __cplusplus
 } /* end extern "C" */
 #endif
diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc
index 00a0a71..7126227 100644
--- a/tensorflow/c/eager/c_api_test.cc
+++ b/tensorflow/c/eager/c_api_test.cc
@@ -151,7 +151,7 @@
   EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   TFE_DeleteContextOptions(opts);
 
-  TFE_ContextSetServerDef(ctx, serialized.data(), serialized.size(), status);
+  TFE_ContextSetServerDef(ctx, 0, serialized.data(), serialized.size(), status);
   EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
 
   TFE_TensorHandle* h0_task0 = TestMatrixTensorHandle();
@@ -239,7 +239,7 @@
   EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   TFE_DeleteContextOptions(opts);
 
-  TFE_ContextSetServerDef(ctx, serialized.data(), serialized.size(), status);
+  TFE_ContextSetServerDef(ctx, 0, serialized.data(), serialized.size(), status);
   EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
 
   TFE_TensorHandle* h0_task0 = TestMatrixTensorHandle();
@@ -371,7 +371,7 @@
   EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   TFE_DeleteContextOptions(opts);
 
-  TFE_ContextSetServerDef(ctx, serialized.data(), serialized.size(), status);
+  TFE_ContextSetServerDef(ctx, 0, serialized.data(), serialized.size(), status);
   EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
 
   const char remote_device_name[] =
@@ -397,7 +397,7 @@
   ASSERT_TRUE(s.ok()) << s.error_message();
   ASSERT_TRUE(worker_server->Start().ok());
 
-  TFE_ContextSetServerDef(ctx, serialized.data(), serialized.size(), status);
+  TFE_ContextSetServerDef(ctx, 0, serialized.data(), serialized.size(), status);
   EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
 
   // Create a new tensor_handle.
@@ -1471,4 +1471,61 @@
 }
 BENCHMARK(BM_ReadVariable);
 
+TEST(CAPI, StringAttributes) {
+  // Test that TFE_OpSetAttrString doesn't hold on to the value after it
+  // returns.
+  TF_Status* status = TF_NewStatus();
+  TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_Context* ctx = TFE_NewContext(opts, status);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_DeleteContextOptions(opts);
+
+  std::vector<int64_t> dims(4, 1);
+  TFE_Op* op = TFE_NewOp(ctx, "AvgPool", status);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+  TF_Tensor* tensor =
+      TF_AllocateTensor(TF_FLOAT, dims.data(), dims.size(), sizeof(float));
+  float tensor_data[] = {1};
+  memcpy(TF_TensorData(tensor), tensor_data, TF_TensorByteSize(tensor));
+  TFE_TensorHandle* tensor_handle = TFE_NewTensorHandle(tensor, status);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_OpAddInput(op, tensor_handle, status);
+  TF_DeleteTensor(tensor);
+  TFE_DeleteTensorHandle(tensor_handle);
+
+  std::vector<int64_t> values(4, 1);
+  TFE_OpSetAttrIntList(op, "ksize", values.data(), values.size());
+  TFE_OpSetAttrIntList(op, "strides", values.data(), values.size());
+
+  const int BUFFER_SIZE = 10;
+  char buffer[BUFFER_SIZE];
+  std::strncpy(buffer, "VALID", BUFFER_SIZE);
+  TFE_OpSetAttrString(op, "padding", buffer, std::strlen(buffer));
+  // Overwriting value in "buffer", should be fine since TFE_Op
+  // shouldn't be holding on to it.
+  std::strncpy(buffer, "NHWC", BUFFER_SIZE);
+  TFE_OpSetAttrString(op, "data_format", buffer, std::strlen(buffer));
+
+  TFE_OpSetAttrType(op, "T", TF_FLOAT);
+
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+  TFE_TensorHandle* retvals[1];
+  int num_retvals = 1;
+  TFE_Execute(op, &retvals[0], &num_retvals, status);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  ASSERT_EQ(1, num_retvals);
+
+  tensor = TFE_TensorHandleResolve(retvals[0], status);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  EXPECT_EQ(4, TF_TensorByteSize(tensor));
+  TF_DeleteTensor(tensor);
+  TFE_DeleteTensorHandle(retvals[0]);
+
+  TFE_DeleteOp(op);
+
+  TFE_DeleteContext(ctx);
+  TF_DeleteStatus(status);
+}
 }  // namespace
diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD
index 588a45e..f56521da 100644
--- a/tensorflow/cc/BUILD
+++ b/tensorflow/cc/BUILD
@@ -379,9 +379,11 @@
     srcs = ["gradients/math_grad_test.cc"],
     deps = [
         ":cc_ops",
+        ":client_session",
         ":grad_op_registry",
         ":grad_testutil",
         ":gradient_checker",
+        ":gradients",
         ":math_grad",
         ":testutil",
         "//tensorflow/core:lib_internal",
@@ -626,7 +628,6 @@
     copts = tf_copts(),
     linkopts = select({
         "//tensorflow:windows": [],
-        "//tensorflow:windows_msvc": [],
         "//tensorflow:darwin": [
             "-lm",
             "-lpthread",
diff --git a/tensorflow/cc/gradients/math_grad.cc b/tensorflow/cc/gradients/math_grad.cc
index 35a01e0..5dcf008 100644
--- a/tensorflow/cc/gradients/math_grad.cc
+++ b/tensorflow/cc/gradients/math_grad.cc
@@ -441,6 +441,22 @@
 }
 REGISTER_GRADIENT_OP("RealDiv", RealDivGrad);
 
+Status UnsafeDivGrad(const Scope& scope, const Operation& op,
+                     const std::vector<Output>& grad_inputs,
+                     std::vector<Output>* grad_outputs) {
+  auto x_1 = ConjugateHelper(scope, op.input(0));
+  auto x_2 = ConjugateHelper(scope, op.input(1));
+  // y = x_1 / x_2
+  // dy/dx_1 = 1/x_2
+  // dy/dx_2 = -x_1/x_2^2
+  auto gx_1 = UnsafeDiv(scope, grad_inputs[0], x_2);
+  auto gx_2 =
+      Mul(scope, grad_inputs[0],
+          UnsafeDiv(scope, UnsafeDiv(scope, Neg(scope, x_1), x_2), x_2));
+  return BinaryGradCommon(scope, op, grad_outputs, gx_1, gx_2);
+}
+REGISTER_GRADIENT_OP("UnsafeDiv", UnsafeDivGrad);
+
 Status SquaredDifferenceGrad(const Scope& scope, const Operation& op,
                              const std::vector<Output>& grad_inputs,
                              std::vector<Output>* grad_outputs) {
@@ -1007,6 +1023,26 @@
 }
 REGISTER_GRADIENT_OP("Prod", ProdGrad);
 
+Status SegmentSumGrad(const Scope& scope, const Operation& op,
+                      const std::vector<Output>& grad_inputs,
+                      std::vector<Output>* grad_outputs) {
+  // The SegmentSum operation sums segments of the Tensor that have the same
+  // index in the segment_ids parameter.
+  // i.e z = [2, 3, 4, 5], segment_ids [0, 0, 0, 1]
+  // will produce [2 + 3 + 4, 5] = [9, 5]
+  // The gradient that will flow back to the gather operation will look like
+  // [x1, x2], it will have the same shape as the output of the SegmentSum
+  // operation. The differentiation step of the SegmentSum operation just
+  // broadcast the gradient in order to retrieve the z's shape.
+  // dy/dz = [x1, x1, x1, x2]
+  grad_outputs->push_back(Gather(scope, grad_inputs[0], op.input(1)));
+
+  // stop propagation along segment_ids
+  grad_outputs->push_back(NoGradient());
+  return scope.status();
+}
+REGISTER_GRADIENT_OP("SegmentSum", SegmentSumGrad);
+
 // MatMulGrad helper function used to compute two MatMul operations
 // based on input matrix transposition combinations.
 Status MatMulGradHelper(const Scope& scope, const bool is_batch,
diff --git a/tensorflow/cc/gradients/math_grad_test.cc b/tensorflow/cc/gradients/math_grad_test.cc
index 1c9bdff..88aef1f 100644
--- a/tensorflow/cc/gradients/math_grad_test.cc
+++ b/tensorflow/cc/gradients/math_grad_test.cc
@@ -13,8 +13,10 @@
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/cc/client/client_session.h"
 #include "tensorflow/cc/framework/grad_op_registry.h"
 #include "tensorflow/cc/framework/gradient_checker.h"
+#include "tensorflow/cc/framework/gradients.h"
 #include "tensorflow/cc/framework/testutil.h"
 #include "tensorflow/cc/gradients/grad_testutil.h"
 #include "tensorflow/cc/ops/standard_ops.h"
@@ -42,9 +44,11 @@
 using ops::Pow;
 using ops::Prod;
 using ops::RealDiv;
+using ops::SegmentSum;
 using ops::SquaredDifference;
 using ops::Sub;
 using ops::Sum;
+using ops::UnsafeDiv;
 
 // TODO(andydavis) Test gradient function against numeric gradients output.
 // TODO(andydavis) As more gradients are added move common test functions
@@ -850,6 +854,36 @@
   RunTest({x}, {x_shape}, {y}, {x_shape});
 }
 
+TEST_F(NaryGradTest, UnsafeDiv) {
+  {
+    TensorShape x_shape({3, 2, 5});
+    const auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape));
+    // Test x / (1 + |x|) rather than x_1 / x_2 to avoid triggering large
+    // division errors in the numeric estimator used by the gradient checker.
+    const auto y = UnsafeDiv(
+        scope_, x, Add(scope_, Const<float>(scope_, 1), Abs(scope_, x)));
+    RunTest({x}, {x_shape}, {y}, {x_shape});
+  }
+  {
+    // Return 0 gradient (rather than NaN) for division by zero.
+    const auto x = Placeholder(scope_, DT_FLOAT);
+    const auto zero = Const<float>(scope_, 0.0);
+    const auto y = UnsafeDiv(scope_, x, zero);
+
+    std::vector<Output> grad_outputs;
+    TF_EXPECT_OK(AddSymbolicGradients(scope_, {y}, {x}, &grad_outputs));
+    ClientSession session(scope_);
+    std::vector<Tensor> grad_result;
+    TF_EXPECT_OK(
+        session.Run({{x, {-3.0f, 0.0f, 3.0f}}}, grad_outputs, &grad_result));
+    EXPECT_EQ(grad_result.size(), 1);
+    EXPECT_EQ(grad_result[0].NumElements(), 3);
+    EXPECT_EQ(grad_result[0].flat<float>()(0), 0.0f);
+    EXPECT_EQ(grad_result[0].flat<float>()(1), 0.0f);
+    EXPECT_EQ(grad_result[0].flat<float>()(2), 0.0f);
+  }
+}
+
 TEST_F(NaryGradTest, SquaredDifference) {
   TensorShape x1_shape({3, 2, 5});
   TensorShape x2_shape({2, 5});
@@ -898,5 +932,14 @@
   RunTest({x}, {x_shape}, {y}, {y_shape});
 }
 
+TEST_F(NaryGradTest, SegmentSum) {
+  TensorShape x_shape({3, 4});
+  auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape));
+  auto y = SegmentSum(scope_, x, {0, 0, 1});
+  // The segment sum is always computed along the first dimension.
+  TensorShape y_shape({2, 4});
+  RunTest({x}, {x_shape}, {y}, {y_shape});
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/cc/saved_model/loader.cc b/tensorflow/cc/saved_model/loader.cc
index 98be66a..3830416 100644
--- a/tensorflow/cc/saved_model/loader.cc
+++ b/tensorflow/cc/saved_model/loader.cc
@@ -170,7 +170,8 @@
       variables_directory, MetaFilename(kSavedModelVariablesFilename));
   if (!Env::Default()->FileExists(variables_index_path).ok()) {
     LOG(INFO) << "The specified SavedModel has no variables; no checkpoints "
-                 "were restored.";
+                 "were restored. File does not exist: "
+              << variables_index_path;
     return Status::OK();
   }
   const string variables_path =
diff --git a/tensorflow/compiler/aot/BUILD b/tensorflow/compiler/aot/BUILD
index d2f803b..1899a32 100644
--- a/tensorflow/compiler/aot/BUILD
+++ b/tensorflow/compiler/aot/BUILD
@@ -48,6 +48,7 @@
         "//tensorflow/compiler/xla/client:compile_only_client",
         "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/service:compiler",
+        "//tensorflow/compiler/xla/service/cpu:buffer_info_util",
         "//tensorflow/compiler/xla/service/cpu:cpu_compiler",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework_internal",
diff --git a/tensorflow/compiler/aot/codegen.cc b/tensorflow/compiler/aot/codegen.cc
index 8dbe1e1..89fefda 100644
--- a/tensorflow/compiler/aot/codegen.cc
+++ b/tensorflow/compiler/aot/codegen.cc
@@ -24,6 +24,7 @@
 #include "tensorflow/compiler/tf2xla/str_util.h"
 #include "tensorflow/compiler/tf2xla/tf2xla_util.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
+#include "tensorflow/compiler/xla/service/cpu/buffer_info_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -36,6 +37,8 @@
 
 namespace {
 
+using BufferInfo = cpu_function_runtime::BufferInfo;
+
 bool IsAlpha(char c) {
   return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
 }
@@ -85,27 +88,36 @@
   return Status::OK();
 }
 
-// total_buffer_bytes returns the sum of each size in `sizes`, skipping -1
-// values.  There are `n` entries in `sizes`.
-size_t total_buffer_bytes(const intptr_t* sizes, size_t n) {
-  size_t total = 0;
-  for (size_t i = 0; i < n; ++i) {
-    if (sizes[i] != -1) {
-      total += sizes[i];
-    }
-  }
-  return total;
+// Returns the sum of the size of each buffer in `buffer_infos`.
+size_t TotalBufferBytes(const std::vector<BufferInfo>& buffer_infos) {
+  return std::accumulate(buffer_infos.begin(), buffer_infos.end(), size_t{0},
+                         [](size_t size, const BufferInfo& buffer_info) {
+                           return size + buffer_info.size();
+                         });
 }
 
-// Fills in arg_sizes with the byte size of each positional arg.
-Status ComputeArgSizes(const CompileResult& compile_result,
-                       std::vector<int64>* arg_sizes) {
-  const xla::ProgramShape& ps = compile_result.program_shape;
-  for (int i = 0; i < ps.parameters_size(); ++i) {
-    arg_sizes->push_back(xla::ShapeUtil::ByteSizeOf(
-        ps.parameters(i), compile_result.pointer_size));
-  }
-  return Status::OK();
+// Returns a vector of BufferInfo instances in `buffer_infos` that are entry
+// parameter buffers.
+std::vector<BufferInfo> ExtractEntryParamBufferInfos(
+    const std::vector<BufferInfo>& buffer_infos) {
+  std::vector<BufferInfo> result;
+  std::copy_if(buffer_infos.begin(), buffer_infos.end(),
+               std::back_inserter(result), [](const BufferInfo& buffer_info) {
+                 return buffer_info.is_entry_parameter();
+               });
+  return result;
+}
+
+// Returns a vector of BufferInfo instances in `buffer_infos` that are temp
+// buffers.
+std::vector<BufferInfo> ExtractTempBufferInfos(
+    const std::vector<BufferInfo>& buffer_infos) {
+  std::vector<BufferInfo> result;
+  std::copy_if(buffer_infos.begin(), buffer_infos.end(),
+               std::back_inserter(result), [](const BufferInfo& buffer_info) {
+                 return buffer_info.is_temp_buffer();
+               });
+  return result;
 }
 
 // Add (from,to) rewrite pairs based on the given shape.  These rewrite pairs
@@ -278,6 +290,25 @@
   return Status::OK();
 }
 
+// Returns a list of C++ expressions that, when executed, will construct the
+// BufferInfo instances in `buffer_infos`.
+std::vector<string> BufferInfosToCppExpression(
+    const std::vector<BufferInfo>& buffer_infos) {
+  std::vector<string> buffer_infos_as_strings;
+  std::transform(buffer_infos.begin(), buffer_infos.end(),
+                 std::back_inserter(buffer_infos_as_strings),
+                 [](const BufferInfo& buffer_info) {
+                   std::pair<uint64, uint64> encoded = buffer_info.Encode();
+                   string encoded_second_as_str =
+                       encoded.second == ~0ULL
+                           ? "~0ULL"
+                           : strings::StrCat(encoded.second, "ULL");
+                   return strings::StrCat(
+                       "::tensorflow::cpu_function_runtime::BufferInfo({",
+                       encoded.first, "ULL, ", encoded_second_as_str, "})");
+                 });
+  return buffer_infos_as_strings;
+}
 }  // namespace
 
 Status GenerateHeader(const CodegenOpts& opts, const tf2xla::Config& config,
@@ -286,29 +317,35 @@
   TF_RETURN_IF_ERROR(ValidateConfig(config));
   TF_RETURN_IF_ERROR(ValidateFeedFetchCppNames(config));
   const int64 result_index = compile_result.aot->result_buffer_index();
-  const xla::BufferSizes& temp_sizes = compile_result.aot->buffer_sizes();
-  if (result_index < 0 || result_index >= temp_sizes.size()) {
+  const std::vector<BufferInfo>& buffer_infos =
+      compile_result.aot->buffer_infos();
+  const std::vector<int32> arg_index_table =
+      ::xla::cpu::CreateArgIndexTableFromBufferInfos(buffer_infos);
+  std::vector<string> buffer_infos_as_strings =
+      BufferInfosToCppExpression(buffer_infos);
+  if (result_index < 0 || result_index >= buffer_infos.size()) {
     return errors::InvalidArgument("result index: ", result_index,
                                    " is outside the range of temp sizes: [0,",
-                                   temp_sizes.size(), ")");
+                                   buffer_infos.size(), ")");
   }
 
   // Compute sizes and generate methods.
-  std::vector<int64> arg_sizes;
-  TF_RETURN_IF_ERROR(ComputeArgSizes(compile_result, &arg_sizes));
+  std::vector<BufferInfo> buffer_infos_for_args =
+      ExtractEntryParamBufferInfos(buffer_infos);
+  std::vector<BufferInfo> buffer_infos_for_temps =
+      ExtractTempBufferInfos(buffer_infos);
   const xla::ProgramShape& ps = compile_result.program_shape;
   string methods_arg, methods_result;
   TF_RETURN_IF_ERROR(GenArgMethods(config, ps, compile_result, &methods_arg));
   TF_RETURN_IF_ERROR(GenResultMethods(config, ps, &methods_result));
-  const std::vector<intptr_t> iarg(arg_sizes.begin(), arg_sizes.end());
-  const std::vector<intptr_t> itemp(temp_sizes.begin(), temp_sizes.end());
-  const size_t arg_bytes_aligned =
-      cpu_function_runtime::AlignedBufferBytes(iarg.data(), iarg.size());
-  const size_t arg_bytes_total = total_buffer_bytes(iarg.data(), iarg.size());
-  const size_t temp_bytes_aligned =
-      cpu_function_runtime::AlignedBufferBytes(itemp.data(), itemp.size());
-  const size_t temp_bytes_total =
-      total_buffer_bytes(itemp.data(), itemp.size());
+  const size_t arg_bytes_aligned = cpu_function_runtime::AlignedBufferBytes(
+      buffer_infos_for_args.data(), buffer_infos_for_args.size(),
+      /*allocate_entry_params=*/true);
+  const size_t arg_bytes_total = TotalBufferBytes(buffer_infos_for_args);
+  const size_t temp_bytes_aligned = cpu_function_runtime::AlignedBufferBytes(
+      buffer_infos_for_temps.data(), buffer_infos_for_temps.size(),
+      /*allocate_entry_params=*/true);
+  const size_t temp_bytes_total = TotalBufferBytes(buffer_infos_for_temps);
 
   // Create rewrite strings for namespace start and end.
   string ns_start;
@@ -343,8 +380,8 @@
   // calling HloProfilePrinter::profile_counters_size.
   const string assign_profile_counters_size =
       opts.gen_hlo_profile_printer_data
-          ? "data->profile_counters_size = "
-            "data->hlo_profile_printer_data->profile_counters_size();"
+          ? "data->set_profile_counters_size("
+            "data->hlo_profile_printer_data()->profile_counters_size());"
           : "";
 
   // Use a poor-man's text templating mechanism; first populate the full header
@@ -414,9 +451,8 @@
   static constexpr size_t kNumArgs = {{ARG_NUM}};
 
   // Byte size of each argument buffer. There are kNumArgs entries.
-  static const intptr_t* ArgSizes() {
-    static constexpr intptr_t kArgSizes[kNumArgs] = {{{ARG_SIZES}}};
-    return kArgSizes;
+  static const ::tensorflow::int64 ArgSize(::tensorflow::int32 index) {
+    return BufferInfos()[ArgIndexToBufferIndex()[index]].size();
   }
 
   // Returns static data used to create an XlaCompiledCpuFunction.
@@ -424,17 +460,17 @@
     static XlaCompiledCpuFunction::StaticData* kStaticData = [](){
       XlaCompiledCpuFunction::StaticData* data =
         new XlaCompiledCpuFunction::StaticData;
-      data->raw_function = {{ENTRY}};
-      data->arg_sizes = ArgSizes();
-      data->num_args = kNumArgs;
-      data->temp_sizes = TempSizes();
-      data->num_temps = kNumTemps;
-      data->result_index = kResultIndex;
-      data->arg_names = StaticArgNames();
-      data->result_names = StaticResultNames();
-      data->program_shape = StaticProgramShape();
-      data->hlo_profile_printer_data = StaticHloProfilePrinterData();
-      {{ASSIGN_PROFILE_COUNTERS_SIZE}}
+      data->set_raw_function({{ENTRY}});
+      data->set_buffer_infos(BufferInfos());
+      data->set_num_buffers(kNumBuffers);
+      data->set_arg_index_table(ArgIndexToBufferIndex());
+      data->set_num_args(kNumArgs);
+      data->set_result_index(kResultIndex);
+      data->set_arg_names(StaticArgNames());
+      data->set_result_names(StaticResultNames());
+      data->set_program_shape(StaticProgramShape());
+      data->set_hlo_profile_printer_data(StaticHloProfilePrinterData());
+{{ASSIGN_PROFILE_COUNTERS_SIZE}}
       return data;
     }();
     return *kStaticData;
@@ -482,17 +518,27 @@
 {{METHODS_RESULT}}
 
  private:
-  // Number of result and temporary buffers for the compiled computation.
-  static constexpr size_t kNumTemps = {{TEMP_NUM}};
+  // Number of buffers for the compiled computation.
+  static constexpr size_t kNumBuffers = {{NUM_BUFFERS}};
+
+  static const ::tensorflow::cpu_function_runtime::BufferInfo* BufferInfos() {
+    static const ::tensorflow::cpu_function_runtime::BufferInfo
+      kBufferInfos[kNumBuffers] = {
+{{BUFFER_INFOS_AS_STRING}}
+      };
+    return kBufferInfos;
+  }
+
+  static const ::tensorflow::int32* ArgIndexToBufferIndex() {
+    static constexpr ::tensorflow::int32 kArgIndexToBufferIndex[kNumArgs] = {
+{{ARG_INDEX_TABLE}}
+    };
+    return kArgIndexToBufferIndex;
+  }
+
   // The 0-based index of the result tuple in the temporary buffers.
   static constexpr size_t kResultIndex = {{RESULT_INDEX}};
 
-  // Byte size of each result / temporary buffer. There are kNumTemps entries.
-  static const intptr_t* TempSizes() {
-    static constexpr intptr_t kTempSizes[kNumTemps] = {{{TEMP_SIZES}}};
-    return kTempSizes;
-  }
-
   // Array of names of each positional argument, terminated by nullptr.
   static const char** StaticArgNames() {{ARG_NAMES_CODE}}
 
@@ -523,8 +569,8 @@
       {"{{ARG_BYTES_ALIGNED}}", strings::StrCat(arg_bytes_aligned)},
       {"{{ARG_BYTES_TOTAL}}", strings::StrCat(arg_bytes_total)},
       {"{{ARG_NAMES_CODE}}", arg_names_code},
-      {"{{ARG_NUM}}", strings::StrCat(arg_sizes.size())},
-      {"{{ARG_SIZES}}", str_util::Join(arg_sizes, ", ")},
+      {"{{ARG_NUM}}", strings::StrCat(arg_index_table.size())},
+      {"{{ARG_INDEX_TABLE}}", str_util::Join(arg_index_table, ", ")},
       {"{{ASSIGN_PROFILE_COUNTERS_SIZE}}", assign_profile_counters_size},
       {"{{CLASS}}", opts.class_name},
       {"{{DECLS_FROM_OBJ_FILE}}",
@@ -546,8 +592,9 @@
       {"{{RESULT_NAMES_CODE}}", result_names_code},
       {"{{TEMP_BYTES_ALIGNED}}", strings::StrCat(temp_bytes_aligned)},
       {"{{TEMP_BYTES_TOTAL}}", strings::StrCat(temp_bytes_total)},
-      {"{{TEMP_NUM}}", strings::StrCat(temp_sizes.size())},
-      {"{{TEMP_SIZES}}", str_util::Join(temp_sizes, ", ")}};
+      {"{{NUM_BUFFERS}}", strings::StrCat(buffer_infos.size())},
+      {"{{BUFFER_INFOS_AS_STRING}}",
+       str_util::Join(buffer_infos_as_strings, ",\n")}};
   str_util::ReplaceAllPairs(header, rewrites);
   return Status::OK();
 }
diff --git a/tensorflow/compiler/aot/codegen_test.cc b/tensorflow/compiler/aot/codegen_test.cc
index 29bc9c1..60d59ae 100644
--- a/tensorflow/compiler/aot/codegen_test.cc
+++ b/tensorflow/compiler/aot/codegen_test.cc
@@ -32,6 +32,8 @@
 namespace tfcompile {
 namespace {
 
+using ::tensorflow::cpu_function_runtime::BufferInfo;
+
 void ExpectErrorContains(const Status& status, StringPiece str) {
   EXPECT_NE(Status::OK(), status);
   EXPECT_TRUE(str_util::StrContains(status.error_message(), str))
@@ -171,8 +173,14 @@
   fetch->mutable_id()->set_node_name("fetch0");
   fetch->set_name("myfetch");
   CompileResult compile_result;
-  compile_result.aot.reset(
-      new xla::cpu::CpuAotCompilationResult({}, {1, -1, 2, -1, 3, 120}, 5, {}));
+  compile_result.aot.reset(new xla::cpu::CpuAotCompilationResult(
+      {},
+      {BufferInfo::MakeTempBuffer(1),
+       BufferInfo::MakeEntryParameter(/*size=*/8, /*param_number=*/0),
+       BufferInfo::MakeTempBuffer(2),
+       BufferInfo::MakeEntryParameter(/*size=*/96, /*param_number=*/1),
+       BufferInfo::MakeTempBuffer(3), BufferInfo::MakeTempBuffer(120)},
+      5, {}));
   compile_result.program_shape = xla::ShapeUtil::MakeProgramShape(
       {
           xla::ShapeUtil::MakeShape(xla::F32, {1, 2}),
diff --git a/tensorflow/compiler/aot/codegen_test_h.golden b/tensorflow/compiler/aot/codegen_test_h.golden
index 6641d45e..e4d8a02 100644
--- a/tensorflow/compiler/aot/codegen_test_h.golden
+++ b/tensorflow/compiler/aot/codegen_test_h.golden
@@ -65,9 +65,8 @@
   static constexpr size_t kNumArgs = 2;
 
   // Byte size of each argument buffer. There are kNumArgs entries.
-  static const intptr_t* ArgSizes() {
-    static constexpr intptr_t kArgSizes[kNumArgs] = {8, 96};
-    return kArgSizes;
+  static const ::tensorflow::int64 ArgSize(::tensorflow::int32 index) {
+    return BufferInfos()[ArgIndexToBufferIndex()[index]].size();
   }
 
   // Returns static data used to create an XlaCompiledCpuFunction.
@@ -75,17 +74,17 @@
     static XlaCompiledCpuFunction::StaticData* kStaticData = [](){
       XlaCompiledCpuFunction::StaticData* data =
         new XlaCompiledCpuFunction::StaticData;
-      data->raw_function = entry_point;
-      data->arg_sizes = ArgSizes();
-      data->num_args = kNumArgs;
-      data->temp_sizes = TempSizes();
-      data->num_temps = kNumTemps;
-      data->result_index = kResultIndex;
-      data->arg_names = StaticArgNames();
-      data->result_names = StaticResultNames();
-      data->program_shape = StaticProgramShape();
-      data->hlo_profile_printer_data = StaticHloProfilePrinterData();
-      
+      data->set_raw_function(entry_point);
+      data->set_buffer_infos(BufferInfos());
+      data->set_num_buffers(kNumBuffers);
+      data->set_arg_index_table(ArgIndexToBufferIndex());
+      data->set_num_args(kNumArgs);
+      data->set_result_index(kResultIndex);
+      data->set_arg_names(StaticArgNames());
+      data->set_result_names(StaticResultNames());
+      data->set_program_shape(StaticProgramShape());
+      data->set_hlo_profile_printer_data(StaticHloProfilePrinterData());
+
       return data;
     }();
     return *kStaticData;
@@ -215,17 +214,32 @@
   }
 
  private:
-  // Number of result and temporary buffers for the compiled computation.
-  static constexpr size_t kNumTemps = 6;
+  // Number of buffers for the compiled computation.
+  static constexpr size_t kNumBuffers = 6;
+
+  static const ::tensorflow::cpu_function_runtime::BufferInfo* BufferInfos() {
+    static const ::tensorflow::cpu_function_runtime::BufferInfo
+      kBufferInfos[kNumBuffers] = {
+::tensorflow::cpu_function_runtime::BufferInfo({5ULL, ~0ULL}),
+::tensorflow::cpu_function_runtime::BufferInfo({34ULL, 0ULL}),
+::tensorflow::cpu_function_runtime::BufferInfo({9ULL, ~0ULL}),
+::tensorflow::cpu_function_runtime::BufferInfo({386ULL, 1ULL}),
+::tensorflow::cpu_function_runtime::BufferInfo({13ULL, ~0ULL}),
+::tensorflow::cpu_function_runtime::BufferInfo({481ULL, ~0ULL})
+      };
+    return kBufferInfos;
+  }
+
+  static const ::tensorflow::int32* ArgIndexToBufferIndex() {
+    static constexpr ::tensorflow::int32 kArgIndexToBufferIndex[kNumArgs] = {
+1, 3
+    };
+    return kArgIndexToBufferIndex;
+  }
+
   // The 0-based index of the result tuple in the temporary buffers.
   static constexpr size_t kResultIndex = 5;
 
-  // Byte size of each result / temporary buffer. There are kNumTemps entries.
-  static const intptr_t* TempSizes() {
-    static constexpr intptr_t kTempSizes[kNumTemps] = {1, -1, 2, -1, 3, 120};
-    return kTempSizes;
-  }
-
   // Array of names of each positional argument, terminated by nullptr.
   static const char** StaticArgNames() {
     static const char* kNames[] = {"myfeed", nullptr};
diff --git a/tensorflow/compiler/aot/test.cc b/tensorflow/compiler/aot/test.cc
index 6b09804..5deb47d 100644
--- a/tensorflow/compiler/aot/test.cc
+++ b/tensorflow/compiler/aot/test.cc
@@ -51,11 +51,9 @@
 namespace tfcompile {
 namespace {
 
-void zero_buffers(void** bufs, const intptr_t* sizes, size_t n) {
-  for (int i = 0; i < n; ++i) {
-    if (sizes[i] != -1) {
-      memset(bufs[i], 0, sizes[i]);
-    }
+void zero_buffers(XlaCompiledCpuFunction* computation) {
+  for (int i = 0; i < computation->num_args(); ++i) {
+    memset(computation->arg_data(i), 0, computation->arg_size(i));
   }
 }
 
@@ -66,7 +64,7 @@
 
   CPP_CLASS computation;
   computation.set_thread_pool(&device);
-  zero_buffers(computation.args(), CPP_CLASS::ArgSizes(), CPP_CLASS::kNumArgs);
+  zero_buffers(&computation);
 
   EXPECT_TRUE(computation.Run());
 }
@@ -80,7 +78,7 @@
 
   CPP_CLASS computation;
   computation.set_thread_pool(&device);
-  zero_buffers(computation.args(), CPP_CLASS::ArgSizes(), CPP_CLASS::kNumArgs);
+  zero_buffers(&computation);
 
   testing::StartTiming();
   while (--iters) {
diff --git a/tensorflow/compiler/aot/tests/tfcompile_test.cc b/tensorflow/compiler/aot/tests/tfcompile_test.cc
index fee4628..0c0c676 100644
--- a/tensorflow/compiler/aot/tests/tfcompile_test.cc
+++ b/tensorflow/compiler/aot/tests/tfcompile_test.cc
@@ -44,8 +44,8 @@
 
 TEST(TFCompileTest, Add) {
   AddComp add;
-  EXPECT_EQ(add.arg0_data(), add.args()[0]);
-  EXPECT_EQ(add.arg1_data(), add.args()[1]);
+  EXPECT_EQ(add.arg0_data(), add.arg_data(0));
+  EXPECT_EQ(add.arg1_data(), add.arg_data(1));
 
   add.arg0() = 1;
   add.arg1() = 2;
@@ -67,10 +67,10 @@
   EXPECT_EQ(add_const.error_msg(), "");
   EXPECT_EQ(add_const.arg0(), 123);
   EXPECT_EQ(add_const.arg0_data()[0], 123);
-  EXPECT_EQ(add_const.arg0_data(), add.args()[0]);
+  EXPECT_EQ(add_const.arg0_data(), add.arg_data(0));
   EXPECT_EQ(add_const.arg1(), 456);
   EXPECT_EQ(add_const.arg1_data()[0], 456);
-  EXPECT_EQ(add_const.arg1_data(), add.args()[1]);
+  EXPECT_EQ(add_const.arg1_data(), add.arg_data(1));
   EXPECT_EQ(add_const.result0(), 579);
   EXPECT_EQ(add_const.result0_data()[0], 579);
   EXPECT_EQ(add_const.result0_data(), add_const.results()[0]);
@@ -85,8 +85,8 @@
   int32 arg_y = 32;
   add.set_arg0_data(&arg_x);
   add.set_arg1_data(&arg_y);
-  EXPECT_EQ(add.arg0_data(), add.args()[0]);
-  EXPECT_EQ(add.arg1_data(), add.args()[1]);
+  EXPECT_EQ(add.arg0_data(), add.arg_data(0));
+  EXPECT_EQ(add.arg1_data(), add.arg_data(1));
 
   EXPECT_TRUE(add.Run());
   EXPECT_EQ(add.error_msg(), "");
@@ -97,7 +97,7 @@
 
 TEST(TFCompileTest, AddWithCkpt) {
   AddWithCkptComp add;
-  EXPECT_EQ(add.arg0_data(), add.args()[0]);
+  EXPECT_EQ(add.arg0_data(), add.arg_data(0));
 
   add.arg0() = 1;
   EXPECT_TRUE(add.Run());
@@ -117,7 +117,7 @@
   EXPECT_EQ(add_const.error_msg(), "");
   EXPECT_EQ(add_const.arg0(), 111);
   EXPECT_EQ(add_const.arg0_data()[0], 111);
-  EXPECT_EQ(add_const.arg0_data(), add_const.args()[0]);
+  EXPECT_EQ(add_const.arg0_data(), add_const.arg_data(0));
   EXPECT_EQ(add_const.result0(), 153);
   EXPECT_EQ(add_const.result0_data()[0], 153);
   EXPECT_EQ(add_const.result0_data(), add_const.results()[0]);
@@ -125,7 +125,7 @@
 
 TEST(TFCompileTest, AddWithCkptSaver) {
   AddWithCkptSaverComp add;
-  EXPECT_EQ(add.arg0_data(), add.args()[0]);
+  EXPECT_EQ(add.arg0_data(), add.arg_data(0));
 
   add.arg0() = 1;
   EXPECT_TRUE(add.Run());
@@ -145,7 +145,7 @@
   EXPECT_EQ(add_const.error_msg(), "");
   EXPECT_EQ(add_const.arg0(), 111);
   EXPECT_EQ(add_const.arg0_data()[0], 111);
-  EXPECT_EQ(add_const.arg0_data(), add_const.args()[0]);
+  EXPECT_EQ(add_const.arg0_data(), add_const.arg_data(0));
   EXPECT_EQ(add_const.result0(), 153);
   EXPECT_EQ(add_const.result0_data()[0], 153);
   EXPECT_EQ(add_const.result0_data(), add_const.results()[0]);
@@ -153,9 +153,9 @@
 
 TEST(TFCompileTest, Cond) {
   CondComp cond;
-  EXPECT_EQ(cond.arg0_data(), cond.args()[0]);
-  EXPECT_EQ(cond.arg1_data(), cond.args()[1]);
-  EXPECT_EQ(cond.arg2_data(), cond.args()[2]);
+  EXPECT_EQ(cond.arg0_data(), cond.arg_data(0));
+  EXPECT_EQ(cond.arg1_data(), cond.arg_data(1));
+  EXPECT_EQ(cond.arg2_data(), cond.arg_data(2));
   cond.arg1() = 10;
   cond.arg2() = 20;
   {
@@ -178,8 +178,8 @@
 
 TEST(TFCompileTest, Gather) {
   GatherComp gather;
-  EXPECT_EQ(gather.arg0_data(), gather.args()[0]);
-  EXPECT_EQ(gather.arg1_data(), gather.args()[1]);
+  EXPECT_EQ(gather.arg0_data(), gather.arg_data(0));
+  EXPECT_EQ(gather.arg1_data(), gather.arg_data(1));
 
   // Successful gather.
   {
@@ -202,12 +202,12 @@
       EXPECT_EQ(gather_const.arg0(i), params[i]);
       EXPECT_EQ(gather_const.arg0_data()[i], params[i]);
     }
-    EXPECT_EQ(gather_const.arg0_data(), gather_const.args()[0]);
+    EXPECT_EQ(gather_const.arg0_data(), gather_const.arg_data(0));
     for (int i = 0; i < 2; ++i) {
       EXPECT_EQ(gather_const.arg1(i), indices[i]);
       EXPECT_EQ(gather_const.arg1_data()[i], indices[i]);
     }
-    EXPECT_EQ(gather_const.arg1_data(), gather_const.args()[1]);
+    EXPECT_EQ(gather_const.arg1_data(), gather_const.arg_data(1));
     for (int i = 0; i < 2; ++i) {
       EXPECT_EQ(gather_const.result0(i), results[i]);
       EXPECT_EQ(gather_const.result0_data()[i], results[i]);
@@ -222,8 +222,8 @@
 
   foo::bar::MatMulComp matmul;
   matmul.set_thread_pool(&device);
-  EXPECT_EQ(matmul.arg0_data(), matmul.args()[0]);
-  EXPECT_EQ(matmul.arg1_data(), matmul.args()[1]);
+  EXPECT_EQ(matmul.arg0_data(), matmul.arg_data(0));
+  EXPECT_EQ(matmul.arg1_data(), matmul.arg_data(1));
 
   // Test using the argN() methods.
   {
@@ -271,12 +271,12 @@
       EXPECT_EQ(matmul_const.arg0(i / 3, i % 3), args[i]);
       EXPECT_EQ(matmul_const.arg0_data()[i], args[i]);
     }
-    EXPECT_EQ(matmul_const.arg0_data(), matmul.args()[0]);
+    EXPECT_EQ(matmul_const.arg0_data(), matmul.arg_data(0));
     for (int i = 0; i < 6; ++i) {
       EXPECT_EQ(matmul_const.arg1(i / 2, i % 2), args[i + 6]);
       EXPECT_EQ(matmul_const.arg1_data()[i], args[i + 6]);
     }
-    EXPECT_EQ(matmul_const.arg1_data(), matmul.args()[1]);
+    EXPECT_EQ(matmul_const.arg1_data(), matmul.arg_data(1));
     for (int i = 0; i < 4; ++i) {
       EXPECT_EQ(matmul_const.result0(i / 2, i % 2), results[i]);
       EXPECT_EQ(matmul_const.result0_data()[i], results[i]);
@@ -300,8 +300,8 @@
   float arg1[3][2] = {{7, 8}, {9, 10}, {11, 12}};
   matmul.set_arg0_data(&arg0);
   matmul.set_arg1_data(&arg1);
-  EXPECT_EQ(matmul.arg0_data(), matmul.args()[0]);
-  EXPECT_EQ(matmul.arg1_data(), matmul.args()[1]);
+  EXPECT_EQ(matmul.arg0_data(), matmul.arg_data(0));
+  EXPECT_EQ(matmul.arg1_data(), matmul.arg_data(1));
 
   EXPECT_TRUE(matmul.Run());
   EXPECT_EQ(matmul.error_msg(), "");
@@ -319,8 +319,8 @@
 
   MatMulAndAddComp muladd;
   muladd.set_thread_pool(&device);
-  EXPECT_EQ(muladd.arg0_data(), muladd.args()[0]);
-  EXPECT_EQ(muladd.arg1_data(), muladd.args()[1]);
+  EXPECT_EQ(muladd.arg0_data(), muladd.arg_data(0));
+  EXPECT_EQ(muladd.arg1_data(), muladd.arg_data(1));
 
   // Test methods with positional args and results.
   {
@@ -346,12 +346,12 @@
       EXPECT_EQ(muladd_const.arg0(i / 2, i % 2), args[i]);
       EXPECT_EQ(muladd_const.arg0_data()[i], args[i]);
     }
-    EXPECT_EQ(muladd_const.arg0_data(), muladd.args()[0]);
+    EXPECT_EQ(muladd_const.arg0_data(), muladd.arg_data(0));
     for (int i = 0; i < 4; ++i) {
       EXPECT_EQ(muladd_const.arg1(i / 2, i % 2), args[i + 4]);
       EXPECT_EQ(muladd_const.arg1_data()[i], args[i + 4]);
     }
-    EXPECT_EQ(muladd_const.arg1_data(), muladd.args()[1]);
+    EXPECT_EQ(muladd_const.arg1_data(), muladd.arg_data(1));
     for (int i = 0; i < 4; ++i) {
       EXPECT_EQ(muladd_const.result0(i / 2, i % 2), results0[i]);
       EXPECT_EQ(muladd_const.result0_data()[i], results0[i]);
@@ -387,12 +387,12 @@
       EXPECT_EQ(muladd_const.arg_x(i / 2, i % 2), args[i]);
       EXPECT_EQ(muladd_const.arg_x_data()[i], args[i]);
     }
-    EXPECT_EQ(muladd_const.arg_x_data(), muladd.args()[0]);
+    EXPECT_EQ(muladd_const.arg_x_data(), muladd.arg_data(0));
     for (int i = 0; i < 4; ++i) {
       EXPECT_EQ(muladd_const.arg_y(i / 2, i % 2), args[i + 4]);
       EXPECT_EQ(muladd_const.arg_y_data()[i], args[i + 4]);
     }
-    EXPECT_EQ(muladd_const.arg_y_data(), muladd.args()[1]);
+    EXPECT_EQ(muladd_const.arg_y_data(), muladd.arg_data(1));
     for (int i = 0; i < 4; ++i) {
       EXPECT_EQ(muladd_const.result_x_y_prod(i / 2, i % 2), results0[i]);
       EXPECT_EQ(muladd_const.result_x_y_prod_data()[i], results0[i]);
@@ -407,8 +407,8 @@
 TEST(TFCompileTest, Function) {
   // The function is equivalent to an addition
   FunctionComp add_fn;
-  EXPECT_EQ(add_fn.arg0_data(), add_fn.args()[0]);
-  EXPECT_EQ(add_fn.arg1_data(), add_fn.args()[1]);
+  EXPECT_EQ(add_fn.arg0_data(), add_fn.arg_data(0));
+  EXPECT_EQ(add_fn.arg1_data(), add_fn.arg_data(1));
 
   add_fn.arg0() = 1;
   add_fn.arg1() = 2;
@@ -451,8 +451,8 @@
   // Assert is converted into a no-op in XLA, so there is no failure even if the
   // two args are different.
   AssertComp assert;
-  EXPECT_EQ(assert.arg0_data(), assert.args()[0]);
-  EXPECT_EQ(assert.arg1_data(), assert.args()[1]);
+  EXPECT_EQ(assert.arg0_data(), assert.arg_data(0));
+  EXPECT_EQ(assert.arg1_data(), assert.arg_data(1));
 
   assert.arg0() = 2;
   assert.arg1() = 1;
diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index d3238c6..e059f77 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -160,6 +160,7 @@
         "//tensorflow/compiler/jit/ops:xla_ops",
         "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/tf2xla:dump_graph",
+        "//tensorflow/compiler/tf2xla:tf2xla_util",
         "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/compiler/tf2xla/kernels:xla_ops",
         "//tensorflow/compiler/xla:util",
@@ -178,6 +179,7 @@
         "//tensorflow/core/kernels:constant_op",
         "//tensorflow/core/kernels:control_flow_ops",
         "//tensorflow/core/kernels:fifo_queue",
+        "//tensorflow/core/kernels:function_ops",
         "//tensorflow/core/kernels:identity_n_op",
         "//tensorflow/core/kernels:identity_op",
         "//tensorflow/core/kernels:no_op",
@@ -186,6 +188,9 @@
         "//tensorflow/core/kernels:sendrecv_ops",
         "//tensorflow/core/kernels:shape_ops",
         "//tensorflow/core/kernels:variable_ops",
+        "//tensorflow/core/kernels/data:generator_dataset_op",
+        "//tensorflow/core/kernels/data:iterator_ops",
+        "//tensorflow/core/kernels/data:prefetch_dataset_op",
     ],
 )
 
@@ -309,12 +314,16 @@
         "deadness_analysis_internal.h",
         "encapsulate_subgraphs_pass.cc",
         "mark_for_compilation_pass.cc",
+        "mark_for_compilation_pass_test_helper.cc",
+        "partially_decluster_pass.cc",
     ],
     hdrs = [
         "build_xla_launch_ops_pass.h",
         "deadness_analysis.h",
         "encapsulate_subgraphs_pass.h",
         "mark_for_compilation_pass.h",
+        "mark_for_compilation_pass_test_helper.h",
+        "partially_decluster_pass.h",
     ],
     deps = [
         ":common",
@@ -349,6 +358,7 @@
         "//tensorflow/compiler/jit/graphcycles",
         "//tensorflow/core:framework",
         "//tensorflow/core:graph",
+        "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/kernels:bounds_check",
     ],
@@ -413,10 +423,12 @@
     srcs = [
         "encapsulate_subgraphs_pass_test.cc",
         "mark_for_compilation_pass_test.cc",
+        "partially_decluster_pass_test.cc",
     ],
     deps = [
         ":common",
         ":compilation_passes",
+        ":xla_cluster_util",
         "//tensorflow/cc:cc_ops",
         "//tensorflow/cc:cc_ops_internal",
         "//tensorflow/cc:function_ops",
diff --git a/tensorflow/compiler/jit/deadness_analysis.cc b/tensorflow/compiler/jit/deadness_analysis.cc
index 8aff87e..309aeff 100644
--- a/tensorflow/compiler/jit/deadness_analysis.cc
+++ b/tensorflow/compiler/jit/deadness_analysis.cc
@@ -21,18 +21,79 @@
 #include "tensorflow/core/lib/hash/hash.h"
 
 // ALGORITHM OVERVIEW
+// ==================
 //
 // We map every output produced by each node in the TensorFlow graph (including
 // control dependence) into an instance of the Predicate class.  Instances of
 // Predicate denote logical formulas and mapping a node `n` to a predicate
-// `pred` implies that `n` is executed whenver `pred` is true.  Then we can
-// deduce mismatching liveness in the inputs to node by comparing the predicate
-// those inputs are mapped to.
+// `pred` implies that `n` is live whenever `pred` is true.  Then we can deduce
+// mismatching liveness in the inputs to a node by comparing the predicate those
+// inputs are mapped to.  The core logic of this pass resides in creating the
+// map from TensorFlow nodes to predicates.
 //
-// Loops are handled pessimistically -- we map Merge nodes with backedges to
-// uninterpreted symbols (the same kind we use to represent Switch and _Recv).
-// Predicate equality has to hold over all possible assignments to these
-// uninterpreted symbols.
+//
+// MAPPING NODES TO PREDICATES, MODULO CYCLES
+// ------------------------------------------
+//
+// If we ignore cycles for a moment, computing predicates is fairly
+// straightforward.  We traverse the graph in RPO, mapping each node to a
+// predicate based on the predicates its inputs are mapped to.  For instance a
+// Merge(X, Y) node will be mapped to OR(PredicateFor(X), PredicateFor(Y)).
+// Roughly speaking, we abstractly interpret each node on the "liveness" domain,
+// where values in the domain represent if a tensor carries a dead signal or
+// not.
+//
+//
+// DEALING WITH CYCLES
+// -------------------
+//
+// We map Merge nodes that are the target of a backedge to AndRecurrence
+// instances.  An AndRecurrence with start() = S and step() = X, printed as
+// {S,&,X}, *roughly* represents the infinite list of predicates
+// [S,S&X,S&X&X,S&X&X&X, ...].  So {S,&,X} can be used to represent the predicate
+// for Merge in a graph like:
+//
+//     Init
+//       |
+//       v
+//     Merge <-----------+
+//       |               |
+//       v               |
+//      Incr             |
+//       |               |
+//       v               |
+//      Switch <- Cond   |
+//       |               |
+//       v (oidx: 1)     |
+//       |               |
+//       +---------------+
+//
+// Where S is the predicate for Init and X is the predicate that asserts that
+// Cond is true.  {S,&,X} states that Merge is live on the first "iteration" iff
+// S is true, live on the second iteration iff "S&X" is true, live on the third
+// iteration iff "S&X&X" is true etc.  There is a subtlety here, S&X&X would
+// normally be equivalent to S&X which isn't quite what we want to represent.
+// Instead we want {S,&,X} to denote the infinite list [S, S&X,
+// S&X&X',S&X&X'&X'', ...] where X, X', X'' are predicates that assert Cond is
+// true on iteration 0, 1, 2 respectively.  This is made more precise in the
+// comment on the AndRecurrence class.
+//
+// The general algorithm that deals with cycles does two RPO (reverse post
+// order) passes over the graph.  On the first pass it assigns a symbolic
+// predicate to merge nodes with backedges.  On the second pass it tries to
+// pattern match the predicates for the backedges of these merges and infer an
+// AndRecurrence for the merge.
+//
+// In other words, we do a pessimistic data flow analysis where the data-flow
+// lattice has two elements, Symbolic and NonSymbolic with Symbolic >
+// NonSymbolic. The lattice has height = 2 so two iterations are sufficient to
+// converge.  We don't do an optimistic data flow analysis to make pattern
+// matching easier: if we assigned the predicate of the initial value to the
+// merge during the first pass, on the second pass the backedge may see a
+// simplified value that would be difficult to pattern match.
+//
+// We still use symbolic predicates for merges for which we can't pattern match
+// on the backedge predicate.  This is conservatively correct.
 
 namespace tensorflow {
 
@@ -42,14 +103,21 @@
 // above.
 class Predicate {
  public:
-  enum class Kind { kAnd, kOr, kNot, kSymbol };
+  enum class Kind { kAnd, kOr, kNot, kAndRecurrence, kSymbol };
 
   virtual string ToString() const = 0;
   int64 hash() const { return hash_; }
+  virtual gtl::ArraySlice<Predicate*> GetOperands() const = 0;
 
   virtual Kind kind() const = 0;
   virtual ~Predicate() {}
 
+  // Invokes func on p and on all of its operands recursively.  Does not invoke
+  // `func` on the same Predicate instance twice.  Aborts the search if `func`
+  // returns true.
+  template <typename FunctionTy>
+  static void Visit(Predicate* p, const FunctionTy& func);
+
  protected:
   explicit Predicate(int64 hash) : hash_(hash) {}
 
@@ -90,7 +158,8 @@
 
   Kind kind() const override { return Kind::kAnd; }
 
-  const gtl::ArraySlice<Predicate*> operands() const { return operands_; }
+  gtl::ArraySlice<Predicate*> GetOperands() const override { return operands_; }
+  gtl::ArraySlice<Predicate*> operands() const { return operands_; }
 
  private:
   std::vector<Predicate*> operands_;
@@ -117,7 +186,8 @@
   }
 
   Kind kind() const override { return Kind::kOr; }
-  const gtl::ArraySlice<Predicate*> operands() const { return operands_; }
+  gtl::ArraySlice<Predicate*> GetOperands() const override { return operands_; }
+  gtl::ArraySlice<Predicate*> operands() const { return operands_; }
 
  private:
   std::vector<Predicate*> operands_;
@@ -128,23 +198,58 @@
  public:
   explicit NotPredicate(Predicate* operand)
       : Predicate(HashPredicateSequence(Kind::kNot, {operand})),
-        operand_(operand) {}
+        operands_({operand}) {}
 
   string ToString() const override {
     return strings::StrCat("~", operand()->ToString());
   }
 
   Kind kind() const override { return Kind::kNot; }
-  Predicate* operand() const { return operand_; }
+  Predicate* operand() const { return operands_[0]; }
+  gtl::ArraySlice<Predicate*> GetOperands() const override { return operands_; }
 
  private:
-  Predicate* operand_;
+  std::array<Predicate*, 1> operands_;
+};
+
+// Represents an infinite list of predicates.
+//
+// An AndRecurrence with start = S and step = X is printed as {S,&,X} and stands
+// for the list of predicates:
+//
+//   S, S & GenSym(X,1), S & GenSym(X,1) & GenSym(X,2), ...
+//
+// where GenSym(<expression>, <id>) renames every SymbolPredicate in
+// <expression> by appending <id> to it, in effect creating a "fresh" symbol.
+// This means {P,&,Q} is not equal to "P on the first iteration; P&Q on
+// subsequent iterations".
+class AndRecurrencePredicate : public Predicate {
+ public:
+  explicit AndRecurrencePredicate(Predicate* start, Predicate* step)
+      : Predicate(HashPredicateSequence(Kind::kAndRecurrence, {start, step})),
+        operands_({start, step}) {}
+
+  Predicate* start() const { return operands_[0]; }
+  Predicate* step() const { return operands_[1]; }
+
+  string ToString() const override {
+    return strings::StrCat("{", start()->ToString(), ",&,", step()->ToString(),
+                           "}");
+  }
+
+  Kind kind() const override { return Kind::kAndRecurrence; }
+
+  gtl::ArraySlice<Predicate*> GetOperands() const override { return operands_; }
+
+ private:
+  std::array<Predicate*, 2> operands_;
 };
 
 // Represents an uninterpreted symbol in a logical predicate.
 //
 // Two predicates are equivalent iff they are equivalent for all assignments to
-// the symbols contained in them.
+// the symbols contained in them, i.e. predicates are forall quantified over
+// symbols.
 class SymbolPredicate : public Predicate {
  public:
   explicit SymbolPredicate(TensorId tensor_id, bool must_be_true)
@@ -158,6 +263,7 @@
   }
 
   Kind kind() const override { return Kind::kSymbol; }
+  gtl::ArraySlice<Predicate*> GetOperands() const override { return {}; }
 
   // If `must_be_true()` is true this SymbolPredicate represents the proposition
   // "tensor_id() is live and evaluates to true".
@@ -179,6 +285,29 @@
   }
 };
 
+template <typename FunctionTy>
+/*static*/ void Predicate::Visit(Predicate* p, const FunctionTy& func) {
+  gtl::FlatSet<Predicate*> visited;
+  std::vector<Predicate*> stack;
+
+  stack.push_back(p);
+  visited.insert(p);
+
+  while (!stack.empty()) {
+    Predicate* current = stack.back();
+    stack.pop_back();
+    bool done = func(current);
+    if (done) {
+      return;
+    }
+    for (Predicate* op : current->GetOperands()) {
+      if (visited.insert(op).second) {
+        stack.push_back(op);
+      }
+    }
+  }
+}
+
 // Creates and owns Predicate instances.  Simplifies predicates as it creates
 // them.
 class PredicateFactory {
@@ -204,6 +333,21 @@
     }
   }
 
+  Predicate* MakeAndRecurrencePredicate(Predicate* start, Predicate* step) {
+    auto it = interned_and_rec_instances_.find({start, step});
+    if (it != interned_and_rec_instances_.end()) {
+      return it->second.get();
+    }
+
+    std::unique_ptr<Predicate> new_pred =
+        Make<AndRecurrencePredicate>(start, step);
+    Predicate* new_pred_ptr = new_pred.get();
+    CHECK(interned_and_rec_instances_
+              .emplace(SignatureForAndRec(start, step), std::move(new_pred))
+              .second);
+    return new_pred_ptr;
+  }
+
   Predicate* MakeSymbolPredicate(TensorId tensor_id, bool must_be_true) {
     SignatureForSymbol signature = {tensor_id, must_be_true};
     auto it = interned_symbol_instances_.find(signature);
@@ -244,6 +388,7 @@
   using SignatureForAndOr =
       std::pair<Predicate::Kind, gtl::ArraySlice<Predicate*>>;
   using SignatureForNot = Predicate*;
+  using SignatureForAndRec = std::pair<Predicate*, Predicate*>;
   using SignatureForSymbol = std::pair<SafeTensorId, bool>;
 
   struct HashSignatureForAndOr {
@@ -268,6 +413,8 @@
       interned_and_or_instances_;
   gtl::FlatMap<SignatureForNot, std::unique_ptr<Predicate>>
       interned_not_instances_;
+  gtl::FlatMap<SignatureForAndRec, std::unique_ptr<Predicate>>
+      interned_and_rec_instances_;
   gtl::FlatMap<SignatureForSymbol, std::unique_ptr<Predicate>,
                HashSignatureForSymbol>
       interned_symbol_instances_;
@@ -288,10 +435,7 @@
 
     if (op->kind() == pred_kind) {
       // "Inline" the operands of an inner And/Or into the parent And/Or.
-      gtl::ArraySlice<Predicate*> operands =
-          is_and ? dynamic_cast<AndPredicate*>(op)->operands()
-                 : dynamic_cast<OrPredicate*>(op)->operands();
-      for (Predicate* subop : operands) {
+      for (Predicate* subop : op->GetOperands()) {
         if (simplified_ops_set.insert(subop).second) {
           simplified_ops.push_back(subop);
         }
@@ -351,6 +495,7 @@
       : graph_(*graph), vlog_(VLOG_IS_ON(2)) {}
 
   Status Populate();
+  Status PopulateWithReversePostOrder(gtl::ArraySlice<Node*> rpo);
   bool HasInputsWithMismatchingDeadness(const Node& node) override;
   void Print() const override;
   gtl::FlatMap<TensorId, string, TensorId::Hasher> PredicateMapAsString() const;
@@ -359,20 +504,40 @@
   enum class EdgeKind { kDataAndControl, kDataOnly, kControlOnly };
 
   std::vector<Predicate*> GetIncomingPreds(Node* n, EdgeKind edge_kind);
-  void SetPred(Node* n, int output_idx, Predicate* pred) {
-    CHECK(
-        predicate_map_.insert({TensorId(n->name(), output_idx), pred}).second);
-  }
-  void SetPred(Node* n, gtl::ArraySlice<int> output_idxs, Predicate* pred) {
-    for (int output_idx : output_idxs) {
-      SetPred(n, output_idx, pred);
+
+  // Sets the predicate for output `output_idx` of `n` to `pred`.  Sets the i'th
+  // bit of `should_revisit` if `pred` is different from the current predicate
+  // for the `output_idx` output of `n`.
+  void SetPred(Node* n, int output_idx, Predicate* pred,
+               std::vector<bool>* should_revisit) {
+    auto insert_result =
+        predicate_map_.insert({TensorId(n->name(), output_idx), pred});
+    if (!insert_result.second && insert_result.first->second != pred) {
+      VLOG(4) << "For " << n->name() << ":" << output_idx << " from "
+              << insert_result.first->second->ToString() << " "
+              << insert_result.first->second << " to " << pred->ToString()
+              << " " << pred;
+      insert_result.first->second = pred;
+      if (should_revisit != nullptr) {
+        for (const Edge* e : n->out_edges()) {
+          (*should_revisit)[e->dst()->id()] = true;
+        }
+      }
     }
   }
 
-  Status HandleSwitch(Node* n);
-  Status HandleMerge(Node* n);
-  Status HandleRecv(Node* n);
-  Status HandleGeneric(Node* n);
+  void SetPred(Node* n, gtl::ArraySlice<int> output_idxs, Predicate* pred,
+               std::vector<bool>* should_revisit) {
+    for (int output_idx : output_idxs) {
+      SetPred(n, output_idx, pred, should_revisit);
+    }
+  }
+
+  Status HandleSwitch(Node* n, std::vector<bool>* should_revisit);
+  Status HandleMerge(Node* n, std::vector<bool>* should_revisit);
+  Status HandleRecv(Node* n, std::vector<bool>* should_revisit);
+  Status HandleGeneric(Node* n, std::vector<bool>* should_revisit);
+  Status HandleNode(Node* n, std::vector<bool>* should_revisit);
 
   const Graph& graph_;
   gtl::FlatMap<TensorId, Predicate*, TensorId::Hasher> predicate_map_;
@@ -395,14 +560,15 @@
 
     if (should_process) {
       auto it = predicate_map_.find(InputEdgeToTensorId(in_edge));
-      CHECK(it != predicate_map_.end());
+      CHECK(it != predicate_map_.end()) << n->name();
       incoming_preds.push_back(it->second);
     }
   }
   return incoming_preds;
 }
 
-Status DeadnessAnalysisImpl::HandleSwitch(Node* n) {
+Status DeadnessAnalysisImpl::HandleSwitch(Node* n,
+                                          std::vector<bool>* should_revisit) {
   std::vector<Predicate*> input_preds =
       GetIncomingPreds(n, EdgeKind::kDataAndControl);
   const Edge* pred_edge;
@@ -414,42 +580,153 @@
 
   // Output 0 is alive iff all inputs are alive and the condition is false.
   input_preds.push_back(false_switch);
-  SetPred(n, 0, predicate_factory_.MakeAndPredicate(input_preds));
+  SetPred(n, 0, predicate_factory_.MakeAndPredicate(input_preds),
+          should_revisit);
   input_preds.pop_back();
 
   // Output 1 is alive iff all inputs are alive and the condition is true.
   input_preds.push_back(true_switch);
-  SetPred(n, 1, predicate_factory_.MakeAndPredicate(input_preds));
+  SetPred(n, 1, predicate_factory_.MakeAndPredicate(input_preds),
+          should_revisit);
   input_preds.pop_back();
 
-  // Control is alive iff any inputs are alive.
+  // Control is alive iff all inputs are alive.
   SetPred(n, Graph::kControlSlot,
-          predicate_factory_.MakeAndPredicate(input_preds));
+          predicate_factory_.MakeAndPredicate(input_preds), should_revisit);
 
   return Status::OK();
 }
 
-Status DeadnessAnalysisImpl::HandleMerge(Node* n) {
+namespace {
+const Edge* FindUniqueBackedge(Node* merge) {
+  CHECK(merge->IsMerge());
+  const Edge* result = nullptr;
+  for (const Edge* e : merge->in_edges()) {
+    if (e->src()->IsNextIteration()) {
+      CHECK_EQ(result, nullptr)
+          << "Multiple backedges to " << merge->DebugString();
+      result = e;
+    }
+  }
+  return result;
+}
+
+// If `backedge_predicate` is equal to `symbolic_predicate` & Step where Step
+// does not contain `symbolic_predicate` as an inner (not top-level) operand
+// then returns `Step`.  Otherwise returns nullptr.
+Predicate* DeduceStepPredicate(PredicateFactory* predicate_factory,
+                               Predicate* symbolic_predicate,
+                               Predicate* backedge_predicate) {
+  CHECK(dynamic_cast<SymbolPredicate*>(symbolic_predicate));
+  if (backedge_predicate->kind() != Predicate::Kind::kAnd) {
+    return nullptr;
+  }
+
+  std::vector<Predicate*> and_ops;
+  gtl::ArraySlice<Predicate*> recurrent_pred_ops =
+      backedge_predicate->GetOperands();
+
+  bool found_sym = false;
+  for (Predicate* and_op : recurrent_pred_ops) {
+    // We want the `symbol_predicate` to be the one of the operands of
+    // `backedge_predicate`,
+    if (and_op == symbolic_predicate) {
+      found_sym = true;
+      continue;
+    }
+
+    // but we don't want it to be present anywhere else in the formula.  E.g. we
+    // don't want the recurrent predicate to be
+    // symbol_predicate&(X|symbol_predicate).
+    bool found_sym_as_inner_operand = false;
+    auto has_self_as_inner_operand = [&](Predicate* p) {
+      if (p == symbolic_predicate) {
+        found_sym_as_inner_operand = true;
+        return true;  // Stop searching, we're done.
+      }
+
+      // Continue searching.
+      return false;
+    };
+
+    Predicate::Visit(and_op, has_self_as_inner_operand);
+    if (found_sym_as_inner_operand) {
+      return nullptr;
+    }
+    and_ops.push_back(and_op);
+  }
+
+  return found_sym ? predicate_factory->MakeAndPredicate(and_ops) : nullptr;
+}
+}  // namespace
+
+Status DeadnessAnalysisImpl::HandleMerge(Node* n,
+                                         std::vector<bool>* should_revisit) {
   // Merge ignores deadness of its control inputs.  A merge that isn't the
-  // target of a backedge has is alive iff any of its data inputs are.  We treat
-  // the liveness of a merge that is the target of a backedge symbolically.
+  // target of a backedge is alive iff any of its data inputs are.  The
+  // liveness of a merge that is the target of a backedge can sometimes be
+  // represented using an AndRecurrencePredicate.  If neither applies, we represent
+  // the liveness of the merge symbolically.
 
-  bool has_backedge = std::any_of(
-      n->in_edges().begin(), n->in_edges().end(), [](const Edge* e) {
-        return !e->IsControlEdge() && e->src()->IsNextIteration();
-      });
+  bool has_unvisited_backedge = false;
+  for (const Edge* e : n->in_edges()) {
+    if (!e->IsControlEdge() && e->src()->IsNextIteration()) {
+      has_unvisited_backedge |= !predicate_map_.count(InputEdgeToTensorId(e));
+    }
+  }
 
-  Predicate* input_data_pred =
-      has_backedge ? predicate_factory_.MakeSymbolPredicate(
-                         TensorId(n->name(), 0), /*must_be_true=*/false)
-                   : predicate_factory_.MakeOrPredicate(
-                         GetIncomingPreds(n, EdgeKind::kDataOnly));
+  auto it = predicate_map_.find(TensorId(n->name(), 0));
+  if (it == predicate_map_.end()) {
+    if (has_unvisited_backedge) {
+      // We're visiting this merge for the first time and it has an unvisited
+      // backedge.
+      Predicate* input_data_pred = predicate_factory_.MakeSymbolPredicate(
+          TensorId(n->name(), 0), /*must_be_true=*/false);
+      SetPred(n, {0, 1, Graph::kControlSlot}, input_data_pred, should_revisit);
+      return Status::OK();
+    }
 
-  SetPred(n, {0, 1, Graph::kControlSlot}, input_data_pred);
+    // We're visiting this merge for the first time and it is an acyclic merge.
+    Predicate* input_data_pred = predicate_factory_.MakeOrPredicate(
+        GetIncomingPreds(n, EdgeKind::kDataOnly));
+    SetPred(n, {0, 1, Graph::kControlSlot}, input_data_pred, should_revisit);
+    return Status::OK();
+  }
+
+  if (it->second->kind() == Predicate::Kind::kSymbol) {
+    // Last time we visited this merge we only got a symbolic predicate because
+    // of an unvisited backedge.  Try to pattern match the predicate expression
+    // for that backedge (which should be visited now) into an and recurrence
+    // for the merge node.
+    if (const Edge* unique_backedge = FindUniqueBackedge(n)) {
+      if (Predicate* step = DeduceStepPredicate(
+              &predicate_factory_, it->second,
+              predicate_map_[InputEdgeToTensorId(unique_backedge)])) {
+        // If the predicate for the backedge is "Sym&X" where "Sym" is the
+        // predicate for the merge then the merge has predicate {S,&,X} where S
+        // is the predicate for the merge ignoring the backedge.
+        std::vector<Predicate*> non_recurrent_inputs;
+        for (const Edge* e : n->in_edges()) {
+          if (e != unique_backedge) {
+            non_recurrent_inputs.push_back(
+                predicate_map_[InputEdgeToTensorId(e)]);
+          }
+        }
+
+        Predicate* start =
+            predicate_factory_.MakeOrPredicate(non_recurrent_inputs);
+        Predicate* and_rec =
+            predicate_factory_.MakeAndRecurrencePredicate(start, step);
+        SetPred(n, {0, 1, Graph::kControlSlot}, and_rec, should_revisit);
+        return Status::OK();
+      }
+    }
+  }
   return Status::OK();
 }
 
-Status DeadnessAnalysisImpl::HandleRecv(Node* n) {
+Status DeadnessAnalysisImpl::HandleRecv(Node* n,
+                                        std::vector<bool>* should_revisit) {
   // In addition to being alive or dead based on the inputs, a _Recv can also
   // acquire a dead signal from a _Send.
   std::vector<Predicate*> input_preds =
@@ -457,18 +734,37 @@
   input_preds.push_back(predicate_factory_.MakeSymbolPredicate(
       TensorId(n->name(), 0), /*must_be_true=*/false));
   SetPred(n, {0, Graph::kControlSlot},
-          predicate_factory_.MakeAndPredicate(input_preds));
+          predicate_factory_.MakeAndPredicate(input_preds), should_revisit);
   return Status::OK();
 }
 
-Status DeadnessAnalysisImpl::HandleGeneric(Node* n) {
+Status DeadnessAnalysisImpl::HandleGeneric(Node* n,
+                                           std::vector<bool>* should_revisit) {
   // Generally nodes are alive iff all their inputs are alive.
   Predicate* pred = predicate_factory_.MakeAndPredicate(
       GetIncomingPreds(n, EdgeKind::kDataAndControl));
   for (int output_idx = 0; output_idx < n->num_outputs(); output_idx++) {
-    SetPred(n, output_idx, pred);
+    SetPred(n, output_idx, pred, should_revisit);
   }
-  SetPred(n, Graph::kControlSlot, pred);
+  SetPred(n, Graph::kControlSlot, pred, should_revisit);
+  return Status::OK();
+}
+
+Status DeadnessAnalysisImpl::HandleNode(Node* n,
+                                        std::vector<bool>* should_revisit) {
+  if (n->IsSwitch()) {
+    TF_RETURN_IF_ERROR(HandleSwitch(n, should_revisit));
+  } else if (n->IsMerge()) {
+    TF_RETURN_IF_ERROR(HandleMerge(n, should_revisit));
+  } else if (n->IsControlTrigger()) {
+    SetPred(n, Graph::kControlSlot, predicate_factory_.MakeTrue(), nullptr);
+  } else if (n->IsRecv() || n->IsHostRecv()) {
+    TF_RETURN_IF_ERROR(HandleRecv(n, should_revisit));
+  } else if (n->IsNextIteration()) {
+    TF_RETURN_IF_ERROR(HandleGeneric(n, should_revisit));
+  } else {
+    TF_RETURN_IF_ERROR(HandleGeneric(n, should_revisit));
+  }
   return Status::OK();
 }
 
@@ -478,20 +774,53 @@
                       /*edge_filter=*/[](const Edge& edge) {
                         return !edge.src()->IsNextIteration();
                       });
+  return PopulateWithReversePostOrder(rpo);
+}
 
+Status DeadnessAnalysisImpl::PopulateWithReversePostOrder(
+    gtl::ArraySlice<Node*> rpo) {
   // This an abstract interpretation over the deadness propagation semantics of
   // the graph executor.
+  //
+  // We iterate over the graph twice, each time in RPO.  On the first iteration
+  // merge nodes with backedges are mapped to symbolic predicates.  On the
+  // second iteration we use the predicates assigned to the backedges in the
+  // previous iteration to infer a more precise predicate for the backedge merge
+  // nodes and all the nodes that transitively use it.
+  //
+  // We don't track the output indices for should_revisit.  Instead, putting a
+  // node in `should_revisit` denotes that the deadness flowing out from any
+  // output from said node may have changed.  This is fine; only switches
+  // propagate different deadness along different output edges, and since the
+  // delta is solely due to the input *values* (and not input deadness), the
+  // delta should not change in the second iteration.
+  std::vector<bool> should_revisit;
+  should_revisit.resize(graph_.num_node_ids());
   for (Node* n : rpo) {
-    if (n->IsSwitch()) {
-      TF_RETURN_IF_ERROR(HandleSwitch(n));
-    } else if (n->IsMerge()) {
-      TF_RETURN_IF_ERROR(HandleMerge(n));
-    } else if (n->IsControlTrigger()) {
-      SetPred(n, Graph::kControlSlot, predicate_factory_.MakeTrue());
-    } else if (n->IsRecv() || n->IsHostRecv()) {
-      TF_RETURN_IF_ERROR(HandleRecv(n));
-    } else {
-      TF_RETURN_IF_ERROR(HandleGeneric(n));
+    VLOG(4) << "Visiting " << n->name();
+    TF_RETURN_IF_ERROR(HandleNode(n, /*should_revisit=*/nullptr));
+    if (n->IsNextIteration()) {
+      // If this is a backedge for a merge node then remember to reprocess the
+      // merge the next time we run.
+      for (const Edge* e : n->out_edges()) {
+        if (e->dst()->IsMerge()) {
+          should_revisit[e->dst()->id()] = true;
+        }
+      }
+    }
+  }
+
+  for (Node* n : rpo) {
+    // The nodes added to should_revisit in the previous loop need to be
+    // revisited now.  Reprocessing these initial nodes may add *their* consumers
+    // to should_revisit, and these newly added nodes will also be processed by
+    // this very same loop.  Since we're traversing the graph in reverse post
+    // order (producers before consumers) and HandleNode(n) can only ever add
+    // n's consumers to should_revisit, we won't "miss" an addition to
+    // should_revisit.
+    if (should_revisit[n->id()]) {
+      VLOG(4) << "Revisiting " << n->name();
+      TF_RETURN_IF_ERROR(HandleNode(n, &should_revisit));
     }
   }
 
@@ -587,6 +916,15 @@
   *out_predicate_map = impl.PredicateMapAsString();
   return Status::OK();
 }
+
+Status ComputePredicates(const Graph& graph,
+                         gtl::ArraySlice<Node*> reverse_post_order,
+                         PredicateMapTy* out_predicate_map) {
+  DeadnessAnalysisImpl impl(&graph);
+  TF_RETURN_IF_ERROR(impl.PopulateWithReversePostOrder(reverse_post_order));
+  *out_predicate_map = impl.PredicateMapAsString();
+  return Status::OK();
+}
 }  // namespace deadness_analysis_internal
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/deadness_analysis_internal.h b/tensorflow/compiler/jit/deadness_analysis_internal.h
index cdef405..401d6e4 100644
--- a/tensorflow/compiler/jit/deadness_analysis_internal.h
+++ b/tensorflow/compiler/jit/deadness_analysis_internal.h
@@ -26,6 +26,14 @@
 // testing purposes only.
 using PredicateMapTy = gtl::FlatMap<TensorId, string, TensorId::Hasher>;
 Status ComputePredicates(const Graph& graph, PredicateMapTy* out_predicate_map);
+
+// Returns a map describing the predicate each Tensor was mapped to.  For
+// testing purposes only.  Makes deadness analysis visit the graph in the order
+// specified in `reverse_post_order` which must be a valid RPO for the graph
+// minus NextIteration->Merge edges.
+Status ComputePredicates(const Graph& graph,
+                         gtl::ArraySlice<Node*> reverse_post_order,
+                         PredicateMapTy* out_predicate_map);
 }  // namespace deadness_analysis_internal
 }  // namespace tensorflow
 
diff --git a/tensorflow/compiler/jit/deadness_analysis_test.cc b/tensorflow/compiler/jit/deadness_analysis_test.cc
index 6881095..cc9f102 100644
--- a/tensorflow/compiler/jit/deadness_analysis_test.cc
+++ b/tensorflow/compiler/jit/deadness_analysis_test.cc
@@ -38,6 +38,9 @@
 namespace tensorflow {
 namespace {
 
+using deadness_analysis_internal::ComputePredicates;
+using deadness_analysis_internal::PredicateMapTy;
+
 Status AnalyzeDeadness(Graph* graph,
                        std::unique_ptr<DeadnessAnalysis>* result) {
   FixupSourceAndSinkEdges(graph);
@@ -51,13 +54,73 @@
   return ops::Switch(root.WithOpName(prefix + "/switch"), value, predicate);
 }
 
-Output CreateInductionVariable(const Scope& root, const string& prefix,
-                               const string& frame_name, int32 init) {
-  Output initial_value = ops::Const(root.WithOpName(prefix + "/init"), init);
+TensorId ControlOutputFor(const Output& o) {
+  return {o.node()->name(), Graph::kControlSlot};
+}
+
+void VLogGraphIfAsked(const Graph& graph) {
+  if (VLOG_IS_ON(3)) {
+    GraphDef graph_def;
+    graph.ToGraphDef(&graph_def);
+    string serialized;
+    ::tensorflow::protobuf::TextFormat::PrintToString(graph_def, &serialized);
+    LOG(INFO) << serialized;
+  }
+}
+
+struct InductionVarInfo {
+  Output induction_var;
+  Output loop_cond;
+};
+
+// Creates an induction variable with the following structure (simplified for
+// brevity):
+//
+//            +---------------+
+//            | initial_value |
+//            +---------------+
+//              |
+//              |
+//              v
+//            +---------------+
+//            |     Enter     |
+//            +---------------+
+//              |
+//              |
+//              v
+//            +---------------+
+//         +> |     Merge     | -+
+//         |  +---------------+  |
+//         |    |                |
+//         |    |                |
+//         |    v                |
+//         |  +---------------+  |
+//         |  |  LessThan10   |  |
+//         |  +---------------+  |
+//         |    |                |
+//         |    |                |
+//         |    v                |
+//         |  +---------------+  |
+//    +----+- |    Switch     | <+
+//    |    |  +---------------+
+//    |    |    |
+//    |    |    |
+//    |    |    v
+//    |    |  +---------------+
+//    |    +- |    AddOne     |
+//    |       +---------------+
+//    |       +---------------+
+//    +-----> |     Exit      |
+//            +---------------+
+InductionVarInfo CreateInductionVariable(const Scope& root,
+                                         const string& prefix,
+                                         const string& frame_name,
+                                         const Output& initial_value) {
   Output enter_initial_value = ops::internal::Enter(
       root.WithOpName(prefix + "/enter"), initial_value, frame_name);
 
-  ops::Merge iv(root.WithOpName(prefix + "/iv"), {enter_initial_value});
+  ops::Merge iv(root.WithOpName(prefix + "/iv"),
+                {enter_initial_value, enter_initial_value});
   Output increment_by = ops::Const(root.WithOpName(prefix + "/incr"), 1);
   Output final_value = ops::Const(root.WithOpName(prefix + "/final"), 10);
   Output loop_cond_expr =
@@ -66,16 +129,84 @@
       ops::LoopCond(root.WithOpName(prefix + "/cond"), loop_cond_expr);
   ops::Switch latch(root.WithOpName(prefix + "/latch"), iv.output, loop_cond);
   ops::internal::Exit exit(root.WithOpName(prefix + "/exit"), iv.output);
-  Output iv_next =
-      ops::Add(root.WithOpName(prefix + "/ivnext"), iv.output, increment_by);
+  Output iv_next = ops::Add(root.WithOpName(prefix + "/ivnext"),
+                            latch.output_true, increment_by);
   Output next_iteration =
-      ops::NextIteration(root.WithOpName(prefix + "next_iteration"), iv_next);
+      ops::NextIteration(root.WithOpName(prefix + "/next_iteration"), iv_next);
 
-  root.graph()->AddEdge(next_iteration.node(), 0, iv.output.node(), 1);
+  CHECK(root.graph()
+            ->UpdateEdge(next_iteration.node(), 0, iv.output.node(), 1)
+            .ok());
   root.graph()->AddControlEdge(iv.output.node(), increment_by.node());
   root.graph()->AddControlEdge(iv.output.node(), final_value.node());
 
-  return iv.output;
+  return {iv.output, loop_cond};
+}
+
+InductionVarInfo CreateInductionVariable(const Scope& root,
+                                         const string& prefix,
+                                         const string& frame_name, int32 init) {
+  return CreateInductionVariable(
+      root, prefix, frame_name,
+      ops::Const(root.WithOpName(prefix + "/init"), init));
+}
+
+// Creates a dependent loop invariant value with the following structure:
+//
+//                           +---------------+
+//                           | initial_value |
+//                           +---------------+
+//                             |
+//                             |
+//                             v
+//                           +---------------+
+//                           |     Enter     |
+//                           +---------------+
+//                             |
+//                             |
+//                             v
+//                           +---------------+
+//                           |     Merge     | <+
+//                           +---------------+  |
+//                             |                |
+//                             |                |
+//                             v                |
+//         +-----------+     +---------------+  |
+//         | loop_cond | --> |    Switch     | -+
+//         +-----------+     +---------------+
+//                             |
+//                             |
+//                             v
+//                           +---------------+
+//                           |     Exit      |
+//                           +---------------+
+struct DependentInductionVar {
+  Output induction_var;
+  ops::Switch latch;
+};
+
+DependentInductionVar CreateDependentLoopInvariantValue(
+    const Scope& root, const string& prefix, const string& frame_name,
+    const Output& loop_cond, const Output& value) {
+  Output enter_value = ops::internal::Enter(root.WithOpName(prefix + "/enter"),
+                                            value, frame_name);
+  ops::Merge iv(root.WithOpName(prefix + "/iv"), {enter_value, enter_value});
+  ops::Switch latch(root.WithOpName(prefix + "/latch"), iv.output, loop_cond);
+  ops::internal::Exit exit(root.WithOpName(prefix + "/exit"), iv.output);
+  Output next_iteration = ops::NextIteration(
+      root.WithOpName(prefix + "/next_iteration"), latch.output_true);
+  CHECK(root.graph()
+            ->UpdateEdge(next_iteration.node(), 0, iv.output.node(), 1)
+            .ok());
+  return {iv.output, latch};
+}
+
+DependentInductionVar CreateDependentLoopInvariantValue(
+    const Scope& root, const string& prefix, const string& frame_name,
+    const Output& loop_cond, int32 value) {
+  return CreateDependentLoopInvariantValue(
+      root, prefix, frame_name, loop_cond,
+      ops::Const(root.WithOpName(prefix + "/init"), value));
 }
 
 TEST(DeadnessAnalysisTest, BasicPositive) {
@@ -337,21 +468,224 @@
 
 TEST(DeadnessAnalysisTest, Loop) {
   Scope root = Scope::NewRootScope().ExitOnError();
-  Output iv0 = CreateInductionVariable(root, "iv0", "fr0", 0);
-  Output iv1 = CreateInductionVariable(root, "iv1", "fr0", 0);
-  Output iv2 = CreateInductionVariable(root, "iv2", "fr0", 1);
+  Output iv0 = CreateInductionVariable(root, "iv0", "fr0", 0).induction_var;
+  Output iv1 = CreateInductionVariable(root, "iv1", "fr0", 0).induction_var;
+  Output iv2 = CreateInductionVariable(root, "iv2", "fr0", 1).induction_var;
   Output add0 = ops::Add(root.WithOpName("add0"), iv0, iv1);
   Output add1 = ops::Add(root.WithOpName("add1"), iv1, iv2);
 
-  std::unique_ptr<DeadnessAnalysis> result;
-  TF_ASSERT_OK(AnalyzeDeadness(root.graph(), &result));
-
   // NB!  iv0 and iv1 are equivalent and a smarter deadness analysis would have
   // noticed that.  Today we are pessimistic here because we assign an
   // uninterpreted symbol to merges with backedges.
 
-  EXPECT_TRUE(result->HasInputsWithMismatchingDeadness(*add0.node()));
-  EXPECT_TRUE(result->HasInputsWithMismatchingDeadness(*add1.node()));
+  VLogGraphIfAsked(*root.graph());
+
+  {
+    std::unique_ptr<DeadnessAnalysis> result;
+    TF_ASSERT_OK(AnalyzeDeadness(root.graph(), &result));
+
+    EXPECT_TRUE(result->HasInputsWithMismatchingDeadness(*add0.node()));
+    EXPECT_TRUE(result->HasInputsWithMismatchingDeadness(*add1.node()));
+  }
+  {
+    PredicateMapTy predicate_map;
+    TF_ASSERT_OK(ComputePredicates(*root.graph(), &predicate_map));
+
+    // In theory we should be able to tell that iv0/cond:0 and iv1/cond:0
+    // produce the same deadness.  But we're not that smart today.
+    EXPECT_EQ(predicate_map[ControlOutputFor(iv0)], "{#true,&,*iv0/cond:0}");
+    EXPECT_EQ(predicate_map[ControlOutputFor(iv1)], "{#true,&,*iv1/cond:0}");
+    EXPECT_EQ(predicate_map[ControlOutputFor(iv2)], "{#true,&,*iv2/cond:0}");
+    EXPECT_EQ(predicate_map[ControlOutputFor(add0)],
+              "({#true,&,*iv1/cond:0} & {#true,&,*iv0/cond:0})");
+    EXPECT_EQ(predicate_map[ControlOutputFor(add1)],
+              "({#true,&,*iv1/cond:0} & {#true,&,*iv2/cond:0})");
+  }
+}
+
+TEST(DeadnessAnalysisTest, ControlEquivalentLoopBodies) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+  InductionVarInfo iv = CreateInductionVariable(root, "iv0", "frame", 0);
+  Output dependent_iv0 =
+      CreateDependentLoopInvariantValue(root, "div0", "frame", iv.loop_cond, 0)
+          .induction_var;
+  Output dependent_iv1 =
+      CreateDependentLoopInvariantValue(root, "div1", "frame", iv.loop_cond, 0)
+          .induction_var;
+  Output add0 = ops::Add(root.WithOpName("add0"), dependent_iv0, dependent_iv1);
+
+  VLogGraphIfAsked(*root.graph());
+
+  {
+    std::unique_ptr<DeadnessAnalysis> result;
+    TF_ASSERT_OK(AnalyzeDeadness(root.graph(), &result));
+
+    EXPECT_FALSE(result->HasInputsWithMismatchingDeadness(*add0.node()));
+  }
+  {
+    PredicateMapTy predicate_map;
+    TF_ASSERT_OK(ComputePredicates(*root.graph(), &predicate_map));
+
+    EXPECT_EQ(predicate_map[ControlOutputFor(iv.induction_var)],
+              "{#true,&,*iv0/cond:0}");
+    EXPECT_EQ(predicate_map[ControlOutputFor(dependent_iv0)],
+              "{#true,&,(*iv0/cond:0 & iv0/iv:0)}");
+    EXPECT_EQ(predicate_map[ControlOutputFor(dependent_iv1)],
+              "{#true,&,(*iv0/cond:0 & iv0/iv:0)}");
+    EXPECT_EQ(predicate_map[ControlOutputFor(add0)],
+              "{#true,&,(*iv0/cond:0 & iv0/iv:0)}");
+  }
+}
+
+TEST(DeadnessAnalysisTest, LoopInvariantPredicateOnBackedge) {
+  // Create a merge that "looks like" a loop but isn't really.  It has a value
+  // that does not depend on the merge on its backedge.
+  Scope root = Scope::NewRootScope().ExitOnError();
+  InductionVarInfo iv = CreateInductionVariable(root, "iv0", "frame", 0);
+  DependentInductionVar dependent_iv =
+      CreateDependentLoopInvariantValue(root, "div0", "frame", iv.loop_cond, 0);
+  FixupSourceAndSinkEdges(root.graph());
+
+  // To make deadness analysis think that dependent_iv is a loop we need an RPO
+  // that visits the merge before the backedge.  This is a legal RPO for
+  // deadness analysis since it ignores NextIteration->Merge edges during RPO.
+  // Right now dependent_iv has an edge from Merge to NextIteration so do the
+  // RPO with this edge in place.  Then remove this edge to get our test case.
+  std::vector<Node*> rpo;
+  GetReversePostOrder(*root.graph(), &rpo, /*stable_comparator=*/{},
+                      /*edge_filter=*/[](const Edge& edge) {
+                        return !edge.src()->IsNextIteration();
+                      });
+  TF_ASSERT_OK(root.graph()->UpdateEdge(
+      iv.induction_var.node(), 0, dependent_iv.latch.output_true.node(), 0));
+
+  VLogGraphIfAsked(*root.graph());
+
+  {
+    PredicateMapTy predicate_map;
+    TF_ASSERT_OK(ComputePredicates(*root.graph(), rpo, &predicate_map));
+
+    EXPECT_EQ(predicate_map[ControlOutputFor(dependent_iv.induction_var)],
+              "div0/iv:0");
+  }
+}
+
+TEST(DeadnessAnalysisTest, ControlEquivalentNestedLoopBodies) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+  InductionVarInfo iv_outer =
+      CreateInductionVariable(root, "iv_outer", "frame", 0);
+  ops::Switch inner_value(root.WithOpName("outer_is_live"),
+                          ops::Const(root.WithOpName("constant"), 5),
+                          iv_outer.loop_cond);
+  InductionVarInfo iv_inner = CreateInductionVariable(
+      root, "iv_inner", "frame",
+      ops::internal::Enter(root.WithOpName("iv_inner/enter"),
+                           inner_value.output_true, "frame_inner"));
+
+  Output dependent_outer_iv0 =
+      CreateDependentLoopInvariantValue(root, "dependent_outer_iv0", "frame",
+                                        iv_outer.loop_cond, 0)
+          .induction_var;
+  Output dependent_outer_iv1 =
+      CreateDependentLoopInvariantValue(root, "dependent_outer_iv1", "frame",
+                                        iv_outer.loop_cond, 0)
+          .induction_var;
+
+  Output dependent_inner_iv0 =
+      CreateDependentLoopInvariantValue(root, "dependent_inner_iv0", "frame",
+                                        iv_inner.loop_cond, dependent_outer_iv0)
+          .induction_var;
+  Output dependent_inner_iv1 =
+      CreateDependentLoopInvariantValue(root, "dependent_inner_iv1", "frame",
+                                        iv_inner.loop_cond, dependent_outer_iv1)
+          .induction_var;
+
+  Output add0 = ops::Add(root.WithOpName("add0"), dependent_inner_iv0,
+                         dependent_inner_iv1);
+
+  VLogGraphIfAsked(*root.graph());
+
+  {
+    std::unique_ptr<DeadnessAnalysis> result;
+    TF_ASSERT_OK(AnalyzeDeadness(root.graph(), &result));
+
+    EXPECT_FALSE(result->HasInputsWithMismatchingDeadness(*add0.node()));
+  }
+  {
+    PredicateMapTy predicate_map;
+    TF_ASSERT_OK(ComputePredicates(*root.graph(), &predicate_map));
+
+    EXPECT_EQ(predicate_map[ControlOutputFor(iv_outer.induction_var)],
+              "{#true,&,*iv_outer/cond:0}");
+    EXPECT_EQ(predicate_map[ControlOutputFor(iv_inner.induction_var)],
+              "{(*iv_outer/cond:0 & {#true,&,*iv_outer/cond:0}),&,"
+              "*iv_inner/cond:0}");
+
+    EXPECT_EQ(predicate_map[ControlOutputFor(dependent_inner_iv0)],
+              "{{#true,&,(iv_outer/iv:0 & *iv_outer/cond:0)},&,"
+              "(*iv_inner/cond:0 & iv_inner/iv:0)}");
+    EXPECT_EQ(predicate_map[ControlOutputFor(dependent_inner_iv1)],
+              "{{#true,&,(iv_outer/iv:0 & *iv_outer/cond:0)},&,"
+              "(*iv_inner/cond:0 & iv_inner/iv:0)}");
+    EXPECT_EQ(predicate_map[ControlOutputFor(add0)],
+              "{{#true,&,(iv_outer/iv:0 & *iv_outer/cond:0)},&,"
+              "(*iv_inner/cond:0 & iv_inner/iv:0)}");
+  }
+}
+
+TEST(DeadnessAnalysisTest, ControlNonEquivalentNestedLoopBodies) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+  InductionVarInfo iv_outer_0 =
+      CreateInductionVariable(root, "iv_outer_0", "frame", 0);
+  ops::Switch inner_value_0(root.WithOpName("outer_0_is_live"),
+                            ops::Const(root.WithOpName("constant"), 5),
+                            iv_outer_0.loop_cond);
+  InductionVarInfo iv_inner_0 = CreateInductionVariable(
+      root, "iv_inner_0", "frame",
+      ops::internal::Enter(root.WithOpName("iv_inner_0/enter"),
+                           inner_value_0.output_true, "frame_inner"));
+
+  InductionVarInfo iv_outer_1 =
+      CreateInductionVariable(root, "iv_outer_1", "frame", 1);
+  ops::Switch inner_init_value_1(root.WithOpName("outer_1_is_live"),
+                                 ops::Const(root.WithOpName("constant"), 5),
+                                 iv_outer_1.loop_cond);
+  InductionVarInfo iv_inner_1 = CreateInductionVariable(
+      root, "iv_inner_1", "frame",
+      ops::internal::Enter(root.WithOpName("iv_inner_1/enter"),
+                           inner_init_value_1.output_true, "frame_inner"));
+  Output add0 = ops::Add(root.WithOpName("add0"), iv_inner_0.induction_var,
+                         iv_inner_1.induction_var);
+
+  VLogGraphIfAsked(*root.graph());
+
+  {
+    std::unique_ptr<DeadnessAnalysis> result;
+    TF_ASSERT_OK(AnalyzeDeadness(root.graph(), &result));
+
+    EXPECT_TRUE(result->HasInputsWithMismatchingDeadness(*add0.node()));
+  }
+
+  {
+    PredicateMapTy predicate_map;
+    TF_ASSERT_OK(ComputePredicates(*root.graph(), &predicate_map));
+
+    EXPECT_EQ(predicate_map[ControlOutputFor(iv_outer_0.induction_var)],
+              "{#true,&,*iv_outer_0/cond:0}");
+    EXPECT_EQ(predicate_map[ControlOutputFor(iv_inner_0.induction_var)],
+              "{(*iv_outer_0/cond:0 & {#true,&,*iv_outer_0/cond:0}),&,"
+              "*iv_inner_0/cond:0}");
+    EXPECT_EQ(predicate_map[ControlOutputFor(iv_outer_1.induction_var)],
+              "{#true,&,*iv_outer_1/cond:0}");
+    EXPECT_EQ(predicate_map[ControlOutputFor(iv_inner_1.induction_var)],
+              "{(*iv_outer_1/cond:0 & {#true,&,*iv_outer_1/cond:0}),&,"
+              "*iv_inner_1/cond:0}");
+    EXPECT_EQ(predicate_map[ControlOutputFor(add0)],
+              "({(*iv_outer_1/cond:0 & {#true,&,*iv_outer_1/cond:0}),&,"
+              "*iv_inner_1/cond:0} & "
+              "{(*iv_outer_0/cond:0 & {#true,&,*iv_outer_0/cond:0}),&,"
+              "*iv_inner_0/cond:0})");
+  }
 }
 
 TEST(DeadnessAnalysisTest, ControlInputs) {
@@ -454,9 +788,8 @@
   std::unique_ptr<DeadnessAnalysis> result;
   TF_ASSERT_OK(AnalyzeDeadness(root.graph(), &result));
 
-  deadness_analysis_internal::PredicateMapTy predicate_map;
-  TF_ASSERT_OK(deadness_analysis_internal::ComputePredicates(*root.graph(),
-                                                             &predicate_map));
+  PredicateMapTy predicate_map;
+  TF_ASSERT_OK(ComputePredicates(*root.graph(), &predicate_map));
 
   TensorId logical_and_output_0 = {logical_and.node()->name(),
                                    Graph::kControlSlot};
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
index fdd71c6..f150bf1 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
@@ -1161,8 +1161,7 @@
         strings::StrCat("replace_encapsulate_fdef_", name), fdef);
   }
 
-  TF_RETURN_IF_ERROR(library->RemoveFunction(name));
-  TF_RETURN_IF_ERROR(library->AddFunctionDef(fdef));
+  TF_RETURN_IF_ERROR(library->ReplaceFunction(name, fdef));
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/jit/jit_compilation_pass_registration.cc b/tensorflow/compiler/jit/jit_compilation_pass_registration.cc
index 4d49a14..c37b611 100644
--- a/tensorflow/compiler/jit/jit_compilation_pass_registration.cc
+++ b/tensorflow/compiler/jit/jit_compilation_pass_registration.cc
@@ -16,6 +16,7 @@
 #include "tensorflow/compiler/jit/build_xla_launch_ops_pass.h"
 #include "tensorflow/compiler/jit/encapsulate_subgraphs_pass.h"
 #include "tensorflow/compiler/jit/mark_for_compilation_pass.h"
+#include "tensorflow/compiler/jit/partially_decluster_pass.h"
 #include "tensorflow/core/common_runtime/optimization_registry.h"
 
 namespace tensorflow {
@@ -23,15 +24,18 @@
 REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, 10,
                       MarkForCompilationPass);
 
+REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, 20,
+                      PartiallyDeclusterPass);
+
 // The EncapsulateSubgraphs pass must run after the MarkForCompilationPass. We
 // also need to run it after the graph been rewritten to have _Send nodes added
 // for fetches. Before the _Send nodes are added, fetch nodes are identified by
 // name, and encapsulation might remove that node from the graph.
-REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, 20,
+REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, 30,
                       EncapsulateSubgraphsPass);
 
 // Must run after EncapsulateSubgraphsPass.
-REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, 30,
+REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, 40,
                       BuildXlaLaunchOpsPass);
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/kernels/BUILD b/tensorflow/compiler/jit/kernels/BUILD
index 00a6f40..8f78c11 100644
--- a/tensorflow/compiler/jit/kernels/BUILD
+++ b/tensorflow/compiler/jit/kernels/BUILD
@@ -16,6 +16,7 @@
         "//tensorflow/compiler/jit:xla_device",
         "//tensorflow/compiler/jit:xla_launch_util",
         "//tensorflow/compiler/tf2xla:common",
+        "//tensorflow/compiler/tf2xla:tf2xla_util",
         "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla/client:client_library",
diff --git a/tensorflow/compiler/jit/kernels/xla_launch_op.cc b/tensorflow/compiler/jit/kernels/xla_launch_op.cc
index b313d48..7f4370b 100644
--- a/tensorflow/compiler/jit/kernels/xla_launch_op.cc
+++ b/tensorflow/compiler/jit/kernels/xla_launch_op.cc
@@ -19,6 +19,7 @@
 #include "tensorflow/compiler/jit/xla_device.h"
 #include "tensorflow/compiler/jit/xla_launch_util.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/compiler/tf2xla/tf2xla_util.h"
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
@@ -199,7 +200,7 @@
   run_options.set_stream(stream);
   run_options.set_allocator(xla_allocator);
   run_options.set_intra_op_thread_pool(&ctx->eigen_cpu_device());
-  run_options.set_rng_seed(ctx->step_id());
+  run_options.set_rng_seed(GetXLARandomSeed());
   Env* env = Env::Default();
   auto start_time = env->NowMicros();
 
@@ -209,7 +210,8 @@
   auto elapsed = env->NowMicros() - start_time;
   VLOG(2) << "Elapsed time: " << elapsed << "us";
 
-  launch_context.PopulateOutputs(ctx, kernel, run_result.ConsumeValueOrDie());
+  OP_REQUIRES_OK(ctx, launch_context.PopulateOutputs(
+                          ctx, kernel, run_result.ConsumeValueOrDie()));
   VLOG(1) << "Done";
 }
 
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
index 45d4229..f4e179d 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
@@ -39,7 +39,9 @@
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/control_flow.h"
 #include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/public/version.h"
 
 namespace tensorflow {
@@ -65,6 +67,7 @@
   // XLA cluster so it can't implement the forward-tensor-ref semantic.  Leave
   // such nodes out of XLA clusters.
   if (HasForwardedRefInput(node)) {
+    VLOG(2) << "Rejecting " << node.name() << ": Identity with unsafe cast.";
     return false;
   }
 
@@ -84,14 +87,13 @@
 bool IsCompilableWhile(const Node& while_node,
                        const DeviceType& jit_device_type, int depth,
                        FunctionLibraryRuntime* lib_runtime) {
-  VLOG(2) << "Loop marking: " << while_node.type_string();
-
   const NameAttrList* name_attr;
   NodeDef call;
   Status status;
   status = GetNodeAttr(while_node.attrs(), "cond", &name_attr);
   if (!status.ok()) {
-    VLOG(2) << "Missing 'cond' attribute on While node.";
+    VLOG(2) << "Rejecting While " << while_node.name()
+            << ": missing 'cond' attribute on While node.";
     return false;
   }
   const string cond_func = name_attr->name();
@@ -99,12 +101,14 @@
   call.set_op(cond_func);
   *call.mutable_attr() = name_attr->attr();
   if (!IsCompilableCall(call, jit_device_type, depth + 1, lib_runtime)) {
-    VLOG(2) << "Can't compile loop condition: " << cond_func;
+    VLOG(2) << "Rejecting While " << while_node.name()
+            << ": can't compile loop condition: " << cond_func;
     return false;
   }
   status = GetNodeAttr(while_node.attrs(), "body", &name_attr);
   if (!status.ok()) {
-    VLOG(2) << "Missing 'body' attribute on While node.";
+    VLOG(2) << "Rejecting While " << while_node.name()
+            << ": missing 'body' attribute on While node.";
     return false;
   }
   const string body_func = name_attr->name();
@@ -112,10 +116,10 @@
   call.set_op(body_func);
   *call.mutable_attr() = name_attr->attr();
   if (!IsCompilableCall(call, jit_device_type, depth + 1, lib_runtime)) {
-    VLOG(2) << "Can't compile loop body: " << body_func;
+    VLOG(2) << "Rejecting While " << while_node.name()
+            << ": can't compile loop body: " << body_func;
     return false;
   }
-  VLOG(2) << "Loop is compilable.";
   return true;
 }
 
@@ -125,10 +129,9 @@
 bool IsCompilableCall(const NodeDef& call_def,
                       const DeviceType& jit_device_type, int depth,
                       FunctionLibraryRuntime* lib_runtime) {
-  VLOG(2) << "Function marking: " << call_def.op();
-
   if (depth > kMaxRecursionDepth) {
-    VLOG(2) << "Function depth limit exceeded";
+    VLOG(2) << "Rejecting " << call_def.op()
+            << ": function depth limit exceeded.";
     return false;
   }
 
@@ -136,9 +139,14 @@
   Status status =
       lib_runtime->Instantiate(call_def.op(), AttrSlice(call_def), &handle);
   if (!status.ok()) {
-    VLOG(2) << "Could not instantiate " << call_def.op() << ": " << status;
+    VLOG(2) << "Rejecting " << call_def.op()
+            << ": could not instantiate: " << status;
     return false;
   }
+
+  auto release_handle_on_return = gtl::MakeCleanup(
+      [&] { TF_CHECK_OK(lib_runtime->ReleaseHandle(handle)); });
+
   const FunctionBody* fbody = lib_runtime->GetFunctionBody(handle);
   CHECK(fbody);
   const FunctionDef& fdef = fbody->fdef;
@@ -150,7 +158,8 @@
     // tf2xla to translate the TF graph into XLA.  So we avoid this for now.
     //
     // TODO(b/36139787): Create a mechanism to set inlining hints.
-    VLOG(2) << "Can't compile noinline function: " << fdef.DebugString();
+    VLOG(2) << "Rejecting " << call_def.op()
+            << ": can't compile noinline function.";
     return false;
   }
 
@@ -164,23 +173,14 @@
     if (!HasXLAKernel(*node, jit_device_type) &&
         !IsCompilableCall(node->def(), jit_device_type, depth + 1,
                           lib_runtime)) {
-      VLOG(2) << "Function marking failed: unsupported op " << node->name()
-              << ": " << node->def().ShortDebugString();
+      VLOG(2) << "Rejecting " << call_def.op() << ": unsupported op "
+              << node->name() << ": " << node->def().ShortDebugString();
       return false;
     }
   }
-  VLOG(2) << "Function is compilable: " << call_def.op();
   return true;
 }
 
-// Tests whether `node` has a DT_RESOURCE typed input or output.
-bool HasResourceInputOrOutput(const Node& node) {
-  return std::find(node.input_types().begin(), node.input_types().end(),
-                   DT_RESOURCE) != node.input_types().end() ||
-         std::find(node.output_types().begin(), node.output_types().end(),
-                   DT_RESOURCE) != node.output_types().end();
-}
-
 // Returns true if the op can be decomposed into XLA ops for which
 // there are fusable elemental implementations.
 //
@@ -357,24 +357,27 @@
   }
   std::sort(sorted_nodes.begin(), sorted_nodes.end(), NodeComparatorID());
 
+  if (fuel >= std::numeric_limits<int64>::max() / 2) {
+    // The assumption is that if fuel started out as INT64_MAX, it will forever
+    // stay greater than INT64_MAX / 2.
+    VLOG(2) << "Starting fuel: infinity";
+  } else {
+    VLOG(2) << "Starting fuel: " << fuel;
+  }
+
   for (Node* node : sorted_nodes) {
-    VLOG(2) << "Fuel: " << fuel;
     if (fuel <= 0) {
-      VLOG(2)
+      VLOG(1)
           << "Hit fuel limit; not marking any remaining ops as clusterable.";
       break;
     }
 
-    VLOG(2) << "FindCompilationCandidates(): Processing "
-            << node->DebugString();
-
     DeviceType device_type("");
     TF_RETURN_IF_ERROR(
         DeviceToDeviceType(node->assigned_device_name(), &device_type));
 
     if (is_compilable_fn && !is_compilable_fn(node, device_type)) {
-      VLOG(2) << "Compilation rejected node: not compilable " << node->name()
-              << ": " << node->type_string();
+      // is_compilable_fn has already logged the reason if it returned false.
       continue;
     }
 
@@ -384,14 +387,14 @@
     DeviceType jit_device_type(registration->compilation_device_name);
     if (!HasXLAKernel(*node, jit_device_type) &&
         !IsCompilableCall(node->def(), jit_device_type, 0, lib_runtime)) {
-      VLOG(2) << "Compilation rejected node: unsupported op " << node->name()
-              << ": " << node->type_string();
+      VLOG(2) << "Rejecting " << node->name() << ": unsupported op "
+              << node->type_string();
       continue;
     }
     if (!registration->compile_resource_ops &&
         HasResourceInputOrOutput(*node)) {
-      VLOG(2) << "Compilation rejected node: resource input/output "
-              << node->name() << ": " << node->type_string();
+      VLOG(2) << "Rejecting: " << node->name() << ": resource input/output "
+              << node->type_string();
       continue;
     }
     if (node->type_string() == "While" &&
@@ -401,15 +404,11 @@
     // _Arg nodes in a top-level function represent feeds.
     // Do not compile them.
     if (node->type_string() == "_Arg") {
-      VLOG(2) << "Skipping jit compilation for '_Arg'-typed node "
-              << node->DebugString();
       continue;
     }
     // _Retval nodes in a top-level function represent fetches.
     // Do not compile them.
     if (node->type_string() == "_Retval") {
-      VLOG(2) << "Compilation rejected node: return value " << node->name()
-              << ": " << node->type_string();
       continue;
     }
     candidates->insert(node);
@@ -475,6 +474,7 @@
     const XlaOpRegistry::DeviceRegistration* registration;
     if (!XlaOpRegistry::GetCompilationDevice(device_type.type(),
                                              &registration)) {
+      VLOG(2) << "Rejecting " << node->name() << ": could not find JIT device.";
       return false;
     }
 
@@ -484,21 +484,36 @@
     // If there is a _XlaCompile annotation, use its value.
     bool compile = false;
     Status status = GetNodeAttr(node->attrs(), kXlaCompileAttr, &compile);
-    if (status.ok()) return compile;
+    if (status.ok()) {
+      if (!compile) {
+        VLOG(2) << "Rejecting " << node->name() << ": kXlaCompileAttr("
+                << kXlaCompileAttr << ") is false.";
+      }
+      return compile;
+    }
 
     status = fld->GetAttr(*node, kXlaCompileAttr, &compile);
-    if (status.ok()) return compile;
+    if (status.ok()) {
+      if (!compile) {
+        VLOG(2) << "Rejecting " << node->name() << ": kXlaCompileAttr("
+                << kXlaCompileAttr << ") on callee is false.";
+      }
+      return compile;
+    }
 
     // If inputs to `node` can have conflicting deadness (i.e. some are alive
     // and some are dead) then don't compile it.  XLA cannot represent the
     // deadness semantics of these nodes correctly and auto-clustering these
     // nodes can cause deadness to propagate to nodes that should be live.
     if (node->IsMerge() || deadness->HasInputsWithMismatchingDeadness(*node)) {
+      VLOG(2) << "Rejecting " << node->name() << ": mismatching deadness.";
       return false;
     }
 
     // Check for fusable ops only if requested.
     if (global_jit_level > 0 && fusion_only && !IsXlaFusable(node->def())) {
+      VLOG(2) << "Rejecting " << node->name()
+              << ": not fusable op but fusion_only enabled.";
       return false;
     }
 
@@ -506,12 +521,75 @@
     // Ignore enable_jit_by_default if global jit compilation for CPU
     // is explicitly requested via tf_xla_cpu_global_jit flag
     bool ignore_registration = cpu_global_jit && device_type == DEVICE_CPU;
-    return (ignore_registration || registration->enable_jit_by_default) &&
-           global_jit_level > 0;
+    bool should_compile =
+        (ignore_registration || registration->enable_jit_by_default) &&
+        global_jit_level > 0;
+    if (!should_compile) {
+      if (global_jit_level <= 0) {
+        VLOG(2) << "Rejecting " << node->name() << ": global jit disabled.";
+      } else {
+        VLOG(2) << "Rejecting " << node->name() << ": JIT for device disabled.";
+      }
+    }
+    return should_compile;
   };
   return RunImpl(options, is_compilable);
 }
 
+static string RatioToString(int numerator, int denominator) {
+  return strings::Printf("%d / %d (%.2f%%)", numerator, denominator,
+                         (100.0 * numerator) / denominator);
+}
+
+static void VLogClusteringSummary(const Graph& g) {
+  if (!VLOG_IS_ON(2)) {
+    return;
+  }
+
+  std::map<StringPiece, int> cluster_name_to_size;
+  std::map<StringPiece, std::map<StringPiece, int>>
+      cluster_name_to_op_histogram;
+  std::map<StringPiece, int> unclustered_op_histogram;
+  int clustered_node_count = 0;
+
+  for (Node* n : g.nodes()) {
+    gtl::optional<StringPiece> cluster_name = GetXlaClusterForNode(*n);
+    if (cluster_name) {
+      clustered_node_count++;
+      cluster_name_to_size[*cluster_name]++;
+      cluster_name_to_op_histogram[*cluster_name][n->type_string()]++;
+    } else {
+      unclustered_op_histogram[n->type_string()]++;
+    }
+  }
+
+  int unclustered_node_count = g.num_nodes() - clustered_node_count;
+
+  VLOG(2) << "*** Clustering info for graph of size " << g.num_nodes();
+  VLOG(2) << " Built " << cluster_name_to_size.size() << " clusters, size "
+          << RatioToString(clustered_node_count, g.num_nodes());
+
+  for (const auto& cluster_name_size_pair : cluster_name_to_size) {
+    StringPiece cluster_name = cluster_name_size_pair.first;
+    int size = cluster_name_size_pair.second;
+    VLOG(2) << "  " << cluster_name << " "
+            << RatioToString(size, g.num_nodes());
+    for (const auto& op_count_pair :
+         cluster_name_to_op_histogram[cluster_name]) {
+      VLOG(3) << "   " << op_count_pair.first << ": " << op_count_pair.second
+              << " instances";
+    }
+  }
+
+  if (!unclustered_op_histogram.empty()) {
+    VLOG(2) << " Unclustered nodes: "
+            << RatioToString(unclustered_node_count, g.num_nodes());
+    for (const auto& pair : unclustered_op_histogram) {
+      VLOG(3) << "  " << pair.first << ": " << pair.second << " instances";
+    }
+  }
+}
+
 // Is 'node' an operator that consumes only the shape of its input, not the
 // data itself?
 static bool IsShapeConsumerOp(const Node& node) {
@@ -700,6 +778,9 @@
     dump_graph::DumpGraphToFile("mark_for_compilation", **options.graph,
                                 options.flib_def);
   }
+
+  VLogClusteringSummary(*graph);
+
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.h b/tensorflow/compiler/jit/mark_for_compilation_pass.h
index e9acbfb..f1137af 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass.h
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass.h
@@ -40,20 +40,18 @@
 
   Status Run(const GraphOptimizationPassOptions& options) override;
 
-  // Run() just calls RunImpl() if --tf_xla_auto_jit is enabled. To run the pass
-  // unconditionally, call RunImpl() directly.
-  // is_compilable_fn, if set, is a predicate that must be true for a node to
-  // be compiled.
+ private:
   Status RunImpl(const GraphOptimizationPassOptions& options,
                  const std::function<bool(const Node*, const DeviceType&)>&
                      is_compilable_fn = {});
+
+  friend class MarkForCompilationPassTestHelper;
 };
 
 // Returns true iff 'ndef' is a call to a function that is compilable.  A
 // function is compilable iff every operator in the function body is
 // compilable.
 bool IsCompilable(FunctionLibraryRuntime* flr, const NodeDef& ndef);
-
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_COMPILER_JIT_MARK_FOR_COMPILATION_PASS_H_
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
index 2c5f4fb..a780d4a 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
@@ -13,7 +13,7 @@
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/jit/mark_for_compilation_pass.h"
+#include "tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.h"
 
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/ops/array_ops.h"
@@ -39,27 +39,6 @@
 REGISTER_OP("UncompilableNullary").Output("o: float");
 REGISTER_OP("UncompilableUnary").Input("a: float").Output("o: float");
 
-Status MarkForCompilation(std::unique_ptr<Graph>* graph,
-                          FunctionLibraryDefinition* flib_def) {
-  // Assign all nodes to the CPU device.
-  static const char* kCpuDevice = "/job:localhost/replica:0/task:0/cpu:0";
-  for (Node* n : (*graph)->nodes()) {
-    n->set_assigned_device_name(kCpuDevice);
-  }
-
-  GraphOptimizationPassOptions opt_options;
-  opt_options.graph = graph;
-  opt_options.flib_def = flib_def;
-  MarkForCompilationPass pass;
-  return pass.RunImpl(opt_options);
-}
-
-Status MarkForCompilation(std::unique_ptr<Graph>* graph) {
-  FunctionDefLibrary flib;
-  FunctionLibraryDefinition flib_def((*graph)->op_registry(), flib);
-  return MarkForCompilation(graph, &flib_def);
-}
-
 std::unordered_map<string, string> GetClusters(const Graph& graph) {
   std::unordered_map<string, string> ids;
   for (Node* node : graph.nodes()) {
@@ -88,7 +67,7 @@
     TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
   }
 
-  TF_ASSERT_OK(MarkForCompilation(&graph));
+  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
   auto clusters = GetClusters(*graph);
   EXPECT_EQ(4, clusters.size());
   EXPECT_EQ(clusters["B"], clusters["C"]);
@@ -113,7 +92,7 @@
     TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
   }
 
-  TF_ASSERT_OK(MarkForCompilation(&graph));
+  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
   auto clusters = GetClusters(*graph);
 
   EXPECT_TRUE(clusters.empty());
@@ -133,7 +112,7 @@
     TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
   }
 
-  TF_ASSERT_OK(MarkForCompilation(&graph));
+  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
   auto clusters = GetClusters(*graph);
 
   EXPECT_EQ(3, clusters.size());
@@ -156,7 +135,7 @@
     TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
   }
 
-  TF_ASSERT_OK(MarkForCompilation(&graph));
+  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
   auto clusters = GetClusters(*graph);
   EXPECT_TRUE(clusters.empty());
 }
@@ -177,7 +156,7 @@
     TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
   }
 
-  TF_ASSERT_OK(MarkForCompilation(&graph));
+  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
   auto clusters = GetClusters(*graph);
   EXPECT_FALSE(clusters.empty());
 }
@@ -206,7 +185,7 @@
     TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
   }
 
-  TF_ASSERT_OK(MarkForCompilation(&graph));
+  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
   auto clusters = GetClusters(*graph);
   EXPECT_EQ(3, clusters.size());  // Everything should be compiled.
 }
@@ -241,7 +220,8 @@
     TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
   }
 
-  TF_ASSERT_OK(MarkForCompilation(&graph, &flib_def));
+  TF_ASSERT_OK(
+      MarkForCompilationPassTestHelper::MarkForCompilation(&graph, &flib_def));
   auto clusters = GetClusters(*graph);
 
   EXPECT_EQ(2, clusters.size());
@@ -272,7 +252,7 @@
     ops::UnaryOp("Shape", d, builder.opts().WithName("E"));
     TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
   }
-  TF_ASSERT_OK(MarkForCompilation(&graph));
+  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
   auto clusters = GetClusters(*graph);
   EXPECT_EQ(0, clusters.size());  // Nothing should be compiled.
 }
@@ -359,7 +339,7 @@
     TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
   }
 
-  TF_ASSERT_OK(MarkForCompilation(&graph));
+  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
   auto clusters = GetClusters(*graph);
 
   EXPECT_EQ(2, clusters.size());
@@ -384,7 +364,7 @@
   std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
   TF_EXPECT_OK(root.ToGraph(graph.get()));
 
-  TF_ASSERT_OK(MarkForCompilation(&graph));
+  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
   auto clusters = GetClusters(*graph);
 
   // Nothing should be compiled. In particular, 'd' and 'c' must not be
@@ -411,7 +391,7 @@
     TF_CHECK_OK(GraphDefBuilderToGraph(builder, graph.get()));
   }
 
-  TF_ASSERT_OK(MarkForCompilation(&graph));
+  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
   auto clusters = GetClusters(*graph);
 
   // The computation is: C = A + relu(A)
@@ -442,7 +422,7 @@
     TF_CHECK_OK(GraphDefBuilderToGraph(builder, graph.get()));
   }
 
-  TF_ASSERT_OK(MarkForCompilation(&graph));
+  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
   auto clusters = GetClusters(*graph);
 
   // The computation is: D = relu(A) + (A @ relu(A))
@@ -472,7 +452,7 @@
     TF_CHECK_OK(GraphDefBuilderToGraph(builder, graph.get()));
   }
 
-  TF_ASSERT_OK(MarkForCompilation(&graph));
+  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
   auto clusters = GetClusters(*graph);
 
   // The computation is: C = A @ relu(A)
@@ -512,7 +492,7 @@
     ops::UnaryOp("Relu", d, builder.opts().WithName("E"));
     TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
   }
-  TF_ASSERT_OK(MarkForCompilation(&graph));
+  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
   auto clusters = GetClusters(*graph);
   EXPECT_EQ(0, clusters.size());  // Nothing should be compiled.
 }
@@ -542,7 +522,7 @@
 
   TF_EXPECT_OK(root.ToGraph(graph.get()));
 
-  Status status = MarkForCompilation(&graph);
+  Status status = MarkForCompilationPassTestHelper::MarkForCompilation(&graph);
   EXPECT_FALSE(status.ok());
   EXPECT_TRUE(str_util::StrContains(status.ToString(),
                                     "Edge from c to a would create a cycle.\n"
@@ -570,7 +550,7 @@
     TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
   }
 
-  TF_ASSERT_OK(MarkForCompilation(&graph));
+  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
   auto clusters = GetClusters(*graph);
 
   EXPECT_EQ(2, clusters.size());
@@ -588,7 +568,7 @@
     auto r = ops::_Retval(root.WithOpName("R"), c, 0);
   }
   TF_ASSERT_OK(root.ToGraph(graph.get()));
-  TF_ASSERT_OK(MarkForCompilation(&graph));
+  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
   auto clusters = GetClusters(*graph);
 
   EXPECT_TRUE(clusters.empty());
@@ -604,7 +584,7 @@
     auto r = ops::_Retval(root.WithOpName("R"), b, 0);
   }
   TF_ASSERT_OK(root.ToGraph(graph.get()));
-  TF_ASSERT_OK(MarkForCompilation(&graph));
+  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
   auto clusters = GetClusters(*graph);
 
   EXPECT_TRUE(clusters.empty());
@@ -618,7 +598,7 @@
     auto c = ops::Const(root.WithOpName("const"), 0.5f);
     c.node()->AddAttr(kXlaCompileAttr, true);
     TF_ASSERT_OK(root.ToGraph(graph.get()));
-    TF_ASSERT_OK(MarkForCompilation(&graph));
+    TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
     EXPECT_EQ(1, GetClusters(*graph).size());
   }
 
@@ -629,7 +609,7 @@
     auto c = ops::Const(root.WithOpName("const"), string("string"));
     c.node()->AddAttr(kXlaCompileAttr, true);
     TF_ASSERT_OK(root.ToGraph(graph.get()));
-    TF_ASSERT_OK(MarkForCompilation(&graph));
+    TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
     EXPECT_TRUE(GetClusters(*graph).empty());
   }
 }
@@ -644,7 +624,7 @@
   std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
 
   TF_ASSERT_OK(root.ToGraph(graph.get()));
-  TF_ASSERT_OK(MarkForCompilation(&graph));
+  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
 
   std::unordered_map<string, string> clusters = GetClusters(*graph);
 
@@ -667,7 +647,7 @@
   std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
 
   TF_ASSERT_OK(root.ToGraph(graph.get()));
-  TF_ASSERT_OK(MarkForCompilation(&graph));
+  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
 
   std::unordered_map<string, string> clusters = GetClusters(*graph);
 
@@ -699,7 +679,7 @@
   std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
 
   TF_ASSERT_OK(root.ToGraph(graph.get()));
-  TF_ASSERT_OK(MarkForCompilation(&graph));
+  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
 
   std::unordered_map<string, string> clusters = GetClusters(*graph);
 
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.cc
new file mode 100644
index 0000000..a84b82e
--- /dev/null
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.cc
@@ -0,0 +1,40 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.h"
+
+namespace tensorflow {
+// Pins every node in `graph` to the local CPU device, then runs the
+// MarkForCompilation pass via its private RunImpl (this helper is declared a
+// friend of the pass), using `flib_def` as the function library.
+/*static*/ Status MarkForCompilationPassTestHelper::MarkForCompilation(
+    std::unique_ptr<Graph>* graph, FunctionLibraryDefinition* flib_def) {
+  // Assign all nodes to the CPU device.
+  static const char* kCpuDevice = "/job:localhost/replica:0/task:0/cpu:0";
+  for (Node* n : (*graph)->nodes()) {
+    n->set_assigned_device_name(kCpuDevice);
+  }
+
+  GraphOptimizationPassOptions opt_options;
+  opt_options.graph = graph;
+  opt_options.flib_def = flib_def;
+  MarkForCompilationPass pass;
+  return pass.RunImpl(opt_options);
+}
+
+// Convenience overload: builds an empty function library from the graph's op
+// registry and delegates to the two-argument MarkForCompilation above.
+/*static*/ Status MarkForCompilationPassTestHelper::MarkForCompilation(
+    std::unique_ptr<Graph>* graph) {
+  FunctionDefLibrary flib;
+  FunctionLibraryDefinition flib_def((*graph)->op_registry(), flib);
+  return MarkForCompilation(graph, &flib_def);
+}
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.h b/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.h
new file mode 100644
index 0000000..b9a0531
--- /dev/null
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.h
@@ -0,0 +1,35 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_JIT_MARK_FOR_COMPILATION_PASS_TEST_HELPER_H_
+#define TENSORFLOW_COMPILER_JIT_MARK_FOR_COMPILATION_PASS_TEST_HELPER_H_
+
+#include "tensorflow/compiler/jit/mark_for_compilation_pass.h"
+
+namespace tensorflow {
+// Test-only facade for MarkForCompilationPass.  The pass declares this class
+// a friend, so tests can invoke its private RunImpl unconditionally.
+class MarkForCompilationPassTestHelper {
+ public:
+  // Runs the MarkForCompilation pass on `graph` after assigning all nodes in
+  // `graph` to the CPU device.  To make testing easier, ignores device
+  // registration, _XlaCompile attributes, input deadness and global jit level.
+  static Status MarkForCompilation(std::unique_ptr<Graph>* graph,
+                                   FunctionLibraryDefinition* flib_def);
+
+  // Like `MarkForCompilation` but creates `flib_def` from the op registry.
+  static Status MarkForCompilation(std::unique_ptr<Graph>* graph);
+};
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_JIT_MARK_FOR_COMPILATION_PASS_TEST_HELPER_H_
diff --git a/tensorflow/compiler/jit/partially_decluster_pass.cc b/tensorflow/compiler/jit/partially_decluster_pass.cc
new file mode 100644
index 0000000..68ead39
--- /dev/null
+++ b/tensorflow/compiler/jit/partially_decluster_pass.cc
@@ -0,0 +1,177 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/jit/partially_decluster_pass.h"
+#include "tensorflow/compiler/jit/xla_cluster_util.h"
+#include "tensorflow/core/framework/memory_types.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
+
+namespace tensorflow {
+namespace {
+// Populates `result` with the clustered nodes in `post_order` that should be
+// cloned outside their cluster: nodes with at least one data edge that
+// crosses the cluster boundary AND would otherwise force a device-to-host
+// copy because the consumer expects the value in host memory.  Nodes with
+// resource inputs/outputs are never selected, since duplicating their side
+// effects would be unsafe.
+Status FindNodesToDecluster(const Graph& graph, gtl::FlatSet<Node*>* result,
+                            gtl::ArraySlice<Node*> post_order) {
+  // Find nodes that have at least one user outside their cluster that expects
+  // hostmem output.  These nodes should be cloned to outside the cluster to
+  // avoid the device-host copy we'd otherwise need.
+
+  MemoryTypeVector input_mtypes, output_mtypes;
+
+  for (Node* n : post_order) {
+    gtl::optional<StringPiece> from_cluster = GetXlaClusterForNode(*n);
+    if (!from_cluster) {
+      continue;
+    }
+
+    // We assume the only XLA-auto-clusterable operations with side effects are
+    // resource variable updates.  We can't execute these twice.
+    if (HasResourceInputOrOutput(*n)) {
+      continue;
+    }
+
+    DeviceType device_type("");
+    TF_RETURN_IF_ERROR(
+        DeviceToDeviceType(n->assigned_device_name(), &device_type));
+    TF_RETURN_IF_ERROR(MemoryTypesForNode(graph.op_registry(), device_type,
+                                          n->def(), &input_mtypes,
+                                          &output_mtypes));
+    for (const Edge* e : n->out_edges()) {
+      Node* dst = e->dst();
+
+      if (e->IsControlEdge()) {
+        continue;
+      }
+
+      bool edge_incurs_extra_device_to_host_copy;
+      if (output_mtypes[e->src_output()] == DEVICE_MEMORY) {
+        // If the output of the *TensorFlow* operation is in DEVICE_MEMORY then
+        // keep the node clustered -- XLA will also produce the output in device
+        // memory and we will get some benefit from clustering.
+        edge_incurs_extra_device_to_host_copy = false;
+      } else {
+        MemoryTypeVector dst_input_mtypes, dst_output_mtypes;
+        DeviceType dst_device_type("");
+        TF_RETURN_IF_ERROR(
+            DeviceToDeviceType(dst->assigned_device_name(), &dst_device_type));
+        // Bug fix: query the *destination* node's memory types under the
+        // destination's device type.  The original passed `device_type`
+        // (which belongs to `n`) even though `dst_device_type` had just been
+        // computed and was otherwise unused.
+        TF_RETURN_IF_ERROR(MemoryTypesForNode(graph.op_registry(),
+                                              dst_device_type, dst->def(),
+                                              &dst_input_mtypes,
+                                              &dst_output_mtypes));
+        edge_incurs_extra_device_to_host_copy =
+            dst_input_mtypes[e->dst_input()] == HOST_MEMORY;
+      }
+
+      if (!edge_incurs_extra_device_to_host_copy) {
+        continue;
+      }
+
+      // Check if `dst` is in a different cluster, unclustered, or about to be
+      // partially declustered (here we rely on the post-order traversal order).
+      // If yes, decluster `n` to avoid the device-to-host memcpy.
+      gtl::optional<StringPiece> dst_cluster =
+          result->count(dst) ? gtl::nullopt : GetXlaClusterForNode(*dst);
+      if (from_cluster != dst_cluster) {
+        CHECK(result->insert(n).second);
+        break;
+      }
+    }
+  }
+  return Status::OK();
+}
+
+// Clones the clustered node `n` to a new, unclustered node named
+// "<n>/declustered" and rewires every data edge that leaves `n`'s cluster to
+// the clone.  Consumers inside the cluster keep reading from the original.
+Status PartiallyDeclusterNode(Graph* graph, Node* n) {
+  StringPiece cluster_name = *GetXlaClusterForNode(*n);
+  gtl::InlinedVector<const Edge*, 6> out_edges_to_clone;
+  for (const Edge* out_edge : n->out_edges()) {
+    if (out_edge->IsControlEdge()) {
+      continue;
+    }
+
+    Node* dst = out_edge->dst();
+    gtl::optional<StringPiece> dst_cluster_name = GetXlaClusterForNode(*dst);
+    if (dst_cluster_name != cluster_name) {
+      out_edges_to_clone.push_back(out_edge);
+    }
+  }
+
+  // Callers only pass nodes that have at least one boundary-crossing edge.
+  CHECK(!out_edges_to_clone.empty()) << n->DebugString();
+
+  NodeDef ndef = n->def();
+  ndef.set_name(strings::StrCat(n->name(), "/declustered"));
+  RemoveFromXlaCluster(&ndef);
+  Status s;
+  Node* cloned_node = graph->AddNode(ndef, &s);
+  // Bug fix: check `s` *before* touching `cloned_node`.  The original set the
+  // assigned device name first, which dereferences a null pointer when
+  // AddNode fails.
+  TF_RETURN_IF_ERROR(s);
+  cloned_node->set_assigned_device_name(n->assigned_device_name());
+
+  // Give the clone the same inputs (including control edges) as the original.
+  for (const Edge* in_edge : n->in_edges()) {
+    graph->AddEdge(in_edge->src(), in_edge->src_output(), cloned_node,
+                   in_edge->dst_input());
+  }
+
+  // Move only the boundary-crossing output edges over to the clone.
+  for (const Edge* out_edge_to_clone : out_edges_to_clone) {
+    graph->AddEdge(cloned_node, out_edge_to_clone->src_output(),
+                   out_edge_to_clone->dst(), out_edge_to_clone->dst_input());
+    graph->RemoveEdge(out_edge_to_clone);
+  }
+
+  return Status::OK();
+}
+}  // namespace
+
+// Runs the pass: finds clustered nodes whose cross-cluster data edges would
+// force a device-to-host copy, clones each of them outside its cluster, and
+// finally verifies that a fixed point was reached.
+Status PartiallyDeclusterPass::Run(
+    const GraphOptimizationPassOptions& options) {
+  // NB!  In this pass we assume the only XLA-auto-clusterable operations that
+  // may have side effects are resource variable operations so we don't cluster
+  // those.  The pass will have to be updated if this assumption becomes
+  // invalid.
+
+  Graph* graph = options.graph->get();
+
+  // When deciding whether to decluster a particular node, we base our decision
+  // on if we've decided that some of its consumers have to be declustered too.
+  // Iterating the graph in post-order guarantees that consumers have been
+  // visited before producers.
+  std::vector<Node*> post_order;
+  GetPostOrder(*graph, &post_order, /*stable_comparator=*/NodeComparatorName(),
+               /*edge_filter=*/[](const Edge& edge) {
+                 // Ignore back-edges so the traversal terminates on loops.
+                 return !edge.src()->IsNextIteration();
+               });
+
+  gtl::FlatSet<Node*> nodes_to_partially_decluster;
+  TF_RETURN_IF_ERROR(FindNodesToDecluster(
+      **options.graph, &nodes_to_partially_decluster, post_order));
+
+  if (VLOG_IS_ON(3)) {
+    for (Node* n : post_order) {
+      if (nodes_to_partially_decluster.count(n)) {
+        VLOG(3) << n->DebugString();
+      }
+    }
+  }
+
+  for (Node* n : post_order) {
+    if (nodes_to_partially_decluster.count(n)) {
+      TF_RETURN_IF_ERROR(PartiallyDeclusterNode(graph, n));
+    }
+  }
+
+  // Sanity check: re-running the analysis on the rewritten graph must find
+  // nothing further to decluster.
+  nodes_to_partially_decluster.clear();
+  TF_RETURN_IF_ERROR(FindNodesToDecluster(
+      **options.graph, &nodes_to_partially_decluster, post_order));
+  CHECK(nodes_to_partially_decluster.empty());
+
+  return Status::OK();
+}
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/partially_decluster_pass.h b/tensorflow/compiler/jit/partially_decluster_pass.h
new file mode 100644
index 0000000..6949b50
--- /dev/null
+++ b/tensorflow/compiler/jit/partially_decluster_pass.h
@@ -0,0 +1,58 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_JIT_PARTIALLY_DECLUSTER_PASS_H_
+#define TENSORFLOW_COMPILER_JIT_PARTIALLY_DECLUSTER_PASS_H_
+
+#include "tensorflow/core/common_runtime/optimization_registry.h"
+
+namespace tensorflow {
+
+// Clones nodes from within a cluster to outside the cluster if profitable.
+//
+// Today this only clones to avoid device-to-host copies, but in the future we
+// may consider other reasons to clone.  For instance, we convert this:
+//
+//         .....
+//           |
+//           v
+//      A_Clustered ====> C_Unclustered
+//           |
+//           v
+//      B_Clustered
+//
+// to:
+//
+//         .....
+//          | |
+//          | +-------------+
+//          |               |
+//          v               v
+//      A_Clustered   A_Unclustered ====> C_Unclustered
+//           |
+//           v
+//      B_Clustered
+//
+// where the ===> arrow has a hostmem source and destination and would entail a
+// device to host copy if the source and destination were not in the same XLA
+// cluster.
+class PartiallyDeclusterPass : public GraphOptimizationPass {
+ public:
+  Status Run(const GraphOptimizationPassOptions& options) override;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_JIT_PARTIALLY_DECLUSTER_PASS_H_
diff --git a/tensorflow/compiler/jit/partially_decluster_pass_test.cc b/tensorflow/compiler/jit/partially_decluster_pass_test.cc
new file mode 100644
index 0000000..08a956e
--- /dev/null
+++ b/tensorflow/compiler/jit/partially_decluster_pass_test.cc
@@ -0,0 +1,284 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/jit/partially_decluster_pass.h"
+
+#include "tensorflow/cc/framework/ops.h"
+#include "tensorflow/cc/ops/array_ops.h"
+#include "tensorflow/cc/ops/control_flow_ops_internal.h"
+#include "tensorflow/cc/ops/function_ops.h"
+#include "tensorflow/cc/ops/sendrecv_ops.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/compiler/jit/defs.h"
+#include "tensorflow/compiler/jit/xla_cluster_util.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/graph/algorithm.h"
+#include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/graph/graph_def_builder.h"
+#include "tensorflow/core/graph/graph_def_builder_util.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+// Fake ops used to build the test graphs below; their kernels never execute.
+REGISTER_OP("FakeNullary").Output("out: float");
+
+// Per the FakeBinary kernel registration below, "host_in"/"host_out" are
+// constrained to host memory while "device_in"/"device_out" stay in device
+// memory.
+REGISTER_OP("FakeBinary")
+    .Input("host_in: float")
+    .Input("device_in: float")
+    .Output("host_out: float")
+    .Output("device_out: float");
+
+REGISTER_OP("FakeResourceVar").Output("out: resource");
+
+REGISTER_OP("FakeResourceUpdate")
+    .Input("in: resource")
+    .Output("out: resource")
+    .Output("something_else: float");
+
+// Placeholder kernel for FakeBinary: the tests only analyze graph structure,
+// so Compute CHECK-fails if it is ever invoked.
+class FakeBinaryOp : public OpKernel {
+ public:
+  explicit FakeBinaryOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+  void Compute(OpKernelContext* ctx) override { CHECK(false); }
+};
+
+// Placeholder kernel for the resource-update op; like FakeBinaryOp, it must
+// never actually run.
+class FakeResourceVarUpdateOp : public OpKernel {
+ public:
+  explicit FakeResourceVarUpdateOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* ctx) override { CHECK(false); }
+};
+
+// Kernel registrations supply the host/device memory constraints that
+// MemoryTypesForNode reads during the pass.
+REGISTER_KERNEL_BUILDER(Name("FakeBinary")
+                            .Device(DEVICE_CPU)
+                            .HostMemory("host_in")
+                            .HostMemory("host_out"),
+                        FakeBinaryOp);
+
+// Bug fix: the op above is registered as "FakeResourceUpdate" (and the test
+// graphs use that name), but this kernel was registered under
+// "FakeResourceVarUpdate" -- a nonexistent op -- so its
+// HostMemory("something_else") constraint never took effect.
+REGISTER_KERNEL_BUILDER(Name("FakeResourceUpdate")
+                            .Device(DEVICE_CPU)
+                            .HostMemory("something_else"),
+                        FakeResourceVarUpdateOp);
+
+// Test driver: fixes up source/sink edges, pins every node to the local CPU
+// device, and runs PartiallyDeclusterPass over `graph` in place.
+Status PartiallyDecluster(std::unique_ptr<Graph>* graph) {
+  FixupSourceAndSinkEdges(graph->get());
+  // Assign all nodes to the CPU device.
+  static const char* kCpuDevice = "/job:localhost/replica:0/task:0/cpu:0";
+  for (Node* n : (*graph)->nodes()) {
+    n->set_assigned_device_name(kCpuDevice);
+  }
+
+  GraphOptimizationPassOptions opt_options;
+  opt_options.graph = graph;
+  PartiallyDeclusterPass pass;
+  return pass.Run(opt_options);
+}
+
+// Linear scan over `graph`; returns the first node whose name equals `name`,
+// or nullptr when no such node exists.
+const Node* FindNodeByName(const Graph& graph, const string& name) {
+  const Node* found = nullptr;
+  for (const Node* candidate : graph.nodes()) {
+    if (candidate->name() == name) {
+      found = candidate;
+      break;
+    }
+  }
+  return found;
+}
+
+// Appends to `inputs` the source node of every in-edge of the node named
+// `node_name`, sorted by node name so tests can compare deterministically.
+// Returns false if no node with that name exists in `graph`.
+bool GetInputsForNode(const Graph& graph, const string& node_name,
+                      std::vector<Node*>* inputs) {
+  const Node* node = FindNodeByName(graph, node_name);
+  if (node == nullptr) {
+    return false;
+  }
+  for (const Edge* e : node->in_edges()) {
+    inputs->push_back(e->src());
+  }
+  std::sort(inputs->begin(), inputs->end(), NodeComparatorName());
+  return true;
+}
+
+// ClusteredProducer (cluster_0) feeds both an unclustered consumer (via its
+// output 0, the host-memory "host_out") and an in-cluster consumer.  The
+// pass should clone the producer outside the cluster for the unclustered
+// consumer while the clustered consumer keeps reading the original node.
+TEST(PartiallyDeclusterPassTest, ClusteredAndUnclustered) {
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  {
+    GraphDefBuilder builder(GraphDefBuilder::kFailImmediately);
+    Node* input =
+        ops::SourceOp("FakeNullary", builder.opts().WithName("Input"));
+    Node* clustered_producer =
+        ops::BinaryOp("FakeBinary", input, input,
+                      builder.opts().WithName("ClusteredProducer"));
+    ops::BinaryOp("FakeBinary", clustered_producer, input,
+                  builder.opts().WithName("UnclusteredConsumer"));
+    Node* clustered_consumer =
+        ops::BinaryOp("FakeBinary", {clustered_producer, 1}, input,
+                      builder.opts().WithName("ClusteredConsumer"));
+    clustered_producer->AddAttr(kXlaClusterAttr, "cluster_0");
+    clustered_consumer->AddAttr(kXlaClusterAttr, "cluster_0");
+    TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
+  }
+
+  TF_ASSERT_OK(PartiallyDecluster(&graph));
+  // The unclustered consumer must now read from the "/declustered" clone.
+  std::vector<Node*> unclustered_consumer_inputs;
+  ASSERT_TRUE(GetInputsForNode(*graph, "UnclusteredConsumer",
+                               &unclustered_consumer_inputs));
+  ASSERT_EQ(unclustered_consumer_inputs.size(), 2);
+  EXPECT_EQ(unclustered_consumer_inputs[0]->name(),
+            "ClusteredProducer/declustered");
+  EXPECT_EQ(unclustered_consumer_inputs[1]->name(), "Input");
+
+  // The in-cluster consumer still reads from the original producer.
+  std::vector<Node*> clustered_consumer_inputs;
+  ASSERT_TRUE(GetInputsForNode(*graph, "ClusteredConsumer",
+                               &clustered_consumer_inputs));
+  ASSERT_EQ(clustered_consumer_inputs.size(), 2);
+  EXPECT_EQ(clustered_consumer_inputs[0]->name(), "ClusteredProducer");
+  EXPECT_EQ(clustered_consumer_inputs[1]->name(), "Input");
+}
+
+// A consumer in a *different* cluster (cluster_1) reads the producer's
+// host-memory output 0, so the pass should clone the producer out of
+// cluster_0 and rewire the cross-cluster consumer to the clone.
+TEST(PartiallyDeclusterPassTest, DifferentClusters) {
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  {
+    GraphDefBuilder builder(GraphDefBuilder::kFailImmediately);
+    Node* input =
+        ops::SourceOp("FakeNullary", builder.opts().WithName("Input"));
+    Node* clustered_producer =
+        ops::BinaryOp("FakeBinary", input, input,
+                      builder.opts().WithName("ClusteredProducer"));
+    Node* consumer_in_different_cluster =
+        ops::BinaryOp("FakeBinary", clustered_producer, input,
+                      builder.opts().WithName("ConsumerInDifferentCluster"));
+    Node* clustered_consumer =
+        ops::BinaryOp("FakeBinary", input, {clustered_producer, 1},
+                      builder.opts().WithName("ClusteredConsumer"));
+    clustered_producer->AddAttr(kXlaClusterAttr, "cluster_0");
+    clustered_consumer->AddAttr(kXlaClusterAttr, "cluster_0");
+    consumer_in_different_cluster->AddAttr(kXlaClusterAttr, "cluster_1");
+    TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
+  }
+
+  TF_ASSERT_OK(PartiallyDecluster(&graph));
+  std::vector<Node*> inputs;
+  ASSERT_TRUE(GetInputsForNode(*graph, "ConsumerInDifferentCluster", &inputs));
+  ASSERT_EQ(inputs.size(), 2);
+  EXPECT_EQ(inputs[0]->name(), "ClusteredProducer/declustered");
+  EXPECT_EQ(inputs[1]->name(), "Input");
+}
+
+// The cross-cluster consumer receives the producer's value through its
+// second ("device_in", device-memory) input, so no device-to-host copy is at
+// stake and the producer must NOT be cloned.
+TEST(PartiallyDeclusterPassTest, DontDeclusterIfUserIsDeviceMem) {
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  {
+    GraphDefBuilder builder(GraphDefBuilder::kFailImmediately);
+    Node* input =
+        ops::SourceOp("FakeNullary", builder.opts().WithName("Input"));
+    Node* clustered_producer =
+        ops::BinaryOp("FakeBinary", input, input,
+                      builder.opts().WithName("ClusteredProducer"));
+    // The first input is hostmem and the second input is devicemem.
+    Node* consumer_in_different_cluster =
+        ops::BinaryOp("FakeBinary", input, clustered_producer,
+                      builder.opts().WithName("ConsumerInDifferentCluster"));
+    Node* clustered_consumer =
+        ops::BinaryOp("FakeBinary", input, {clustered_producer, 1},
+                      builder.opts().WithName("ClusteredConsumer"));
+    clustered_producer->AddAttr(kXlaClusterAttr, "cluster_0");
+    clustered_consumer->AddAttr(kXlaClusterAttr, "cluster_0");
+    consumer_in_different_cluster->AddAttr(kXlaClusterAttr, "cluster_1");
+    TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
+  }
+
+  TF_ASSERT_OK(PartiallyDecluster(&graph));
+  // The consumer still reads the original (un-cloned) producer.
+  std::vector<Node*> inputs;
+  ASSERT_TRUE(GetInputsForNode(*graph, "ConsumerInDifferentCluster", &inputs));
+  ASSERT_EQ(inputs.size(), 2);
+  EXPECT_EQ(inputs[0]->name(), "ClusteredProducer");
+  EXPECT_EQ(inputs[1]->name(), "Input");
+}
+
+TEST(PartiallyDeclusterPassTest, DontDuplicateResourceVarOps) {
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  {
+    GraphDefBuilder builder(GraphDefBuilder::kFailImmediately);
+    Node* input =
+        ops::SourceOp("FakeNullary", builder.opts().WithName("Input"));
+    Node* resource_var = ops::SourceOp("FakeResourceVar",
+                                       builder.opts().WithName("ResourceVar"));
+    Node* clustered_producer =
+        ops::UnaryOp("FakeResourceUpdate", resource_var,
+                     builder.opts().WithName("ClusteredProducer"));
+    Node* consumer_in_different_cluster =
+        ops::BinaryOp("FakeBinary", {clustered_producer, 1}, input,
+                      builder.opts().WithName("ConsumerInDifferentCluster"));
+    Node* clustered_consumer =
+        ops::BinaryOp("FakeBinary", input, {clustered_producer, 1},
+                      builder.opts().WithName("ClusteredConsumer"));
+    clustered_producer->AddAttr(kXlaClusterAttr, "cluster_0");
+    clustered_consumer->AddAttr(kXlaClusterAttr, "cluster_0");
+    consumer_in_different_cluster->AddAttr(kXlaClusterAttr, "cluster_1");
+    TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
+  }
+
+  TF_ASSERT_OK(PartiallyDecluster(&graph));
+  std::vector<Node*> inputs;
+  ASSERT_TRUE(GetInputsForNode(*graph, "ConsumerInDifferentCluster", &inputs));
+  ASSERT_EQ(inputs.size(), 2);
+  EXPECT_EQ(inputs[0]->name(), "ClusteredProducer");
+  EXPECT_EQ(inputs[1]->name(), "Input");
+}
+
+TEST(PartiallyDeclusterPassTest, DeclusterDependentNodes) {
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  {
+    GraphDefBuilder builder(GraphDefBuilder::kFailImmediately);
+    Node* input =
+        ops::SourceOp("FakeNullary", builder.opts().WithName("Input"));
+    Node* clustered_producer_0 =
+        ops::BinaryOp("FakeBinary", input, input,
+                      builder.opts().WithName("ClusteredProducer0"));
+    Node* clustered_producer_1 =
+        ops::BinaryOp("FakeBinary", clustered_producer_0, input,
+                      builder.opts().WithName("ClusteredProducer1"));
+    ops::BinaryOp("FakeBinary", clustered_producer_1, input,
+                  builder.opts().WithName("UnclusteredConsumer"));
+    Node* clustered_consumer =
+        ops::BinaryOp("FakeBinary", {clustered_producer_1, 1}, input,
+                      builder.opts().WithName("ClusteredConsumer"));
+    clustered_producer_0->AddAttr(kXlaClusterAttr, "cluster_0");
+    clustered_producer_1->AddAttr(kXlaClusterAttr, "cluster_0");
+    clustered_consumer->AddAttr(kXlaClusterAttr, "cluster_0");
+    TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
+  }
+
+  TF_ASSERT_OK(PartiallyDecluster(&graph));
+  std::vector<Node*> unclustered_consumer_inputs, declustered_producer_1_inputs;
+
+  ASSERT_TRUE(GetInputsForNode(*graph, "UnclusteredConsumer",
+                               &unclustered_consumer_inputs));
+  ASSERT_EQ(unclustered_consumer_inputs.size(), 2);
+  EXPECT_EQ(unclustered_consumer_inputs[0]->name(),
+            "ClusteredProducer1/declustered");
+  EXPECT_EQ(unclustered_consumer_inputs[1]->name(), "Input");
+
+  ASSERT_TRUE(GetInputsForNode(*graph, "ClusteredProducer1/declustered",
+                               &declustered_producer_1_inputs));
+  ASSERT_EQ(declustered_producer_1_inputs.size(), 2);
+  EXPECT_EQ(declustered_producer_1_inputs[0]->name(),
+            "ClusteredProducer0/declustered");
+  EXPECT_EQ(declustered_producer_1_inputs[1]->name(), "Input");
+}
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/xla_cluster_util.cc b/tensorflow/compiler/jit/xla_cluster_util.cc
index a5628b1..0a025a1 100644
--- a/tensorflow/compiler/jit/xla_cluster_util.cc
+++ b/tensorflow/compiler/jit/xla_cluster_util.cc
@@ -185,4 +185,26 @@
   return Status::OK();
 }
 
+gtl::optional<StringPiece> GetXlaClusterForNode(const Node& node) {
+  const AttrValue* attr_value = node.attrs().Find(kXlaClusterAttr);
+  if (attr_value == nullptr) {
+    return gtl::nullopt;
+  }
+  Status s = AttrValueHasType(*attr_value, "string");
+  if (!s.ok()) {
+    return gtl::nullopt;
+  }
+  return attr_value->s();
+}
+
+bool HasResourceInputOrOutput(const Node& node) {
+  return std::find(node.input_types().begin(), node.input_types().end(),
+                   DT_RESOURCE) != node.input_types().end() ||
+         std::find(node.output_types().begin(), node.output_types().end(),
+                   DT_RESOURCE) != node.output_types().end();
+}
+
+void RemoveFromXlaCluster(NodeDef* node_def) {
+  node_def->mutable_attr()->erase(kXlaClusterAttr);
+}
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/xla_cluster_util.h b/tensorflow/compiler/jit/xla_cluster_util.h
index bcce082..bff76da 100644
--- a/tensorflow/compiler/jit/xla_cluster_util.h
+++ b/tensorflow/compiler/jit/xla_cluster_util.h
@@ -20,6 +20,7 @@
 
 #include "tensorflow/compiler/jit/graphcycles/graphcycles.h"
 #include "tensorflow/core/graph/algorithm.h"
+#include "tensorflow/core/lib/gtl/optional.h"
 
 namespace tensorflow {
 
@@ -44,6 +45,16 @@
 // the enclosing graph.
 Status CreateCycleDetectionGraph(const Graph* graph, GraphCycles* cycles);
 
+// Returns the XLA cluster in which `node` is placed if it is in an XLA cluster,
+// otherwise returns nullopt.
+gtl::optional<StringPiece> GetXlaClusterForNode(const Node& node);
+
+// Removes `node_def` from its XLA cluster (by clearing its _XlaCluster
+void RemoveFromXlaCluster(NodeDef* node_def);
+
+// Returns true if `node` has a DT_RESOURCE typed input or output.
+bool HasResourceInputOrOutput(const Node& node);
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_COMPILER_JIT_XLA_CLUSTER_UTIL_H_
diff --git a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc
index d288d37..dd84fb3 100644
--- a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc
+++ b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc
@@ -18,6 +18,7 @@
 #include "tensorflow/compiler/jit/xla_compile_on_demand_op.h"
 #include "tensorflow/compiler/jit/xla_device.h"
 #include "tensorflow/compiler/jit/xla_launch_util.h"
+#include "tensorflow/compiler/tf2xla/tf2xla_util.h"
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 
@@ -71,13 +72,14 @@
   run_options.set_stream(stream);
   run_options.set_allocator(client->backend().memory_allocator());
   run_options.set_intra_op_thread_pool(&ctx->eigen_cpu_device());
-  run_options.set_rng_seed(ctx->step_id());
+  run_options.set_rng_seed(GetXLARandomSeed());
 
   xla::StatusOr<xla::ScopedShapedBuffer> run_result =
       executable->Run(launch_context.arguments(), run_options);
   TF_RETURN_IF_ERROR(run_result.status());
 
-  launch_context.PopulateOutputs(ctx, result, run_result.ConsumeValueOrDie());
+  TF_RETURN_IF_ERROR(launch_context.PopulateOutputs(
+      ctx, result, run_result.ConsumeValueOrDie()));
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/jit/xla_device.cc b/tensorflow/compiler/jit/xla_device.cc
index 4ddeaeb..2a2691a 100644
--- a/tensorflow/compiler/jit/xla_device.cc
+++ b/tensorflow/compiler/jit/xla_device.cc
@@ -26,6 +26,7 @@
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
+#include "tensorflow/compiler/xla/service/stream_pool.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
@@ -216,6 +217,8 @@
       transfer_as_literal_(transfer_as_literal),
       shape_representation_fn_(shape_representation_fn) {
   VLOG(1) << "Created XLA device " << jit_device_name << " " << this;
+  thread_pool_.reset(new thread::ThreadPool(options.env, "xla_device",
+                                            /*num_threads=*/1));
 }
 
 XlaDevice::~XlaDevice() {
@@ -262,10 +265,12 @@
 
 Status XlaDevice::EnsureStreamOkLocked(xla::Backend* backend,
                                        const string& name,
-                                       xla::StreamPool::Ptr* stream,
+                                       std::shared_ptr<se::Stream>* stream,
                                        bool* stream_was_changed) {
   if (!(*stream) || !(*stream)->ok()) {
-    TF_ASSIGN_OR_RETURN(*stream, backend->BorrowStream(device_ordinal_));
+    xla::StreamPool::Ptr ptr;
+    TF_ASSIGN_OR_RETURN(ptr, backend->BorrowStream(device_ordinal_));
+    *stream = std::shared_ptr<se::Stream>(std::move(ptr));
     VLOG(1) << "XlaDevice " << this << " new " << name << " "
             << (*stream)->DebugStreamPointers();
     *stream_was_changed = true;
@@ -281,8 +286,8 @@
   TF_RETURN_IF_ERROR(EnsureStreamOkLocked(backend, "stream", &stream_,
                                           &need_new_device_context));
 
-  se::Stream* host_to_device_stream = stream_.get();
-  se::Stream* device_to_host_stream = stream_.get();
+  std::shared_ptr<se::Stream> host_to_device_stream = stream_;
+  std::shared_ptr<se::Stream> device_to_host_stream = stream_;
   if (use_multiple_streams_) {
     TF_RETURN_IF_ERROR(EnsureStreamOkLocked(backend, "host_to_device_stream",
                                             &host_to_device_stream_,
@@ -290,8 +295,8 @@
     TF_RETURN_IF_ERROR(EnsureStreamOkLocked(backend, "device_to_host_stream",
                                             &device_to_host_stream_,
                                             &need_new_device_context));
-    host_to_device_stream = host_to_device_stream_.get();
-    device_to_host_stream = device_to_host_stream_.get();
+    host_to_device_stream = host_to_device_stream_;
+    device_to_host_stream = device_to_host_stream_;
   }
 
   if (!need_new_device_context) {
@@ -304,9 +309,13 @@
   if (device_context_) {
     device_context_->Unref();
   }
+  // The XlaDeviceContext keeps a reference count to the streams, and the
+  // XlaDeviceContext remains live for the duration of an Executor run. This
+  // ensures that the streams remain live for the duration of a run, even if
+  // an error is encountered and the streams are replaced with new ones.
   device_context_ = new XlaDeviceContext(
-      stream_.get(), host_to_device_stream, device_to_host_stream, client(),
-      transfer_as_literal_, shape_representation_fn_);
+      stream_, host_to_device_stream, device_to_host_stream, client(),
+      transfer_as_literal_, shape_representation_fn_, thread_pool_.get());
   VLOG(1) << "XlaDevice " << this << " new XlaDeviceContext "
           << device_context_;
 
@@ -371,6 +380,22 @@
   op_kernel->ComputeAsync(context, done);
 }
 
+Status XlaDevice::Sync() {
+  VLOG(1) << "XlaDevice::Sync";
+  std::shared_ptr<se::Stream> stream;
+  {
+    mutex_lock lock(mu_);
+    stream = stream_;
+  }
+  if (!stream) return Status::OK();
+
+  if (!stream->parent()->SynchronizeAllActivity() || !stream->ok()) {
+    return errors::Internal("XlaDevice::Sync() failed.");
+  }
+  VLOG(1) << "XlaDevice::Sync completed";
+  return Status::OK();
+}
+
 Status XlaDevice::MakeTensorFromProto(const TensorProto& tensor_proto,
                                       const AllocatorAttributes alloc_attrs,
                                       Tensor* tensor) {
diff --git a/tensorflow/compiler/jit/xla_device.h b/tensorflow/compiler/jit/xla_device.h
index d890641..dbf35f3 100644
--- a/tensorflow/compiler/jit/xla_device.h
+++ b/tensorflow/compiler/jit/xla_device.h
@@ -30,7 +30,6 @@
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/service/stream_pool.h"
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/local_device.h"
 #include "tensorflow/core/framework/allocator.h"
@@ -124,7 +123,7 @@
   void Compute(OpKernel* op_kernel, OpKernelContext* context) override;
   void ComputeAsync(AsyncOpKernel* op_kernel, OpKernelContext* context,
                     AsyncOpKernel::DoneCallback done) override;
-  Status Sync() override { return Status::OK(); }
+  Status Sync() override;
 
   Status FillContextMap(const Graph* graph,
                         DeviceContextMap* device_context_map) override
@@ -153,7 +152,7 @@
   Allocator* GetAllocatorLocked(AllocatorAttributes attr)
       EXCLUSIVE_LOCKS_REQUIRED(mu_);
   Status EnsureStreamOkLocked(xla::Backend* backend, const string& name,
-                              xla::StreamPool::Ptr* stream,
+                              std::shared_ptr<se::Stream>* stream,
                               bool* stream_was_changed)
       EXCLUSIVE_LOCKS_REQUIRED(mu_);
   xla::StatusOr<XlaDeviceContext*> GetDeviceContextLocked()
@@ -174,17 +173,17 @@
   // stream are executed on the device. Operations include data
   // copying back and forth between CPU and the device, and
   // computations enqueued by XLA.
-  xla::StreamPool::Ptr stream_ GUARDED_BY(mu_);
+  std::shared_ptr<se::Stream> stream_ GUARDED_BY(mu_);
   // If false, only stream_ is valid and all computation and transfers use
   // stream_. If true, computation is performed by stream_ and transfers are
   // performed by host_to_device/device_to_host_stream.
   const bool use_multiple_streams_;
   // If use_multiple_streams_, host to device transfers are performed using this
   // stream.
-  xla::StreamPool::Ptr host_to_device_stream_ GUARDED_BY(mu_);
+  std::shared_ptr<se::Stream> host_to_device_stream_ GUARDED_BY(mu_);
   // If use_multiple_streams_, device to host transfers are performed using this
   // stream.
-  xla::StreamPool::Ptr device_to_host_stream_ GUARDED_BY(mu_);
+  std::shared_ptr<se::Stream> device_to_host_stream_ GUARDED_BY(mu_);
   // Must we use XLA's transfer manager for correct host<->device transfers? if
   // false, we can use ThenMemcpy() instead.
   const bool transfer_as_literal_;
@@ -198,6 +197,9 @@
   // Holds extra information for GPU and TPU devices, e.g. the device context.
   bool use_gpu_device_info_ GUARDED_BY(mu_) = false;
   std::unique_ptr<GpuDeviceInfo> gpu_device_info_ GUARDED_BY(mu_);
+
+  // Thread pool used for running closures
+  std::unique_ptr<thread::ThreadPool> thread_pool_;
 };
 
 // Builds OpKernel registrations on 'device' for the JIT operators
diff --git a/tensorflow/compiler/jit/xla_device_context.cc b/tensorflow/compiler/jit/xla_device_context.cc
index 8cf1982..0a0c089 100644
--- a/tensorflow/compiler/jit/xla_device_context.cc
+++ b/tensorflow/compiler/jit/xla_device_context.cc
@@ -15,6 +15,9 @@
 
 #include "tensorflow/compiler/jit/xla_device_context.h"
 
+#include <memory>
+
+#include "tensorflow/compiler/jit/xla_device.h"
 #include "tensorflow/compiler/jit/xla_launch_util.h"
 #include "tensorflow/compiler/tf2xla/literal_util.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
@@ -48,17 +51,20 @@
 void XlaDeviceAllocator::GetStats(AllocatorStats* stats) { stats->Clear(); }
 
 XlaTransferManager::XlaTransferManager(
-    se::Stream* compute_stream, se::Stream* host_to_device_stream,
-    se::Stream* device_to_host_stream, xla::LocalClient* client,
+    std::shared_ptr<se::Stream> compute_stream,
+    std::shared_ptr<se::Stream> host_to_device_stream,
+    std::shared_ptr<se::Stream> device_to_host_stream, xla::LocalClient* client,
     bool transfer_as_literal,
-    XlaCompiler::ShapeRepresentationFn shape_representation_fn)
-    : stream_(compute_stream),
-      host_to_device_stream_(host_to_device_stream),
-      device_to_host_stream_(device_to_host_stream),
+    XlaCompiler::ShapeRepresentationFn shape_representation_fn,
+    thread::ThreadPool* thread_pool)
+    : stream_(std::move(compute_stream)),
+      host_to_device_stream_(std::move(host_to_device_stream)),
+      device_to_host_stream_(std::move(device_to_host_stream)),
       client_(client),
       transfer_manager_(client->backend().transfer_manager()),
       transfer_as_literal_(transfer_as_literal),
-      shape_representation_fn_(std::move(shape_representation_fn)) {
+      shape_representation_fn_(std::move(shape_representation_fn)),
+      thread_pool_(thread_pool) {
   CHECK(host_to_device_stream_ != nullptr);
   CHECK(device_to_host_stream_ != nullptr);
   CHECK(stream_ != nullptr);
@@ -88,47 +94,40 @@
   if (UseMultipleStreams()) {
     // Initially wait for the compute stream so that memory allocations are
     // synchronized.
-    host_to_device_stream_->ThenWaitFor(stream_);
+    host_to_device_stream_->ThenWaitFor(stream_.get());
   }
   TF_RETURN_IF_ERROR(transfer_manager_->TransferLiteralToDeviceAsync(
-      host_to_device_stream_, *literal, shaped_buffer));
+      host_to_device_stream_.get(), *literal, shaped_buffer));
   if (UseMultipleStreams()) {
-    se::Event event(stream_->parent());
-    TF_RET_CHECK(event.Init()) << "Event failed to initialize!";
-    host_to_device_stream_->ThenRecordEvent(&event);
-    xla_tensor->SetDefinedOn(host_to_device_stream_, std::move(event));
+    auto event = std::make_shared<se::Event>(stream_->parent());
+    TF_RET_CHECK(event->Init()) << "Event failed to initialize!";
+    host_to_device_stream_->ThenRecordEvent(event.get());
+    xla_tensor->SetDefinedOn(host_to_device_stream_.get(), std::move(event));
   }
   // Unref the host tensor, and capture the literal shared_ptr too so it goes
   // out of scope when the lambda completes.
   host_to_device_stream_->ThenDoHostCallback([ref, literal]() { ref.Unref(); });
+
   return Status::OK();
 }
 
 void XlaTransferManager::TransferLiteralFromDevice(
     Tensor* host_tensor, const Tensor& device_tensor,
     const StatusCallback& done) const {
+  xla::MutableBorrowingLiteral literal;
+  TF_CHECK_OK(HostTensorToMutableBorrowingLiteral(host_tensor, &literal));
+
   const xla::ShapedBuffer& shaped_buffer =
       XlaTensor::FromTensor(&device_tensor)->shaped_buffer();
 
   TensorReference ref(device_tensor);
   transfer_manager_->TransferLiteralFromDevice(
-      device_to_host_stream_, shaped_buffer,
-      [=, &shaped_buffer](
-          xla::StatusOr<std::unique_ptr<xla::Literal> > literal_or) {
+      device_to_host_stream_.get(), shaped_buffer, literal,
+      [=, &shaped_buffer, &literal](xla::Status status) {
         ref.Unref();
         done([&]() -> Status {
-          TF_ASSIGN_OR_RETURN(auto literal, std::move(literal_or));
-          VLOG(1) << "Transfer from device as literal: " << literal->ToString()
+          VLOG(1) << "Transfer from device as literal: " << literal.ToString()
                   << " " << shaped_buffer.ToString();
-          Tensor tensor;
-          TF_RETURN_IF_ERROR(
-              LiteralToHostTensor(*literal, host_tensor->dtype(), &tensor));
-          // Reshape the tensor back to its declared shape.
-          Status status;
-          if (!host_tensor->CopyFrom(tensor, device_tensor.shape())) {
-            status = errors::Internal(
-                "Tensor::CopyFrom failed when copying from XLA device to CPU");
-          }
           return status;
         }());
       });
@@ -186,8 +185,14 @@
     status = TransferLiteralToDevice(reshaped_cpu_tensor, device_tensor);
     if (status.ok()) {
       xla_tensor->set_host_tensor(*cpu_tensor);
-      host_to_device_stream_->ThenDoHostCallback(
-          [done]() { done(Status::OK()); });
+      host_to_device_stream_->ThenDoHostCallback([this, done]() {
+        // We must not call the done closure directly from DoHostCallback
+        // to avoid a deadlock. If done() is the callback that ends an
+        // Executor's run, the Executor may call XlaDevice::Sync() inside the
+        // callback. This deadlocks, because XlaDevice::Sync() waits for all
+        // stream activity to complete.
+        thread_pool_->Schedule([done]() { done(Status::OK()); });
+      });
       return;
     }
   } else {
@@ -199,7 +204,7 @@
     if (!block_status.ok()) {
       status = xla::InternalError(
           "Failed to complete data transfer on stream %p: %s",
-          host_to_device_stream_, block_status.error_message().c_str());
+          host_to_device_stream_.get(), block_status.error_message().c_str());
     }
   }
   xla_tensor->set_host_tensor(*cpu_tensor);
@@ -232,9 +237,9 @@
   XlaTensor* xla_tensor = XlaTensor::FromTensor(device_tensor);
 
   if (se::Event* event =
-          xla_tensor->GetDefinitionEvent(device_to_host_stream_)) {
+          xla_tensor->GetDefinitionEvent(device_to_host_stream_.get())) {
     device_to_host_stream_->ThenWaitFor(event);
-    xla_tensor->SetDefinedOn(device_to_host_stream_);
+    xla_tensor->SetDefinedOn(device_to_host_stream_.get());
   }
 
   Status status;
@@ -247,7 +252,7 @@
     Status block_status = device_to_host_stream_->BlockHostUntilDone();
     if (!block_status.ok()) {
       status = xla::InternalError(
-          "Failed to complete data transfer on stream %p: %s", stream_,
+          "Failed to complete data transfer on stream %p: %s", stream_.get(),
           block_status.error_message().c_str());
     }
   }
@@ -285,14 +290,14 @@
       if (stream_ != device_to_device_stream) {
         // Initially wait for the compute stream so that memory allocations are
         // synchronized.
-        device_to_device_stream->ThenWaitFor(stream_);
+        device_to_device_stream->ThenWaitFor(stream_.get());
       }
     }
 
     if (se::Event* event =
-            xla_src->GetDefinitionEvent(device_to_device_stream)) {
+            xla_src->GetDefinitionEvent(device_to_device_stream.get())) {
       device_to_device_stream->ThenWaitFor(event);
-      xla_src->SetDefinedOn(device_to_device_stream);
+      xla_src->SetDefinedOn(device_to_device_stream.get());
     }
 
     auto from_iter = xla_src->shaped_buffer().buffers().begin();
@@ -304,28 +309,37 @@
     }
 
     if (UseMultipleStreams()) {
-      se::Event event(stream_->parent());
-      CHECK(event.Init());
-      device_to_device_stream->ThenRecordEvent(&event);
-      xla_dst->SetDefinedOn(device_to_device_stream, std::move(event));
+      auto event = std::make_shared<se::Event>(stream_->parent());
+      TF_RET_CHECK(event->Init()) << "Event failed to initialize";
+      device_to_device_stream->ThenRecordEvent(event.get());
+      xla_dst->SetDefinedOn(device_to_device_stream.get(), std::move(event));
     }
     return Status::OK();
   }();
   if (!status.ok()) {
     return done(status);
   } else {
-    stream_->ThenDoHostCallback([=]() { done(Status::OK()); });
+    stream_->ThenDoHostCallback([this, done]() {
+      // We must not call the done closure directly from DoHostCallback to avoid
+      // a deadlock. If done() is the callback that ends an Executor's run, the
+      // Executor may call XlaDevice::Sync() inside the callback. This
+      // deadlocks, because XlaDevice::Sync() waits for all stream activity to
+      // complete.
+      thread_pool_->Schedule([done]() { done(Status::OK()); });
+    });
   }
 }
 
 XlaDeviceContext::XlaDeviceContext(
-    se::Stream* compute_stream, se::Stream* host_to_device_stream,
-    se::Stream* device_to_host_stream, xla::LocalClient* client,
+    std::shared_ptr<se::Stream> compute_stream,
+    std::shared_ptr<se::Stream> host_to_device_stream,
+    std::shared_ptr<se::Stream> device_to_host_stream, xla::LocalClient* client,
     bool transfer_as_literal,
-    XlaCompiler::ShapeRepresentationFn shape_representation_fn)
-    : manager_(compute_stream, host_to_device_stream, device_to_host_stream,
-               client, transfer_as_literal,
-               std::move(shape_representation_fn)) {}
+    XlaCompiler::ShapeRepresentationFn shape_representation_fn,
+    thread::ThreadPool* thread_pool)
+    : manager_(std::move(compute_stream), std::move(host_to_device_stream),
+               std::move(device_to_host_stream), client, transfer_as_literal,
+               std::move(shape_representation_fn), thread_pool) {}
 
 void XlaDeviceContext::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
                                              Device* device,
diff --git a/tensorflow/compiler/jit/xla_device_context.h b/tensorflow/compiler/jit/xla_device_context.h
index 912f8d7..2e74453 100644
--- a/tensorflow/compiler/jit/xla_device_context.h
+++ b/tensorflow/compiler/jit/xla_device_context.h
@@ -47,10 +47,12 @@
 class XlaTransferManager {
  public:
   explicit XlaTransferManager(
-      se::Stream* compute_stream, se::Stream* host_to_device_stream,
-      se::Stream* device_to_host_stream, xla::LocalClient* client,
-      bool transfer_as_literal,
-      XlaCompiler::ShapeRepresentationFn shape_representation_fn);
+      std::shared_ptr<se::Stream> compute_stream,
+      std::shared_ptr<se::Stream> host_to_device_stream,
+      std::shared_ptr<se::Stream> device_to_host_stream,
+      xla::LocalClient* client, bool transfer_as_literal,
+      XlaCompiler::ShapeRepresentationFn shape_representation_fn,
+      thread::ThreadPool* thread_pool);
 
   void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device,
                              Tensor* device_tensor, StatusCallback done) const;
@@ -61,7 +63,7 @@
   void CopyDeviceTensorToDevice(const Tensor& src_tensor, Tensor* dst_tensor,
                                 const StatusCallback& done);
 
-  se::Stream* stream() const { return stream_; }
+  se::Stream* stream() const { return stream_.get(); }
 
  private:
   Status TransferLiteralToDevice(const Tensor& host_tensor,
@@ -73,13 +75,13 @@
 
   // The main compute stream of the device, used to synchronize the transfer
   // streams if they are set.
-  se::Stream* stream_;
+  std::shared_ptr<se::Stream> stream_;
   // The stream to use for transferring data from host to device. Can be
+  // identical to stream_, but must not be nullptr.
-  se::Stream* host_to_device_stream_;
+  std::shared_ptr<se::Stream> host_to_device_stream_;
   // The stream to use for transferring data from device to host. Can be
+  // identical to stream_, but must not be nullptr.
-  se::Stream* device_to_host_stream_;
+  std::shared_ptr<se::Stream> device_to_host_stream_;
   // For the underlying memory allocator and XLA's TransferManager.
   xla::LocalClient* client_;
   // Transfer manager, for marshalling data to and from the device.
@@ -87,6 +89,9 @@
   // True if we must use XLA's TransferManager for correct device transfers.
   const bool transfer_as_literal_;
   XlaCompiler::ShapeRepresentationFn shape_representation_fn_;
+
+  // Thread pool used for running closures
+  thread::ThreadPool* thread_pool_;
 };
 
 // DeviceContext for operators assigned to XlaDevice devices. The
@@ -95,10 +100,12 @@
 class XlaDeviceContext : public DeviceContext {
  public:
   explicit XlaDeviceContext(
-      se::Stream* compute_stream, se::Stream* host_to_device_stream,
-      se::Stream* device_to_host_stream, xla::LocalClient* client,
-      bool transfer_as_literal,
-      XlaCompiler::ShapeRepresentationFn shape_representation_fn);
+      std::shared_ptr<se::Stream> compute_stream,
+      std::shared_ptr<se::Stream> host_to_device_stream,
+      std::shared_ptr<se::Stream> device_to_host_stream,
+      xla::LocalClient* client, bool transfer_as_literal,
+      XlaCompiler::ShapeRepresentationFn shape_representation_fn,
+      thread::ThreadPool* thread_pool);
 
   void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device,
                              Tensor* device_tensor,
diff --git a/tensorflow/compiler/jit/xla_device_ops.h b/tensorflow/compiler/jit/xla_device_ops.h
index 6adda32..da3e329 100644
--- a/tensorflow/compiler/jit/xla_device_ops.h
+++ b/tensorflow/compiler/jit/xla_device_ops.h
@@ -23,7 +23,11 @@
 #include "tensorflow/core/kernels/cast_op.h"
 #include "tensorflow/core/kernels/constant_op.h"
 #include "tensorflow/core/kernels/control_flow_ops.h"
+#include "tensorflow/core/kernels/data/generator_dataset_op.h"
+#include "tensorflow/core/kernels/data/iterator_ops.h"
+#include "tensorflow/core/kernels/data/prefetch_dataset_op.h"
 #include "tensorflow/core/kernels/fifo_queue.h"
+#include "tensorflow/core/kernels/function_ops.h"
 #include "tensorflow/core/kernels/identity_n_op.h"
 #include "tensorflow/core/kernels/identity_op.h"
 #include "tensorflow/core/kernels/no_op.h"
@@ -166,7 +170,69 @@
       QueueIsClosedOp);                                                        \
                                                                                \
   REGISTER_KERNEL_BUILDER(                                                     \
-      Name("FIFOQueueV2").Device(DEVICE).HostMemory("handle"), FIFOQueueOp);
+      Name("FIFOQueueV2").Device(DEVICE).HostMemory("handle"), FIFOQueueOp);   \
+                                                                               \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name(kArgOp).Device(DEVICE).HostMemory("output").TypeConstraint("T",     \
+                                                                      TYPES),  \
+      ArgOp);                                                                  \
+  REGISTER_KERNEL_BUILDER(Name(kArgOp)                                         \
+                              .Device(DEVICE)                                  \
+                              .HostMemory("output")                            \
+                              .TypeConstraint<ResourceHandle>("T"),            \
+                          ArgOp);                                              \
+                                                                               \
+  REGISTER_KERNEL_BUILDER(Name(kRetOp)                                         \
+                              .Device(DEVICE)                                  \
+                              .TypeConstraint("T", TYPES)                      \
+                              .HostMemory("input"),                            \
+                          RetvalOp);                                           \
+  REGISTER_KERNEL_BUILDER(Name(kRetOp)                                         \
+                              .Device(DEVICE)                                  \
+                              .TypeConstraint<ResourceHandle>("T")             \
+                              .HostMemory("input"),                            \
+                          RetvalOp);                                           \
+                                                                               \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("RemoteCall").Device(DEVICE).HostMemory("target"), RemoteCallOp);   \
+                                                                               \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("GeneratorDataset").Device(DEVICE).HostMemory("handle"),            \
+      GeneratorDatasetOp);                                                     \
+  REGISTER_KERNEL_BUILDER(Name("PrefetchDataset")                              \
+                              .Device(DEVICE)                                  \
+                              .HostMemory("buffer_size")                       \
+                              .HostMemory("input_dataset")                     \
+                              .HostMemory("handle"),                           \
+                          PrefetchDatasetOp);                                  \
+                                                                               \
+  REGISTER_KERNEL_BUILDER(Name("IteratorV2").Device(DEVICE),                   \
+                          IteratorHandleOp);                                   \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("MakeIterator").Device(DEVICE).HostMemory("dataset"),               \
+      MakeIteratorOp);                                                         \
+  REGISTER_KERNEL_BUILDER(Name("AnonymousIterator").Device(DEVICE),            \
+                          AnonymousIteratorHandleOp);                          \
+  REGISTER_KERNEL_BUILDER(Name("IteratorGetNext").Device(DEVICE),              \
+                          IteratorGetNextOp);                                  \
+  REGISTER_KERNEL_BUILDER(Name("IteratorToStringHandle")                       \
+                              .Device(DEVICE)                                  \
+                              .HostMemory("string_handle"),                    \
+                          IteratorToStringHandleOp);                           \
+  REGISTER_KERNEL_BUILDER(Name("IteratorFromStringHandleV2")                   \
+                              .Device(DEVICE)                                  \
+                              .HostMemory("string_handle"),                    \
+                          IteratorFromStringHandleOp);                         \
+  REGISTER_KERNEL_BUILDER(Name(FunctionLibraryDefinition::kArgOp)              \
+                              .Device(DEVICE)                                  \
+                              .HostMemory("output")                            \
+                              .TypeConstraint<string>("T"),                    \
+                          ArgOp);                                              \
+  REGISTER_KERNEL_BUILDER(Name(FunctionLibraryDefinition::kRetOp)              \
+                              .Device(DEVICE)                                  \
+                              .TypeConstraint<string>("T")                     \
+                              .HostMemory("input"),                            \
+                          RetvalOp);
 
 // TODO(phawkins): currently we do not register the QueueEnqueueMany,
 // QueueDequeueMany, or QueueDequeueUpTo kernels because they attempt to read
diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc
index 6134b8c..4efbb2d 100644
--- a/tensorflow/compiler/jit/xla_launch_util.cc
+++ b/tensorflow/compiler/jit/xla_launch_util.cc
@@ -15,6 +15,8 @@
 
 #include "tensorflow/compiler/jit/xla_launch_util.h"
 
+#include <memory>
+
 #include "tensorflow/compiler/jit/defs.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
@@ -182,7 +184,7 @@
   }
 }
 
-void XlaComputationLaunchContext::PopulateOutputs(
+Status XlaComputationLaunchContext::PopulateOutputs(
     OpKernelContext* ctx, const XlaCompiler::CompilationResult* kernel,
     ScopedShapedBuffer output) {
   se::Stream* stream =
@@ -211,6 +213,15 @@
     output = ScopedShapedBuffer(std::move(buffer), output.memory_allocator());
   }
 
+  std::shared_ptr<se::Event> definition_event;
+  if (use_multiple_streams_) {
+    definition_event = std::make_shared<se::Event>(stream->parent());
+    if (!definition_event->Init()) {
+      return errors::Internal("Failed to initialize tensor definition event.");
+    }
+    stream->ThenRecordEvent(definition_event.get());
+  }
+
   // Copy XLA results to the OpOutputList.
   int output_num = 0;
   for (int i = 0; i < ctx->num_outputs(); ++i) {
@@ -228,12 +239,13 @@
         // reallocate the device buffer later.
         VLOG(1) << "Constant output tensor on device";
 
-        OP_REQUIRES_OK(
-            ctx, ctx->allocate_output(i, const_tensor.shape(), &output_tensor));
+        TF_RETURN_IF_ERROR(
+            ctx->allocate_output(i, const_tensor.shape(), &output_tensor));
 
         Device* device = dynamic_cast<Device*>(ctx->device());
-        OP_REQUIRES(ctx, device != nullptr,
-                    errors::Internal("DeviceBase was not a Device."));
+        if (device == nullptr) {
+          return errors::Internal("DeviceBase was not a Device.");
+        }
         ctx->op_device_context()->CopyCPUTensorToDevice(
             &const_tensor, device, output_tensor,
             [&](Status status) { TF_CHECK_OK(status); });
@@ -263,16 +275,13 @@
       se::DeviceMemoryBase buffer = output.buffer({output_num});
       if (allocate_xla_tensors_) {
         Tensor* output_tensor;
-        OP_REQUIRES_OK(ctx, ctx->allocate_output(i, shape, &output_tensor));
+        TF_RETURN_IF_ERROR(ctx->allocate_output(i, shape, &output_tensor));
         XlaTensor* xla_tensor = XlaTensor::FromTensor(output_tensor);
         if (xla_tensor) {
           xla_tensor->set_shaped_buffer(ScopedShapedBuffer(
               ExtractSubShapedBuffer(&output, output_num, xla_allocator_)));
           if (use_multiple_streams_) {
-            se::Event event(stream->parent());
-            CHECK(event.Init());
-            stream->ThenRecordEvent(&event);
-            xla_tensor->SetDefinedOn(stream, std::move(event));
+            xla_tensor->SetDefinedOn(stream, definition_event);
           }
         } else {
           // xla_tensor wasn't valid, which must mean this is a zero-element
@@ -298,41 +307,39 @@
   for (int i = 0; i < kernel->resource_updates.size(); ++i) {
     Allocator* allocator = ctx->device()->GetAllocator({});
     const XlaCompiler::ResourceUpdate& write = kernel->resource_updates[i];
-    OP_REQUIRES(ctx,
-                write.input_index >= 0 && write.input_index < ctx->num_inputs(),
-                errors::Internal("Invalid input index for variable write."));
+    if (write.input_index < 0 || write.input_index >= ctx->num_inputs()) {
+      return errors::Internal("Invalid input index for variable write.");
+    }
 
     se::DeviceMemoryBase buffer = output.buffer({output_num});
 
     Var* variable = nullptr;
     // TODO(b/35625933): tensorflow::Var should contain a PersistentTensor,
     // not a Tensor.
-    OP_REQUIRES_OK(ctx, LookupOrCreateResource<Var>(
-                            ctx, HandleFromInput(ctx, write.input_index),
-                            &variable, [this, ctx, &write](Var** ptr) {
-                              *ptr = new Var(write.type);
-                              return Status::OK();
-                            }));
+    TF_RETURN_IF_ERROR(LookupOrCreateResource<Var>(
+        ctx, HandleFromInput(ctx, write.input_index), &variable,
+        [&write](Var** ptr) {
+          *ptr = new Var(write.type);
+          return Status::OK();
+        }));
 
     core::ScopedUnref s(variable);
 
     mutex_lock ml(*variable->mu());
-    OP_REQUIRES(ctx, variable->tensor()->dtype() == write.type,
-                errors::Internal("Mismatched type in variable write"));
+    if (variable->tensor()->dtype() != write.type) {
+      return errors::Internal("Mismatched type in variable write");
+    }
 
     if (allocate_xla_tensors_) {
       Tensor output_tensor;
-      OP_REQUIRES_OK(
-          ctx, ctx->allocate_temp(write.type, write.shape, &output_tensor));
+      TF_RETURN_IF_ERROR(
+          ctx->allocate_temp(write.type, write.shape, &output_tensor));
       XlaTensor* xla_tensor = XlaTensor::FromTensor(&output_tensor);
       CHECK(xla_tensor);
       xla_tensor->set_shaped_buffer(
           ExtractSubShapedBuffer(&output, output_num, xla_allocator_));
       if (use_multiple_streams_) {
-        se::Event event(stream->parent());
-        CHECK(event.Init());
-        stream->ThenRecordEvent(&event);
-        xla_tensor->SetDefinedOn(stream, std::move(event));
+        xla_tensor->SetDefinedOn(stream, definition_event);
       }
       *variable->tensor() = output_tensor;
     } else {
@@ -343,6 +350,7 @@
     }
     ++output_num;
   }
+  return Status::OK();
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/xla_launch_util.h b/tensorflow/compiler/jit/xla_launch_util.h
index 1ea3fa4..4232f51 100644
--- a/tensorflow/compiler/jit/xla_launch_util.h
+++ b/tensorflow/compiler/jit/xla_launch_util.h
@@ -93,9 +93,9 @@
                       const std::map<int, OptionalTensor>& variables);
 
   // Given the XLA output in `output`, populate all outputs of `ctx`.
-  void PopulateOutputs(OpKernelContext* ctx,
-                       const XlaCompiler::CompilationResult* kernel,
-                       xla::ScopedShapedBuffer output);
+  Status PopulateOutputs(OpKernelContext* ctx,
+                         const XlaCompiler::CompilationResult* kernel,
+                         xla::ScopedShapedBuffer output);
 
   // Return the argument list. Only valid after PopulateInputs() has been
   // called.
diff --git a/tensorflow/compiler/jit/xla_tensor.cc b/tensorflow/compiler/jit/xla_tensor.cc
index d777dfa..92ba7de 100644
--- a/tensorflow/compiler/jit/xla_tensor.cc
+++ b/tensorflow/compiler/jit/xla_tensor.cc
@@ -75,7 +75,7 @@
 
 se::Event* XlaTensor::GetDefinitionEvent(se::Stream* stream) {
   mutex_lock lock(mu_);
-  if (!definition_event_.has_value()) {
+  if (!definition_event_) {
     return nullptr;
   }
 
@@ -87,10 +87,11 @@
     return nullptr;
   }
 
-  return &*definition_event_;
+  return definition_event_.get();
 }
 
-void XlaTensor::SetDefinedOn(se::Stream* stream, se::Event event) {
+void XlaTensor::SetDefinedOn(se::Stream* stream,
+                             std::shared_ptr<se::Event> event) {
   mutex_lock lock(mu_);
   definition_event_ = std::move(event);
   streams_defined_on_ = {stream};
diff --git a/tensorflow/compiler/jit/xla_tensor.h b/tensorflow/compiler/jit/xla_tensor.h
index f7e401c..8d36d0f 100644
--- a/tensorflow/compiler/jit/xla_tensor.h
+++ b/tensorflow/compiler/jit/xla_tensor.h
@@ -16,6 +16,8 @@
 #ifndef TENSORFLOW_COMPILER_JIT_XLA_TENSOR_H_
 #define TENSORFLOW_COMPILER_JIT_XLA_TENSOR_H_
 
+#include <memory>
+
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/service/shaped_buffer.h"
 #include "tensorflow/core/framework/allocator.h"
@@ -94,7 +96,7 @@
 
   // Assert that the tensor's content is defined on 'stream' by the time 'event'
   // triggers.
-  void SetDefinedOn(se::Stream* stream, se::Event event);
+  void SetDefinedOn(se::Stream* stream, std::shared_ptr<se::Event> event);
 
   // Assert that the tensor's content is defined on 'stream'. This version does
   // not provide an event, and must be called *after* SetDefinedOn(Stream,
@@ -116,7 +118,7 @@
   // An optional event that is triggered when the tensor's content has been
   // defined. If this event is nullptr, it is assumed that the tensor's content
   // is always defined.
-  gtl::optional<se::Event> definition_event_;
+  std::shared_ptr<se::Event> definition_event_;
   // A list of all streams for which the tensor's content is defined for any
   // newly enqueued command.
   gtl::InlinedVector<se::Stream*, 2> streams_defined_on_ GUARDED_BY(mu_);
diff --git a/tensorflow/compiler/tests/adam_test.py b/tensorflow/compiler/tests/adam_test.py
index 03554d6..0d2e4d0 100644
--- a/tensorflow/compiler/tests/adam_test.py
+++ b/tensorflow/compiler/tests/adam_test.py
@@ -52,6 +52,9 @@
 
   def testBasic(self):
     for dtype in self.float_types:
+      # TODO: test fails for float16 due to excessive precision requirements.
+      if dtype == np.float16:
+        continue
       with self.test_session(), self.test_scope():
         variable_scope.get_variable_scope().set_use_resource(True)
 
@@ -91,6 +94,9 @@
 
   def testTensorLearningRate(self):
     for dtype in self.float_types:
+      # TODO: test fails for float16 due to excessive precision requirements.
+      if dtype == np.float16:
+        continue
       with self.test_session(), self.test_scope():
         variable_scope.get_variable_scope().set_use_resource(True)
 
@@ -130,6 +136,9 @@
 
   def testSharing(self):
     for dtype in self.float_types:
+      # TODO: test fails for float16 due to excessive precision requirements.
+      if dtype == np.float16:
+        continue
       with self.test_session(), self.test_scope():
         variable_scope.get_variable_scope().set_use_resource(True)
 
diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py
index 0aafda7..5b7001b 100644
--- a/tensorflow/compiler/tests/binary_ops_test.py
+++ b/tensorflow/compiler/tests/binary_ops_test.py
@@ -1167,6 +1167,16 @@
     for dtype in self.numeric_types:
       self._testBinary(
           array_ops.tile,
+          np.array([[6], [3], [4]], dtype=dtype),
+          np.array([2, 0], dtype=np.int32),
+          expected=np.empty([6, 0], dtype=dtype))
+      self._testBinary(
+          array_ops.tile,
+          np.array([[6, 3, 4]], dtype=dtype),
+          np.array([2, 0], dtype=np.int32),
+          expected=np.empty([2, 0], dtype=dtype))
+      self._testBinary(
+          array_ops.tile,
           np.array([[6]], dtype=dtype),
           np.array([1, 2], dtype=np.int32),
           expected=np.array([[6, 6]], dtype=dtype))
diff --git a/tensorflow/compiler/tests/eager_test.py b/tensorflow/compiler/tests/eager_test.py
index 422f36d..ff097f8 100644
--- a/tensorflow/compiler/tests/eager_test.py
+++ b/tensorflow/compiler/tests/eager_test.py
@@ -32,6 +32,7 @@
 from tensorflow.python.layers import pooling
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import gen_random_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
@@ -122,6 +123,14 @@
     with self.test_scope():
       self.assertAllEqual(2, array_ops.identity(2))
 
+  def testRandomOps(self):
+    with self.test_scope():
+      tensor = gen_random_ops.random_uniform((2, 2), dtypes.float32)
+      row0 = tensor[0].numpy()
+      row1 = tensor[1].numpy()
+      # It should be very unlikely for the rng to generate two equal rows.
+      self.assertFalse((row0 == row1).all())
+
   def testIdentityOnVariable(self):
     with self.test_scope():
       v = resource_variable_ops.ResourceVariable(True)
diff --git a/tensorflow/compiler/tests/random_ops_test.py b/tensorflow/compiler/tests/random_ops_test.py
index cc0e9b2..8c4e16e 100644
--- a/tensorflow/compiler/tests/random_ops_test.py
+++ b/tensorflow/compiler/tests/random_ops_test.py
@@ -101,7 +101,7 @@
     for dtype in [dtypes.float32]:
       with self.test_session() as sess:
         with self.test_scope():
-          x = random_ops.truncated_normal(shape=[count], dtype=dtype, seed=42)
+          x = random_ops.truncated_normal(shape=[count], dtype=dtype)
         y = sess.run(x)
 
         def normal_cdf(x):
@@ -130,24 +130,18 @@
         # Department of Scientific Computing website. Florida State University.
         expected_mean = mu + (normal_pdf(alpha) - normal_pdf(beta)) / z * sigma
         actual_mean = np.mean(y)
-        atol = 2e-4
-        if self.device in ["XLA_GPU", "XLA_CPU"]:
-          atol = 2.2e-4
-        self.assertAllClose(actual_mean, expected_mean, atol=atol)
+        self.assertAllClose(actual_mean, expected_mean, atol=2e-3)
 
         expected_median = mu + probit(
             (normal_cdf(alpha) + normal_cdf(beta)) / 2.) * sigma
         actual_median = np.median(y)
-        self.assertAllClose(actual_median, expected_median, atol=1e-3)
+        self.assertAllClose(actual_median, expected_median, atol=1e-2)
 
         expected_variance = sigma**2 * (1 + (
             (alpha * normal_pdf(alpha) - beta * normal_pdf(beta)) / z) - (
                 (normal_pdf(alpha) - normal_pdf(beta)) / z)**2)
         actual_variance = np.var(y)
-        rtol = 1e-3
-        if self.device in ["XLA_GPU", "XLA_CPU"]:
-          rtol = 4e-4
-        self.assertAllClose(actual_variance, expected_variance, rtol=rtol)
+        self.assertAllClose(actual_variance, expected_variance, rtol=2*1e-3)
 
   def testShuffle1d(self):
     # TODO(b/26783907): this test requires the CPU backend to implement sort.
diff --git a/tensorflow/compiler/tests/reverse_ops_test.py b/tensorflow/compiler/tests/reverse_ops_test.py
index d01c676..32ab5d0 100644
--- a/tensorflow/compiler/tests/reverse_ops_test.py
+++ b/tensorflow/compiler/tests/reverse_ops_test.py
@@ -32,14 +32,20 @@
 
   def testReverseOneDim(self):
     shape = (7, 5, 9, 11)
-    for revdim in range(len(shape)):
+    for revdim in range(-len(shape), len(shape)):
       self._AssertReverseEqual([revdim], shape)
 
   def testReverseMoreThanOneDim(self):
     shape = (7, 5, 9, 11)
+    # The offset is used to test various (but not all) combinations of negative
+    # and positive axis indices that are guaranteed to not collide at the same
+    # index.
     for revdims in itertools.chain.from_iterable(
-        itertools.combinations(range(len(shape)), k)
-        for k in range(2, len(shape)+1)):
+        itertools.combinations(range(-offset,
+                                     len(shape) - offset), k)
+        for k in range(2,
+                       len(shape) + 1)
+        for offset in range(0, len(shape))):
       self._AssertReverseEqual(revdims, shape)
 
   def _AssertReverseEqual(self, revdims, shape):
@@ -50,15 +56,16 @@
         p = array_ops.placeholder(dtypes.int32, shape=shape)
         axis = constant_op.constant(
             np.array(revdims, dtype=np.int32),
-            shape=(len(revdims),), dtype=dtypes.int32)
+            shape=(len(revdims),),
+            dtype=dtypes.int32)
         rval = array_ops.reverse(p, axis).eval({p: pval})
 
         slices = [
-            slice(-1, None, -1) if d in revdims else slice(None)
-            for d in range(len(shape))]
-      self.assertEqual(
-          pval[slices].flatten().tolist(),
-          rval.flatten().tolist())
+            slice(-1, None, -1)
+            if d in revdims or d - len(shape) in revdims else slice(None)
+            for d in range(len(shape))
+        ]
+      self.assertEqual(pval[slices].flatten().tolist(), rval.flatten().tolist())
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD
index 61759fd..fda32c8 100644
--- a/tensorflow/compiler/tf2xla/BUILD
+++ b/tensorflow/compiler/tf2xla/BUILD
@@ -95,6 +95,10 @@
     name = "cpu_function_runtime",
     srcs = ["cpu_function_runtime.cc"],
     hdrs = ["cpu_function_runtime.h"],
+    visibility = [
+        "//tensorflow/compiler/aot:__pkg__",
+        "//tensorflow/compiler/xla/service/cpu:__pkg__",
+    ],
     deps = [
         # Keep dependencies to a minimum here; this library is used in every AOT
         # binary produced by tfcompile.
@@ -144,6 +148,7 @@
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/service:cpu_plugin",
+        "//tensorflow/compiler/xla/service/cpu:buffer_info_util",
         "//tensorflow/compiler/xla/service/cpu:cpu_executable",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
diff --git a/tensorflow/compiler/tf2xla/cpu_function_runtime.cc b/tensorflow/compiler/tf2xla/cpu_function_runtime.cc
index 2ffad2a..fcc4095 100644
--- a/tensorflow/compiler/tf2xla/cpu_function_runtime.cc
+++ b/tensorflow/compiler/tf2xla/cpu_function_runtime.cc
@@ -55,19 +55,26 @@
 }  // namespace
 
 namespace cpu_function_runtime {
-size_t AlignedBufferBytes(const intptr_t* sizes, size_t n) {
+size_t AlignedBufferBytes(const BufferInfo* buffer_infos, size_t n,
+                          bool allocate_entry_params) {
   size_t total = 0;
   for (size_t i = 0; i < n; ++i) {
-    if (sizes[i] > 0) {
-      total += align_to(sizes[i], kAlign);
+    bool should_allocate =
+        buffer_infos[i].is_temp_buffer() ||
+        (buffer_infos[i].is_entry_parameter() && allocate_entry_params);
+
+    if (should_allocate) {
+      total += align_to(buffer_infos[i].size(), kAlign);
     }
   }
   return total;
 }
 
-void* MallocContiguousBuffers(const intptr_t* sizes, size_t n, void** bufs,
+void* MallocContiguousBuffers(const BufferInfo* buffer_infos, size_t n,
+                              bool allocate_entry_params, void** bufs,
                               bool annotate_initialized) {
-  const size_t total = AlignedBufferBytes(sizes, n);
+  const size_t total =
+      AlignedBufferBytes(buffer_infos, n, allocate_entry_params);
   void* contiguous = nullptr;
   if (total > 0) {
     contiguous = aligned_malloc(total, kAlign);
@@ -79,13 +86,14 @@
   }
   uintptr_t pos = reinterpret_cast<uintptr_t>(contiguous);
   for (size_t i = 0; i < n; ++i) {
-    if (sizes[i] < 0) {
-      // bufs[i] is either a constant, an entry parameter or a thread local
-      // allocation.
-      bufs[i] = nullptr;
-    } else {
+    bool should_allocate =
+        buffer_infos[i].is_temp_buffer() ||
+        (buffer_infos[i].is_entry_parameter() && allocate_entry_params);
+    if (should_allocate) {
       bufs[i] = reinterpret_cast<void*>(pos);
-      pos += align_to(sizes[i], kAlign);
+      pos += align_to(buffer_infos[i].size(), kAlign);
+    } else {
+      bufs[i] = nullptr;
     }
   }
   return contiguous;
diff --git a/tensorflow/compiler/tf2xla/cpu_function_runtime.h b/tensorflow/compiler/tf2xla/cpu_function_runtime.h
index c7b4559..dfc1e8b 100644
--- a/tensorflow/compiler/tf2xla/cpu_function_runtime.h
+++ b/tensorflow/compiler/tf2xla/cpu_function_runtime.h
@@ -18,29 +18,142 @@
 
 #include "tensorflow/core/platform/types.h"
 
+#include <cassert>
+
 namespace tensorflow {
 namespace cpu_function_runtime {
+// Stores information about one buffer used by an XLA:CPU compiled function.
+// These buffers are used for holding inputs to the computation, outputs from
+// the computation and as temporary scratch space.
+class BufferInfo {
+ public:
+  // Creates a BufferInfo from a serialized encoding generated by `Encode`.
+  explicit BufferInfo(std::pair<uint64, uint64> encoding)
+      : entry_param_number_(encoding.second) {
+    Kind kind;
+    uint64 size;
+    Unpack(encoding.first, &kind, &size);
+    kind_ = kind;
+    size_ = size;
+  }
+
+  // Returns true if this buffer stores a constant.  These never need to be
+  // allocated by the runtime.
+  bool is_constant() const { return kind() == Kind::kConstant; }
+
+  // Returns true if this buffer stores an entry parameter.  These may or may
+  // not need to be allocated by the runtime, depending on
+  // XlaCompiledCpuFunction::AllocMode.
+  bool is_entry_parameter() const { return kind() == Kind::kEntryParameter; }
+
+  // Returns the entry parameter number of this buffer.
+  uint64 entry_parameter_number() const {
+    assert(is_entry_parameter());
+    return entry_param_number_;
+  }
+
+  // Returns true if this buffer is temporary scratch space required by the XLA
+  // computations.  These are always allocated by the runtime.
+  bool is_temp_buffer() const { return kind() == Kind::kTempBuffer; }
+
+  // Returns true if this buffer is allocated on the C stack or into registers.
+  // These buffers are never allocated by the runtime.
+  bool is_on_stack_buffer() const { return kind() == Kind::kOnStackBuffer; }
+
+  // Returns the size for this buffer.
+  uint64 size() const { return size_; }
+
+  // Encodes this BufferInfo into two 64 bit integers that can be used to
+  // reconstruct the BufferInfo later using the constructor.  We need this
+  // because we use BufferInfo in places where using protocol buffers would
+  // negatively impact binary size.
+  std::pair<uint64, uint64> Encode() const {
+    static_assert(sizeof(*this) == 16, "");
+    uint64 upper = Pack(kind(), size_);
+    uint64 lower = entry_param_number_;
+    return {upper, lower};
+  }
+
+  bool operator==(const BufferInfo& buffer_info) const {
+    if (kind() != buffer_info.kind() || size() != buffer_info.size()) {
+      return false;
+    }
+    return !is_entry_parameter() ||
+           entry_parameter_number() == buffer_info.entry_parameter_number();
+  }
+
+  // Factory methods:
+
+  static BufferInfo MakeTempBuffer(uint64 size) {
+    return BufferInfo(Kind::kTempBuffer, /*size=*/size,
+                      /*entry_param_number=*/-1);
+  }
+  static BufferInfo MakeConstant(uint64 size) {
+    return BufferInfo(Kind::kConstant, /*size=*/size,
+                      /*entry_param_number=*/-1);
+  }
+  static BufferInfo MakeEntryParameter(uint64 size, uint64 param_number) {
+    return BufferInfo(Kind::kEntryParameter, /*size=*/size,
+                      /*entry_param_number=*/param_number);
+  }
+  static BufferInfo MakeOnStackBuffer(uint64 size) {
+    return BufferInfo(Kind::kOnStackBuffer, /*size=*/size,
+                      /*entry_param_number=*/-1);
+  }
+
+ private:
+  BufferInfo() = default;
+
+  enum class Kind : unsigned {
+    kConstant,
+    kTempBuffer,
+    kEntryParameter,
+    kOnStackBuffer
+  };
+
+  Kind kind() const { return static_cast<Kind>(kind_); }
+
+  explicit BufferInfo(Kind kind, uint64 size, uint64 entry_param_number)
+      : kind_(kind), size_(size), entry_param_number_(entry_param_number) {}
+
+  static uint64 Pack(Kind kind, uint64 size) {
+    return (static_cast<uint64>(size) << 2) | static_cast<uint64>(kind);
+  }
+
+  static void Unpack(uint64 packed, Kind* kind, uint64* size) {
+    *size = packed >> 2;
+    *kind = static_cast<Kind>((packed << 62) >> 62);
+  }
+
+  Kind kind_ : 2;
+  uint64 size_ : 62;
+  int64 entry_param_number_;
+};
 
 // Align to 64-bytes, to mimic tensorflow::Allocator::kAllocatorAlignment.
 constexpr size_t kAlign = 64;
 
-// AlignedBufferBytes returns the sum of each size in `sizes`, skipping -1
-// values.  There are `n` entries in `sizes`.  Each buffer is aligned to
-// kAlign byte boundaries.
-size_t AlignedBufferBytes(const intptr_t* sizes, size_t n);
+// AlignedBufferBytes returns the sum of the size of each buffer in
+// `buffer_infos`, skipping constants, on-stack buffers and, if
+// allocate_entry_params is false, entry parameters.  There are `n` entries in
+// `buffer_infos`.  Each buffer is aligned to kAlign byte boundaries.
+size_t AlignedBufferBytes(const BufferInfo* buffer_infos, size_t n,
+                          bool allocate_entry_params);
 
 // MallocContiguousBuffers allocates buffers for use by the entry point
-// generated by tfcompile.  `sizes` is an array of byte sizes for each buffer,
-// where -1 causes the buffer pointer to be nullptr.  There are `n` entries in
-// `sizes`.  If `annotate_initialized` is set, the allocated memory will be
-// annotated as having been initialized - this is useful when allocating
-// temporary buffers.
+// generated by tfcompile.  There are `n` entries in `buffer_infos`.  If
+// `annotate_initialized` is set, the allocated memory will be annotated as
+// having been initialized - this is useful when allocating temporary buffers.
+// If allocate_entry_params is true then allocates temp buffers and entry
+// parameters, otherwise allocates only temp buffers.  Slots in `bufs`
+// corresponding to unallocated buffers are set to nullptr.
 //
 // A single contiguous block of memory is allocated, and portions of it are
 // parceled out into `bufs`, which must have space for `n` entries.  Returns
 // the head of the allocated contiguous block, which should be passed to
 // FreeContiguous when the buffers are no longer in use.
-void* MallocContiguousBuffers(const intptr_t* sizes, size_t n, void** bufs,
+void* MallocContiguousBuffers(const BufferInfo* buffer_infos, size_t n,
+                              bool allocate_entry_params, void** bufs,
                               bool annotate_initialized);
 
 // FreeContiguous frees the contiguous block of memory allocated by
diff --git a/tensorflow/compiler/tf2xla/cpu_function_runtime_test.cc b/tensorflow/compiler/tf2xla/cpu_function_runtime_test.cc
index f4f27a1..8ca628c 100644
--- a/tensorflow/compiler/tf2xla/cpu_function_runtime_test.cc
+++ b/tensorflow/compiler/tf2xla/cpu_function_runtime_test.cc
@@ -21,6 +21,8 @@
 namespace tensorflow {
 namespace {
 
+using cpu_function_runtime::BufferInfo;
+
 TEST(XlaCompiledCpuFunctionTest, AlignmentValue) {
   // We've chosen 64 byte alignment for the tfcompile runtime to mimic the
   // regular tensorflow allocator, which was chosen to play nicely with Eigen.
@@ -30,20 +32,51 @@
   EXPECT_EQ(cpu_function_runtime::kAlign, Allocator::kAllocatorAlignment);
 }
 
+std::vector<BufferInfo> SizesToBufferInfos(const intptr_t* sizes, size_t n) {
+  std::vector<BufferInfo> buffer_infos;
+  std::transform(sizes, sizes + n, std::back_inserter(buffer_infos),
+                 [&](intptr_t size) {
+                   if (size == -1) {
+                     // Use a dummy on-stack buffer allocation to indicate that
+                     // the current slot does not need an allocation.
+                     int64 on_stack_buffer_size = 4;
+                     return BufferInfo::MakeOnStackBuffer(on_stack_buffer_size);
+                   }
+                   return BufferInfo::MakeTempBuffer(size);
+                 });
+  return buffer_infos;
+}
+
+// Simple wrappers to make writing tests more ergonomic.
+
+size_t AlignedBufferBytesFromSizes(const intptr_t* sizes, size_t n) {
+  std::vector<BufferInfo> buffer_infos = SizesToBufferInfos(sizes, n);
+  return AlignedBufferBytes(buffer_infos.data(), n,
+                            /*allocate_entry_params=*/false);
+}
+
+void* MallocContiguousBuffersFromSizes(const intptr_t* sizes, size_t n,
+                                       void** bufs, bool annotate_initialized) {
+  std::vector<BufferInfo> buffer_infos = SizesToBufferInfos(sizes, n);
+  return MallocContiguousBuffers(buffer_infos.data(), n,
+                                 /*allocate_entry_params=*/false, bufs,
+                                 annotate_initialized);
+}
+
 TEST(XlaCompiledCpuFunctionTest, AlignedBufferBytes) {
-  EXPECT_EQ(cpu_function_runtime::AlignedBufferBytes(nullptr, 0), 0);
+  EXPECT_EQ(AlignedBufferBytesFromSizes(nullptr, 0), 0);
 
   static constexpr intptr_t sizesA[1] = {-1};
-  EXPECT_EQ(cpu_function_runtime::AlignedBufferBytes(sizesA, 1), 0);
+  EXPECT_EQ(AlignedBufferBytesFromSizes(sizesA, 1), 0);
 
   static constexpr intptr_t sizesB[1] = {3};
-  EXPECT_EQ(cpu_function_runtime::AlignedBufferBytes(sizesB, 1), 64);
+  EXPECT_EQ(AlignedBufferBytesFromSizes(sizesB, 1), 64);
 
   static constexpr intptr_t sizesC[1] = {32};
-  EXPECT_EQ(cpu_function_runtime::AlignedBufferBytes(sizesC, 1), 64);
+  EXPECT_EQ(AlignedBufferBytesFromSizes(sizesC, 1), 64);
 
   static constexpr intptr_t sizesD[7] = {1, -1, 32, -1, 64, 2, 3};
-  EXPECT_EQ(cpu_function_runtime::AlignedBufferBytes(sizesD, 7), 320);
+  EXPECT_EQ(AlignedBufferBytesFromSizes(sizesD, 7), 320);
 }
 
 void* add_ptr(void* base, uintptr_t delta) {
@@ -56,15 +89,14 @@
 // free.  We also check the contiguous property.
 TEST(XlaCompiledCpuFunctionTest, MallocFreeContiguousBuffers) {
   // Test empty sizes.
-  void* base =
-      cpu_function_runtime::MallocContiguousBuffers(nullptr, 0, nullptr, false);
+  void* base = MallocContiguousBuffersFromSizes(nullptr, 0, nullptr, false);
   EXPECT_EQ(base, nullptr);
   cpu_function_runtime::FreeContiguous(base);
 
   // Test non-empty sizes with 0 sum.
   static constexpr intptr_t sizesA[1] = {-1};
   void* bufA[1];
-  base = cpu_function_runtime::MallocContiguousBuffers(sizesA, 1, bufA, false);
+  base = MallocContiguousBuffersFromSizes(sizesA, 1, bufA, false);
   EXPECT_EQ(base, nullptr);
   EXPECT_EQ(bufA[0], nullptr);
   cpu_function_runtime::FreeContiguous(base);
@@ -72,7 +104,7 @@
   // Test non-empty sizes with non-0 sum.
   static constexpr intptr_t sizesB[1] = {3};
   void* bufB[1];
-  base = cpu_function_runtime::MallocContiguousBuffers(sizesB, 1, bufB, false);
+  base = MallocContiguousBuffersFromSizes(sizesB, 1, bufB, false);
   EXPECT_NE(base, nullptr);
   EXPECT_EQ(bufB[0], add_ptr(base, 0));
   char* bufB0_bytes = static_cast<char*>(bufB[0]);
@@ -84,7 +116,7 @@
   // Test non-empty sizes with non-0 sum, and annotate_initialized.
   static constexpr intptr_t sizesC[1] = {3};
   void* bufC[1];
-  base = cpu_function_runtime::MallocContiguousBuffers(sizesC, 1, bufC, true);
+  base = MallocContiguousBuffersFromSizes(sizesC, 1, bufC, true);
   EXPECT_NE(base, nullptr);
   EXPECT_EQ(bufC[0], add_ptr(base, 0));
   char* bufC0_bytes = static_cast<char*>(bufC[0]);
@@ -96,7 +128,7 @@
   // Test mixed sizes.
   static constexpr intptr_t sizesD[7] = {1, -1, 32, -1, 64, 2, 3};
   void* bufD[7];
-  base = cpu_function_runtime::MallocContiguousBuffers(sizesD, 7, bufD, false);
+  base = MallocContiguousBuffersFromSizes(sizesD, 7, bufD, false);
   EXPECT_NE(base, nullptr);
   EXPECT_EQ(bufD[0], add_ptr(base, 0));
   EXPECT_EQ(bufD[1], nullptr);
@@ -117,5 +149,23 @@
   cpu_function_runtime::FreeContiguous(base);
 }
 
+void CheckRoundTripIsOk(const BufferInfo& buffer_info) {
+  BufferInfo round_trip(buffer_info.Encode());
+  ASSERT_EQ(round_trip, buffer_info);
+}
+
+TEST(XlaCompiledCpuFunctionTest, BufferInfoTest) {
+  CheckRoundTripIsOk(BufferInfo::MakeTempBuffer(0));
+  CheckRoundTripIsOk(BufferInfo::MakeTempBuffer(4));
+  CheckRoundTripIsOk(BufferInfo::MakeOnStackBuffer(0));
+  CheckRoundTripIsOk(BufferInfo::MakeOnStackBuffer(4));
+  CheckRoundTripIsOk(BufferInfo::MakeConstant(0));
+  CheckRoundTripIsOk(BufferInfo::MakeConstant(4));
+  CheckRoundTripIsOk(
+      BufferInfo::MakeEntryParameter(/*size=*/0, /*param_number=*/4));
+  CheckRoundTripIsOk(
+      BufferInfo::MakeEntryParameter(/*size=*/4, /*param_number=*/0));
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD
index 3bfe745..b1366e9 100644
--- a/tensorflow/compiler/tf2xla/kernels/BUILD
+++ b/tensorflow/compiler/tf2xla/kernels/BUILD
@@ -6,6 +6,10 @@
 
 load("//tensorflow:tensorflow.bzl", "tf_copts")
 load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
+load(
+    "//third_party/mkl:build_defs.bzl",
+    "if_mkl",
+)
 
 tf_kernel_library(
     name = "xla_ops",
@@ -154,8 +158,14 @@
         "//tensorflow/core/kernels:sparse_to_dense_op",
         "//tensorflow/core/kernels:stack_ops",
         "//tensorflow/core/kernels:training_ops",
-        "//tensorflow/core/kernels:transpose_op",
-    ],
+    ] + if_mkl(
+        [
+            "//tensorflow/core/kernels:mkl_transpose_op",
+        ],
+        [
+            "//tensorflow/core/kernels:transpose_op",
+        ],
+    ),
 )
 
 tf_kernel_library(
diff --git a/tensorflow/compiler/tf2xla/kernels/arg_op.cc b/tensorflow/compiler/tf2xla/kernels/arg_op.cc
index 26fc162..276d744 100644
--- a/tensorflow/compiler/tf2xla/kernels/arg_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/arg_op.cc
@@ -65,6 +65,6 @@
   TF_DISALLOW_COPY_AND_ASSIGN(XlaArgOp);
 };
 
-REGISTER_XLA_OP(Name("_Arg").AllowResourceTypes(), XlaArgOp);
+REGISTER_XLA_OP(Name("_Arg").AllowResourceTypes().CompilationOnly(), XlaArgOp);
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/conv_ops.cc b/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
index 5da7972..674720e 100644
--- a/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
@@ -120,45 +120,30 @@
                  {expanded_filter_shape.dims() - 2});
 }
 
-// Expands a filter of shape [H, W, ..., M, N] to [H, W, ..., M, M*N] by adding
-// zeros for the cross-depth filters. Used to build a depthwise convolution.
-xla::XlaOp ExpandFilterForDepthwiseConvolution(const TensorShape& filter_shape,
-                                               DataType dtype,
-                                               const xla::XlaOp& filter,
-                                               xla::XlaBuilder* builder) {
-  int64 depthwise_multiplier = filter_shape.dim_size(filter_shape.dims() - 1);
-  int64 input_feature = filter_shape.dim_size(filter_shape.dims() - 2);
-  TensorShape expanded_filter_shape =
-      ExpandedFilterShapeForDepthwiseConvolution(filter_shape);
+// Reshapes a filter of shape [H, W, ..., M, N] to [H, W, ..., 1, M*N]. Used to
+// build a depthwise convolution.
+xla::XlaOp ReshapeFilterForDepthwiseConvolution(const TensorShape& filter_shape,
+                                                const xla::XlaOp& filter) {
+  int64 input_feature_dim = filter_shape.dims() - 2;
+  int64 output_feature_dim = filter_shape.dims() - 1;
+  int64 depthwise_multiplier = filter_shape.dim_size(output_feature_dim);
+  int64 input_feature = filter_shape.dim_size(input_feature_dim);
 
   // Create a [H, W, ..., 1, N*M] reshape of the filter.
-  TensorShape implicit_broadcast_filter_shape = expanded_filter_shape;
-  implicit_broadcast_filter_shape.set_dim(
-      implicit_broadcast_filter_shape.dims() - 2, 1);
-  implicit_broadcast_filter_shape.set_dim(
-      implicit_broadcast_filter_shape.dims() - 1,
-      depthwise_multiplier * input_feature);
-  auto implicit_broadcast_filter =
-      xla::Reshape(filter, implicit_broadcast_filter_shape.dim_sizes());
-
-  // Broadcast the filter to  [H, W, ..., M, M*N].
-  auto expanded_zero = CreateExpandedZero(filter_shape, dtype, builder);
-  auto expanded_filter = xla::Add(implicit_broadcast_filter, expanded_zero);
-
-  // If the filter mask is set, choose the broadcasted filter, othwerwise,
-  // choose zero.
-  return xla::Select(CreateExpandedFilterMask(filter_shape, builder),
-                     expanded_filter, expanded_zero);
+  TensorShape implicit_broadcast_filter_shape = filter_shape;
+  implicit_broadcast_filter_shape.set_dim(input_feature_dim, 1);
+  implicit_broadcast_filter_shape.set_dim(output_feature_dim,
+                                          depthwise_multiplier * input_feature);
+  return xla::Reshape(filter, implicit_broadcast_filter_shape.dim_sizes());
 }
 
-// Inverse of ExpandFilterForDepthwiseConvolution.
+// Reduces the results of the convolution with an expanded filter to the
+// non-expanded filter.
 xla::XlaOp ContractFilterForDepthwiseBackprop(XlaOpKernelContext* ctx,
                                               const TensorShape& filter_shape,
                                               DataType dtype,
                                               const xla::XlaOp& filter_backprop,
                                               xla::XlaBuilder* builder) {
-  TensorShape expanded_filter_shape =
-      ExpandedFilterShapeForDepthwiseConvolution(filter_shape);
   auto masked_expanded_filter = xla::Select(
       CreateExpandedFilterMask(filter_shape, builder), filter_backprop,
       CreateExpandedZero(filter_shape, dtype, builder));
@@ -168,8 +153,7 @@
       // ExpandedZero guarantees that only one element is non zero, so there
       // cannot be accumulated precision error.
       xla::Reduce(masked_expanded_filter, XlaHelpers::Zero(builder, dtype),
-                  *ctx->GetOrCreateAdd(dtype),
-                  {expanded_filter_shape.dims() - 2}),
+                  *ctx->GetOrCreateAdd(dtype), {filter_shape.dims() - 2}),
       filter_shape.dim_sizes());
 }
 
@@ -245,15 +229,9 @@
                     "input and filter must have the same depth: ", in_depth,
                     " vs ", input_shape.dim_size(feature_dim)));
 
-    xla::XlaBuilder* b = ctx->builder();
-
     xla::XlaOp filter = ctx->Input(1);
-    TensorShape expanded_filter_shape = filter_shape;
     if (depthwise_) {
-      filter = ExpandFilterForDepthwiseConvolution(
-          filter_shape, ctx->input_type(0), filter, b);
-      expanded_filter_shape =
-          ExpandedFilterShapeForDepthwiseConvolution(filter_shape);
+      filter = ReshapeFilterForDepthwiseConvolution(filter_shape, filter);
     }
 
     xla::ConvolutionDimensionNumbers dims;
@@ -280,14 +258,15 @@
       int64 unused_output_size;
       OP_REQUIRES_OK(
           ctx, GetWindowedOutputSizeVerboseV2(
-                   input_shape.dim_size(dim), expanded_filter_shape.dim_size(i),
+                   input_shape.dim_size(dim), filter_shape.dim_size(i),
                    rhs_dilation[i], window_strides[i], padding_,
                    &unused_output_size, &padding[i].first, &padding[i].second));
     }
 
-    xla::XlaOp conv =
-        xla::ConvGeneralDilated(ctx->Input(0), filter, window_strides, padding,
-                                lhs_dilation, rhs_dilation, dims);
+    xla::XlaOp conv = xla::ConvGeneralDilated(
+        ctx->Input(0), filter, window_strides, padding, lhs_dilation,
+        rhs_dilation, dims,
+        /*feature_group_count=*/depthwise_ ? in_depth : 1);
     ctx->SetOutput(0, conv);
   }
 
@@ -388,7 +367,6 @@
                        expanded_filter_shape, out_backprop_shape, dilations_,
                        strides_, padding_, data_format_, &dims));
 
-    xla::XlaBuilder* b = ctx->builder();
     auto filter = ctx->Input(1);
     auto out_backprop = ctx->Input(2);
 
@@ -425,12 +403,6 @@
       rhs_dilation[i] = dilations_[dim];
     }
 
-    // If this is a depthwise convolution, expand the filter.
-    if (depthwise_) {
-      filter = ExpandFilterForDepthwiseConvolution(
-          filter_shape, ctx->input_type(1), filter, b);
-    }
-
     // Mirror the filter in the spatial dimensions.
     xla::XlaOp mirrored_weights = xla::Rev(filter, kernel_spatial_dims);
 
@@ -438,7 +410,11 @@
     //   = gradients (with padding and dilation) <conv> mirrored_weights
     xla::XlaOp in_backprop = xla::ConvGeneralDilated(
         out_backprop, mirrored_weights, /*window_strides=*/ones, padding,
-        lhs_dilation, rhs_dilation, dnums);
+        lhs_dilation, rhs_dilation, dnums,
+        /*feature_group_count=*/
+        depthwise_ ? out_backprop_shape.dim_size(feature_dim) /
+                         filter_shape.dim_size(num_spatial_dims_ + 1)
+                   : 1);
 
     ctx->SetOutput(0, in_backprop);
   }
diff --git a/tensorflow/compiler/tf2xla/kernels/gather_op.cc b/tensorflow/compiler/tf2xla/kernels/gather_op.cc
index 35de96e..4414030 100644
--- a/tensorflow/compiler/tf2xla/kernels/gather_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/gather_op.cc
@@ -95,11 +95,11 @@
   //  operand = s32[3,3] parameter(0)
   //  indices = s32[2] parameter(1)
   //  gather = s32[3,2] gather(operand, indices),
-  //       output_window_dims={0},
-  //       elided_window_dims={1},
-  //       gather_dims_to_operand_dims={1},
+  //       offset_dims={0},
+  //       collapsed_slice_dims={1},
+  //       start_index_map={1},
   //       index_vector_dim=1,
-  //       window_bounds={3, 1}
+  //       slice_sizes={3, 1}
   //
   //
   // Example of an N-D gather pulling out slices of shape [1,1,2] out of a
@@ -108,42 +108,42 @@
   //  operand = s32[3,3,2] parameter(0)
   //  indices = s32[2,2] parameter(1)
   //  gather = s32[2,2] gather(operand, indices),
-  //       output_window_dims={1},
-  //       elided_window_dims={0,1},
-  //       gather_dims_to_operand_dims={0,1},
+  //       offset_dims={1},
+  //       collapsed_slice_dims={0,1},
+  //       start_index_map={0,1},
   //       index_vector_dim=0,
-  //       window_bounds={1,1,2}
+  //       slice_sizes={1,1,2}
 
   xla::GatherDimensionNumbers dim_numbers;
-  std::vector<int64> window_bounds;
-  window_bounds.reserve(input_shape.dims());
+  std::vector<int64> slice_sizes;
+  slice_sizes.reserve(input_shape.dims());
   for (int64 i = 0; i < input_shape.dims(); i++) {
     int64 window_bound;
     if (axis <= i && i < (axis + num_index_dims)) {
-      dim_numbers.add_elided_window_dims(i);
+      dim_numbers.add_collapsed_slice_dims(i);
       window_bound = 1;
     } else {
       window_bound = input_shape.dim_size(i);
     }
 
-    window_bounds.push_back(window_bound);
+    slice_sizes.push_back(window_bound);
 
     if (i < axis) {
-      dim_numbers.add_output_window_dims(i);
+      dim_numbers.add_offset_dims(i);
     } else if (i >= (axis + num_index_dims)) {
       int64 indices_rank =
           indices_are_nd ? (indices_shape.dims() - 1) : indices_shape.dims();
-      dim_numbers.add_output_window_dims(i + indices_rank - num_index_dims);
+      dim_numbers.add_offset_dims(i + indices_rank - num_index_dims);
     }
   }
 
   dim_numbers.set_index_vector_dim(indices_are_nd ? (indices_shape.dims() - 1)
                                                   : indices_shape.dims());
   for (int64 i = axis; i < axis + num_index_dims; i++) {
-    dim_numbers.add_gather_dims_to_operand_dims(i);
+    dim_numbers.add_start_index_map(i);
   }
 
-  *gather_output = xla::Gather(input, indices, dim_numbers, window_bounds);
+  *gather_output = xla::Gather(input, indices, dim_numbers, slice_sizes);
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/tf2xla/kernels/if_op.cc b/tensorflow/compiler/tf2xla/kernels/if_op.cc
index ceb2af7..6a7eb8d 100644
--- a/tensorflow/compiler/tf2xla/kernels/if_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/if_op.cc
@@ -200,25 +200,35 @@
     }
   }
 
+  bool resource_variable_seen = false;
+  for (int i = 0; i < ctx->num_inputs(); ++i) {
+    if (ctx->input_type(i) == DT_RESOURCE) {
+      resource_variable_seen = true;
+    } else {
+      OP_REQUIRES(
+          ctx, !resource_variable_seen,
+          errors::FailedPrecondition(
+              "Resource variables and regular inputs cannot be interleaved."));
+    }
+  }
+
   xla::XlaOp outputs = xla::Conditional(
       ctx->Input(0), xla::Tuple(b, inputs), *then_result.computation,
       xla::Tuple(b, inputs), *else_result.computation);
   // Sets non-variable outputs.
   for (int i = 0; i < output_types_.size(); ++i) {
-    if (ctx->input_type(i) != DT_RESOURCE) {
-      xla::XlaOp output_handle = xla::GetTupleElement(outputs, i);
-      if (VLOG_IS_ON(2)) {
-        LOG(INFO) << "Setting output " << i;
-        auto shape_or = b->GetShape(output_handle);
-        if (shape_or.ok()) {
-          LOG(INFO) << "Shape for output " << i << ": "
-                    << xla::ShapeUtil::HumanString(shape_or.ValueOrDie());
-        } else {
-          LOG(INFO) << "Shape unknown for output " << i;
-        }
+    xla::XlaOp output_handle = xla::GetTupleElement(outputs, i);
+    if (VLOG_IS_ON(2)) {
+      LOG(INFO) << "Setting output " << i;
+      auto shape_or = b->GetShape(output_handle);
+      if (shape_or.ok()) {
+        LOG(INFO) << "Shape for output " << i << ": "
+                  << xla::ShapeUtil::HumanString(shape_or.ValueOrDie());
+      } else {
+        LOG(INFO) << "Shape unknown for output " << i;
       }
-      ctx->SetOutput(i, output_handle);
     }
+    ctx->SetOutput(i, output_handle);
   }
 
   // Updates the values of any resource variables modified by the conditional
@@ -247,6 +257,7 @@
 }
 
 REGISTER_XLA_OP(Name("If").AllowResourceTypes(), XlaIfOp);
+REGISTER_XLA_OP(Name("StatelessIf").AllowResourceTypes(), XlaIfOp);
 REGISTER_XLA_OP(Name("XlaIf").AllowResourceTypes(), XlaIfOp);
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/retval_op.cc b/tensorflow/compiler/tf2xla/kernels/retval_op.cc
index 1911e6e..64900e4 100644
--- a/tensorflow/compiler/tf2xla/kernels/retval_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/retval_op.cc
@@ -104,7 +104,7 @@
   TF_DISALLOW_COPY_AND_ASSIGN(RetvalOp);
 };
 
-REGISTER_XLA_OP(Name("_Retval"), RetvalOp);
+REGISTER_XLA_OP(Name("_Retval").CompilationOnly(), RetvalOp);
 
 }  // anonymous namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/reverse_op.cc b/tensorflow/compiler/tf2xla/kernels/reverse_op.cc
index d962ef4..c0afcca 100644
--- a/tensorflow/compiler/tf2xla/kernels/reverse_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/reverse_op.cc
@@ -95,10 +95,24 @@
     std::vector<int64> axes;
     OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(1, &axes));
 
+    // witnessed_axes is used to ensure that the same axis is not marked to be
+    // reversed multiple times.
+    gtl::InlinedVector<bool, 8> witnessed_axes(x_shape.dims(), false);
+
     for (int d = 0; d < axes.size(); ++d) {
-      OP_REQUIRES(ctx, (0 <= axes[d]) && (axes[d] < x_shape.dims()),
-                  errors::InvalidArgument(axes[d], " is out of range [0, ",
-                                          x_shape.dims(), ")."));
+      OP_REQUIRES(
+          ctx, (-x_shape.dims() <= axes[d]) && (axes[d] < x_shape.dims()),
+          errors::InvalidArgument(axes[d], " is out of range [-",
+                                  x_shape.dims(), ", ", x_shape.dims(), ")."));
+      // Axes can be negative and are shifted to the canonical index before
+      // being lowered to HLO.
+      if (axes[d] < 0) {
+        axes[d] += x_shape.dims();
+      }
+      OP_REQUIRES(ctx, !witnessed_axes[axes[d]],
+                  errors::InvalidArgument("canonicalized axis ", axes[d],
+                                          " was repeated."));
+      witnessed_axes[axes[d]] = true;
     }
 
     ctx->SetOutput(0, xla::Rev(ctx->Input(0), axes));
diff --git a/tensorflow/compiler/tf2xla/kernels/tile_ops.cc b/tensorflow/compiler/tf2xla/kernels/tile_ops.cc
index 1233a37..2c7213f 100644
--- a/tensorflow/compiler/tf2xla/kernels/tile_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/tile_ops.cc
@@ -70,7 +70,7 @@
     bool one_dimension_is_broadcasted_without_multiple = true;
     for (int i = 0; i < input_dims; ++i) {
       int multiple = literal.Get<int>({i});
-      OP_REQUIRES(ctx, multiple,
+      OP_REQUIRES(ctx, multiple >= 0,
                   errors::InvalidArgument("Expected multiples[", i,
                                           "] >= 0, but got ", multiple));
       int64 new_dim = input_shape.dim_size(i) * multiple;
diff --git a/tensorflow/compiler/tf2xla/kernels/while_op.cc b/tensorflow/compiler/tf2xla/kernels/while_op.cc
index 1e8a376..2965182 100644
--- a/tensorflow/compiler/tf2xla/kernels/while_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/while_op.cc
@@ -301,6 +301,7 @@
 }
 
 REGISTER_XLA_OP(Name("While").AllowResourceTypes(), XlaWhileOp);
+REGISTER_XLA_OP(Name("StatelessWhile").AllowResourceTypes(), XlaWhileOp);
 REGISTER_XLA_OP(Name("XlaWhile").AllowResourceTypes(), XlaWhileOp);
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/lib/triangular_solve.cc b/tensorflow/compiler/tf2xla/lib/triangular_solve.cc
index 04fa101..febb638 100644
--- a/tensorflow/compiler/tf2xla/lib/triangular_solve.cc
+++ b/tensorflow/compiler/tf2xla/lib/triangular_solve.cc
@@ -57,7 +57,7 @@
     // We can grab entire blocks using gather
     if (n > block_size) {
       // Construct the starting indices of the diagonal blocks
-      auto gather_indices =
+      auto start_indices =
           Transpose(Broadcast(Mul(Iota(builder, xla::S32, num_blocks),
                                   xla::ConstantR0<int32>(builder, block_size)),
                               /*broadcast_sizes=*/{2}),
@@ -65,13 +65,13 @@
 
       // Gather the diagonal blocks
       xla::GatherDimensionNumbers dim_numbers;
-      dim_numbers.add_output_window_dims(ndims - 1);
-      dim_numbers.add_output_window_dims(ndims);
-      dim_numbers.add_gather_dims_to_operand_dims(ndims - 2);
-      dim_numbers.add_gather_dims_to_operand_dims(ndims - 1);
+      dim_numbers.add_offset_dims(ndims - 1);
+      dim_numbers.add_offset_dims(ndims);
+      dim_numbers.add_start_index_map(ndims - 2);
+      dim_numbers.add_start_index_map(ndims - 1);
       dim_numbers.set_index_vector_dim(1);
-      diag_blocks = Gather(a, gather_indices, dim_numbers,
-                           /*window_bounds=*/{block_size, block_size});
+      diag_blocks = Gather(a, start_indices, dim_numbers,
+                           /*slice_sizes=*/{block_size, block_size});
     }
 
     // The last block might be smaller than the block size,
diff --git a/tensorflow/compiler/tf2xla/literal_util.cc b/tensorflow/compiler/tf2xla/literal_util.cc
index 2fb6691..77da1bf 100644
--- a/tensorflow/compiler/tf2xla/literal_util.cc
+++ b/tensorflow/compiler/tf2xla/literal_util.cc
@@ -32,6 +32,23 @@
   return Status::OK();
 }
 
+Status HostTensorToMutableBorrowingLiteral(
+    Tensor* host_tensor, xla::MutableBorrowingLiteral* literal) {
+  xla::Shape xla_shape;
+  TF_RETURN_IF_ERROR(TensorShapeToXLAShape(host_tensor->dtype(),
+                                           host_tensor->shape(), &xla_shape));
+  return HostTensorToMutableBorrowingLiteral(xla_shape, host_tensor, literal);
+}
+
+Status HostTensorToMutableBorrowingLiteral(
+    const xla::Shape& xla_shape, Tensor* host_tensor,
+    xla::MutableBorrowingLiteral* literal) {
+  *literal = xla::MutableBorrowingLiteral(
+      static_cast<const char*>(DMAHelper::base(host_tensor)), xla_shape);
+
+  return Status::OK();
+}
+
 Status HostTensorsToBorrowingLiteralTuple(
     tensorflow::gtl::ArraySlice<Tensor> host_tensors,
     xla::BorrowingLiteral* literal) {
diff --git a/tensorflow/compiler/tf2xla/literal_util.h b/tensorflow/compiler/tf2xla/literal_util.h
index 0610a57..09d6fa8 100644
--- a/tensorflow/compiler/tf2xla/literal_util.h
+++ b/tensorflow/compiler/tf2xla/literal_util.h
@@ -30,6 +30,16 @@
 // 'host_tensor'.
 Status HostTensorToBorrowingLiteral(const Tensor& host_tensor,
                                     xla::BorrowingLiteral* literal);
+// Returns a MutableBorrowingLiteral that utilizes the same underlying buffer
+// owned by 'host_tensor', but is mutable via the xla::Literal methods.
+Status HostTensorToMutableBorrowingLiteral(
+    Tensor* host_tensor, xla::MutableBorrowingLiteral* literal);
+// Similar as above, except the literal shape is explicitly provided and used
+// instead of obtaining it from the 'host_tensor'. The provided literal shape
+// 'xla_shape' must be compatible with the shape of 'host_tensor'.
+Status HostTensorToMutableBorrowingLiteral(
+    const xla::Shape& xla_shape, Tensor* host_tensor,
+    xla::MutableBorrowingLiteral* literal);
 
 // Returns a BorrowingLiteral tuple that utilizes the same underlying buffers
 // owned by 'host_tensors'.
diff --git a/tensorflow/compiler/tf2xla/tf2xla_util.cc b/tensorflow/compiler/tf2xla/tf2xla_util.cc
index 9203e8d..0e07485 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_util.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla_util.cc
@@ -16,6 +16,7 @@
 #include "tensorflow/compiler/tf2xla/tf2xla_util.h"
 
 #include <queue>
+#include <random>
 #include <set>
 #include <unordered_map>
 
@@ -297,4 +298,29 @@
   }
 }
 
+namespace {
+uint32 InitialRandomSeed() {
+  // Support plumbing the TF seed through to XLA is being worked on.
+  // If a user wants deterministic behavior, their best option
+  // is to start with a known checkpoint. This also handles issues when
+  // multiple random calls can be invoked in any order by TF executor.
+  // Another option is to use stateless random ops. They have much cleaner
+  // semantics.
+  // If a user really wants to set a deterministic seed for XLA-based
+  // devices, this is the place to do it.
+  std::random_device rd;
+  // Make the starting value odd.
+  return rd() | 1;
+}
+}  // namespace
+
+uint32 GetXLARandomSeed() {
+  // We initialize counter with an odd number and increment it by two
+  // everytime. This ensures that it will never be zero, even
+  // after an overflow. When seeded with zero, some XLA backends
+  // can return all zeros instead of random numbers.
+  static std::atomic<uint32> counter(InitialRandomSeed());
+  return counter.fetch_add(2);
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/tf2xla_util.h b/tensorflow/compiler/tf2xla/tf2xla_util.h
index 745beb3..33620ef 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_util.h
+++ b/tensorflow/compiler/tf2xla/tf2xla_util.h
@@ -56,6 +56,9 @@
 void AddDtypeToKernalDefConstraint(StringPiece name, DataType dtype,
                                    KernelDef* kdef);
 
+// Returns the next random seed to use for seeding xla rng.
+uint32 GetXLARandomSeed();
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_COMPILER_TF2XLA_TF2XLA_UTIL_H_
diff --git a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc
index 3344591..1f0f240 100644
--- a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc
@@ -14,7 +14,6 @@
 ==============================================================================*/
 
 #include "tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h"
-#include "tensorflow/compiler/tf2xla/cpu_function_runtime.h"
 
 #include <cassert>
 
@@ -22,61 +21,42 @@
 
 XlaCompiledCpuFunction::XlaCompiledCpuFunction(const StaticData& static_data,
                                                AllocMode alloc_mode)
-    : raw_function_(static_data.raw_function),
-      result_index_(static_data.result_index),
-      args_(new void*[static_data.num_args]),
-      temps_(new void*[static_data.num_temps]),
-      arg_index_to_temp_index_(new int32[static_data.num_args]),
-      num_args_(static_data.num_args),
-      arg_names_(static_data.arg_names),
-      result_names_(static_data.result_names),
-      program_shape_(static_data.program_shape),
-      hlo_profile_printer_data_(static_data.hlo_profile_printer_data) {
+    : raw_function_(static_data.raw_function_),
+      result_index_(static_data.result_index_),
+      buffer_table_(new void*[static_data.num_buffers_]),
+      buffer_infos_(static_data.buffer_infos_),
+      arg_index_table_(static_data.arg_index_table_),
+      num_args_(static_data.num_args_),
+      arg_names_(static_data.arg_names_),
+      result_names_(static_data.result_names_),
+      program_shape_(static_data.program_shape_),
+      hlo_profile_printer_data_(static_data.hlo_profile_printer_data_) {
+  bool allocate_entry_params =
+      alloc_mode == AllocMode::ARGS_RESULTS_PROFILES_AND_TEMPS;
   // Allocate arg and temp buffers.
-  if (alloc_mode == AllocMode::ARGS_RESULTS_PROFILES_AND_TEMPS) {
-    alloc_args_ = cpu_function_runtime::MallocContiguousBuffers(
-        static_data.arg_sizes, static_data.num_args, args_,
-        /*annotate_initialized=*/false);
-  }
-  alloc_temps_ = cpu_function_runtime::MallocContiguousBuffers(
-      static_data.temp_sizes, static_data.num_temps, temps_,
+  alloc_buffer_table_ = cpu_function_runtime::MallocContiguousBuffers(
+      static_data.buffer_infos_, static_data.num_buffers_,
+      /*allocate_entry_params=*/allocate_entry_params, buffer_table_,
       /*annotate_initialized=*/true);
-
-  for (int i = 0; i < static_data.num_temps; i++) {
-    if (static_data.temp_sizes[i] < -1) {
-      int32 param_number = -(static_data.temp_sizes[i] + 2);
-      arg_index_to_temp_index_[param_number] = i;
-    }
-  }
-
   // If Hlo profiling is enabled the generated code expects an appropriately
   // sized buffer to be passed in as the last argument.  If Hlo profiling is
   // disabled the last function argument is still present in the function
   // signature, but it is ignored by the generated code and we pass in null for
   // it.
   if (hlo_profiling_enabled()) {
-    profile_counters_ = new int64[static_data.profile_counters_size]();
+    profile_counters_ = new int64[static_data.profile_counters_size_]();
   }
 }
 
 bool XlaCompiledCpuFunction::Run() {
-  // Propagate pointers to the argument buffers into the temps array.  Code
-  // generated by XLA discovers the incoming argument pointers from the temps
-  // array.
-  for (int32 i = 0; i < num_args_; i++) {
-    temps_[arg_index_to_temp_index_[i]] = args_[i];
-  }
-  raw_function_(temps_[result_index_], &run_options_, nullptr, temps_,
-                profile_counters_);
+  raw_function_(buffer_table_[result_index_], &run_options_, nullptr,
+                buffer_table_, profile_counters_);
   return true;
 }
 
 XlaCompiledCpuFunction::~XlaCompiledCpuFunction() {
-  cpu_function_runtime::FreeContiguous(alloc_args_);
-  cpu_function_runtime::FreeContiguous(alloc_temps_);
-  delete[] args_;
-  delete[] temps_;
-  delete[] arg_index_to_temp_index_;
+  cpu_function_runtime::FreeContiguous(alloc_buffer_table_);
+  delete[] buffer_table_;
   delete[] profile_counters_;
 }
 
diff --git a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h
index 27cfb35..425e769 100644
--- a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h
+++ b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h
@@ -19,6 +19,7 @@
 #include <cassert>
 #include <string>
 
+#include "tensorflow/compiler/tf2xla/cpu_function_runtime.h"
 #include "tensorflow/compiler/xla/executable_run_options.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -56,46 +57,85 @@
   // StaticData represents the state necessary to run an XLA-compiled
   // function. For JIT this is backed by data in XlaJitCompiledCpuFunction; for
   // AOT this is backed by data compiled into the object file.
-  struct StaticData {
+  //
+  // The contents of StaticData are XLA-internal implementation details and
+  // should not be relied on by clients.
+  //
+  // TODO(sanjoy): Come up with a cleaner way to express the contraint we want
+  // here: generated XlaCompiledCpuFunction subclasses should be able to create
+  // instances of StaticData but only XlaCompiledCpuFunction should be able to
+  // read from StaticData instances.
+  class StaticData {
+   public:
+    void set_raw_function(RawFunction raw_function) {
+      raw_function_ = raw_function;
+    }
+    void set_buffer_infos(
+        const cpu_function_runtime::BufferInfo* buffer_infos) {
+      buffer_infos_ = buffer_infos;
+    }
+    void set_num_buffers(size_t num_buffers) { num_buffers_ = num_buffers; }
+    void set_arg_index_table(const int32* arg_index_table) {
+      arg_index_table_ = arg_index_table;
+    }
+    void set_num_args(int64 num_args) { num_args_ = num_args; }
+    void set_result_index(size_t result_index) { result_index_ = result_index; }
+    void set_arg_names(const char** arg_names) { arg_names_ = arg_names; }
+    void set_result_names(const char** result_names) {
+      result_names_ = result_names;
+    }
+    void set_program_shape(const xla::ProgramShape* program_shape) {
+      program_shape_ = program_shape;
+    }
+    const xla::HloProfilePrinterData* hlo_profile_printer_data() const {
+      return hlo_profile_printer_data_;
+    }
+    void set_hlo_profile_printer_data(
+        const xla::HloProfilePrinterData* hlo_profile_printer_data) {
+      hlo_profile_printer_data_ = hlo_profile_printer_data;
+    }
+    void set_profile_counters_size(int64 profile_counters_size) {
+      profile_counters_size_ = profile_counters_size;
+    }
+
+   private:
     // The raw function to call.
-    RawFunction raw_function;
+    RawFunction raw_function_;
 
-    // Cardinality and size of arg buffers.
-    const intptr_t* arg_sizes = nullptr;
-    size_t num_args = 0;
+    // Contains information about the buffers used by the XLA computation.
+    const cpu_function_runtime::BufferInfo* buffer_infos_ = nullptr;
+    size_t num_buffers_ = 0;
 
-    // Cardinality and size of temp buffers.
-    //
-    // If temp_sizes[i] >= 0 then the i'th temp is a regular temporary buffer.
-    //
-    // If temp_sizes[i] == -1 then the i'th temp is a constant buffer.  The
-    // corresponding entry in the temp buffer array needs to be set to null.
-    //
-    // If temp_sizes[i] < -1 then the i'th temp is the entry parameter
-    // -(temp_sizes[i] + 2).
-    const intptr_t* temp_sizes = nullptr;
-    size_t num_temps = 0;
+    // Entry parameter i is described by
+    // buffer_infos[arg_index_table[i]].
+    const int32* arg_index_table_ = nullptr;
+
+    // There are num_args entry parameters.
+    int64 num_args_ = 0;
 
     // The 0-based index of the result tuple, in the temp buffers.
-    size_t result_index = 0;
+    size_t result_index_ = 0;
 
     // [Optional] Arrays of arg and result names. These are arrays of C-style
     // strings, where the array is terminated by nullptr.
-    const char** arg_names = nullptr;
-    const char** result_names = nullptr;
+    const char** arg_names_ = nullptr;
+    const char** result_names_ = nullptr;
 
     // [Optional] Arg and result shapes.
-    const xla::ProgramShape* program_shape = nullptr;
+    const xla::ProgramShape* program_shape_ = nullptr;
 
     // [Optional] Profile printer data.  Null if profiling is disabled.
-    const xla::HloProfilePrinterData* hlo_profile_printer_data = nullptr;
+    const xla::HloProfilePrinterData* hlo_profile_printer_data_ = nullptr;
 
     // [Optional] The number of profile counters expected in the profile counter
     // buffer by the generated code and hlo_profile_printer.  0 if profiling is
     // disabled.  This information is already present in
     // hlo_profile_printer_data but xla::HloProfilePrinterData is forward
     // declared so we don't have access to that information here.
-    int64 profile_counters_size = 0;
+    int64 profile_counters_size_ = 0;
+
+    // Only XlaCompiledCpuFunction is allowed to read the above fields.
+    friend class XlaCompiledCpuFunction;
   };
 
   // AllocMode controls the buffer allocation mode.
@@ -135,14 +175,25 @@
   // ------------------------------
   // Arg methods for managing input buffers. Buffers are in row-major order.
 
-  // Returns the underlying array of argument buffers, where args()[I] is the
-  // buffer for the positional argument at index I.
-  void** args() { return args_; }
-  const void* const* args() const { return args_; }
-
   // Returns the buffer for the positional argument at the given `index`.
-  void* arg_data(size_t index) { return args_[index]; }
-  const void* arg_data(size_t index) const { return args_[index]; }
+  void* arg_data(size_t index) {
+    return buffer_table_[arg_index_table_[index]];
+  }
+  const void* arg_data(size_t index) const {
+    return buffer_table_[arg_index_table_[index]];
+  }
+
+  int num_args() const { return num_args_; }
+
+  // Returns the size of entry parameter `idx`.
+  //
+  // There is a static version of this method on tfcompile generated subclasses
+  // of XlaCompiledCpuFunction, but try to prefer this when possible since it
+  // works both for XlaJitCompiledCpuFunction and AOT compiled subclasses.
+  int arg_size(int idx) const {
+    assert(idx < num_args());
+    return buffer_infos_[arg_index_table_[idx]].size();
+  }
 
   // Sets the buffer for the positional argument at the given `index` to `data`.
   // Must be called before Run to have an effect. May be called under any
@@ -155,7 +206,9 @@
   //
   // Aliasing of argument and result buffers is not allowed, and results in
   // undefined behavior.
-  void set_arg_data(size_t index, void* data) { args_[index] = data; }
+  void set_arg_data(size_t index, void* data) {
+    buffer_table_[arg_index_table_[index]] = data;
+  }
 
   // ------------------------------
   // Result methods for managing output buffers. Buffers are in row-major order.
@@ -165,9 +218,9 @@
 
   // Returns the underlying array of result buffers, where results()[I] is the
   // buffer for the positional result at index I.
-  void** results() { return static_cast<void**>(temps_[result_index_]); }
+  void** results() { return static_cast<void**>(buffer_table_[result_index_]); }
   const void* const* results() const {
-    return static_cast<const void* const*>(temps_[result_index_]);
+    return static_cast<const void* const*>(buffer_table_[result_index_]);
   }
 
   // Profile counters for this XLA computation.
@@ -225,25 +278,28 @@
   const RawFunction raw_function_;
   const size_t result_index_;
 
-  // Arrays of argument and temp buffers; entries in args_ may be overwritten by
-  // the user.
-  void** args_ = nullptr;
-  void** temps_ = nullptr;
+  // Array containing pointers to argument and temp buffers (slots corresponding
+  // to constant and on-stack buffers are null).
+  void** const buffer_table_;
 
-  // Argument i needs to be placed in temps_[arg_index_to_temp_index_[i]] for
-  // XLA generated code to be able to find it.
+  // Describes the buffers used by the XLA computation.
+  const cpu_function_runtime::BufferInfo* const buffer_infos_;
+
+  // Argument i needs to be placed in buffer_table_[arg_index_to_temp_index_[i]]
+  // for XLA generated code to be able to find it.
   //
   // For now we need to keep around the args_ array because there is code that
   // depends on args() returning a void**.  However, in the future we may remove
-  // args_ in favor of using temps_ as the sole storage for the arguments.
-  int32* arg_index_to_temp_index_;
+  // args_ in favor of using buffer_table_ as the sole storage for the
+  // arguments.
+  const int32* const arg_index_table_;
 
   // The number of incoming arguments.
-  int32 num_args_;
+  const int32 num_args_;
 
-  // Backing memory for individual arg and temp buffers.
-  void* alloc_args_ = nullptr;
-  void* alloc_temps_ = nullptr;
+  // Backing memory for buffer_table_ and args_, the latter depending on
+  // AllocMode.
+  void* alloc_buffer_table_ = nullptr;
 
   // Backing memory for profiling counters.
   int64* profile_counters_ = nullptr;
diff --git a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc
index 114a924..86a78ee 100644
--- a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc
+++ b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc
@@ -24,6 +24,7 @@
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
+#include "tensorflow/compiler/xla/service/cpu/buffer_info_util.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_executable.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -35,45 +36,6 @@
 namespace tensorflow {
 
 namespace {
-
-// Returns a vector of positional argument buffer sizes.
-xla::StatusOr<std::vector<intptr_t>> ComputeArgSizes(
-    const xla::ProgramShape& program_shape) {
-  std::vector<intptr_t> arg_sizes;
-  const size_t num_args = program_shape.parameters_size();
-  arg_sizes.reserve(num_args);
-  for (int i = 0; i < num_args; ++i) {
-    const xla::Shape& arg_shape = program_shape.parameters(i);
-    constexpr size_t kPointerSize = sizeof(void*);
-    arg_sizes.push_back(xla::ShapeUtil::ByteSizeOf(arg_shape, kPointerSize));
-  }
-  return std::move(arg_sizes);
-}
-
-// Returns a vector of positional temporary buffer sizes.
-xla::StatusOr<std::vector<intptr_t>> ComputeTempSizes(
-    const xla::BufferAssignment& buffer_assignment) {
-  const std::vector<xla::BufferAllocation>& allocations =
-      buffer_assignment.Allocations();
-  std::vector<intptr_t> temp_sizes;
-  temp_sizes.reserve(allocations.size());
-  for (const xla::BufferAllocation& allocation : allocations) {
-    if (allocation.is_constant() || allocation.is_thread_local()) {
-      // Constants are lowered to globals.  Thread locals are lowered to
-      // allocas.
-      temp_sizes.push_back(-1);
-    } else if (allocation.is_entry_computation_parameter()) {
-      // Entry computation parameters need some preprocessing in
-      // XlaCompiledCpuFunction::Run.  See the comment on
-      // XlaCompiledCpuFunction::StaticData::temp_sizes.
-      temp_sizes.push_back(-allocation.parameter_number() - 2);
-    } else {
-      temp_sizes.push_back(allocation.size());
-    }
-  }
-  return std::move(temp_sizes);
-}
-
 // Returns the index of the result in the temp buffers.
 xla::StatusOr<size_t> ComputeResultIndex(
     const xla::BufferAssignment& buffer_assignment) {
@@ -157,11 +119,11 @@
   const xla::BufferAssignment& buffer_assignment =
       cpu_executable->buffer_assignment();
 
-  // Compute buffer sizes and the result index, needed to run the raw function.
-  TF_ASSIGN_OR_RETURN(std::vector<intptr_t> arg_sizes,
-                      ComputeArgSizes(*program_shape));
-  TF_ASSIGN_OR_RETURN(std::vector<intptr_t> temp_sizes,
-                      ComputeTempSizes(buffer_assignment));
+  // Compute buffer infos and the result index, needed to run the raw function.
+  std::vector<cpu_function_runtime::BufferInfo> buffer_infos =
+      xla::cpu::CreateBufferInfosFromBufferAssignment(buffer_assignment);
+  std::vector<int32> arg_index_table =
+      xla::cpu::CreateArgIndexTableFromBufferInfos(buffer_infos);
   TF_ASSIGN_OR_RETURN(size_t result_index,
                       ComputeResultIndex(buffer_assignment));
 
@@ -169,28 +131,28 @@
       new XlaJitCompiledCpuFunction);
   XlaJitCompiledCpuFunction* jit = jit_unique_ptr.get();
   jit->executable_ = std::move(executable);
-  jit->arg_sizes_ = std::move(arg_sizes);
-  jit->temp_sizes_ = std::move(temp_sizes);
+  jit->buffer_infos_ = std::move(buffer_infos);
+  jit->arg_index_table_ = std::move(arg_index_table);
   jit->program_shape_ = std::move(program_shape);
-  jit->static_data_.raw_function = std::move(raw_function);
-  jit->static_data_.arg_sizes = jit->arg_sizes_.data();
-  jit->static_data_.num_args = jit->arg_sizes_.size();
-  jit->static_data_.temp_sizes = jit->temp_sizes_.data();
-  jit->static_data_.num_temps = jit->temp_sizes_.size();
-  jit->static_data_.result_index = result_index;
+  jit->static_data_.set_raw_function(raw_function);
+  jit->static_data_.set_buffer_infos(jit->buffer_infos_.data());
+  jit->static_data_.set_num_buffers(jit->buffer_infos_.size());
+  jit->static_data_.set_arg_index_table(jit->arg_index_table_.data());
+  jit->static_data_.set_num_args(jit->arg_index_table_.size());
+  jit->static_data_.set_result_index(result_index);
   // Optional metadata is collected and set below.
   CollectNames(config.feed(), &jit->nonempty_arg_names_, &jit->arg_names_);
   CollectNames(config.fetch(), &jit->nonempty_result_names_,
                &jit->result_names_);
-  jit->static_data_.arg_names = jit->arg_names_.data();
-  jit->static_data_.result_names = jit->result_names_.data();
-  jit->static_data_.program_shape = jit->program_shape_.get();
+  jit->static_data_.set_arg_names(jit->arg_names_.data());
+  jit->static_data_.set_result_names(jit->result_names_.data());
+  jit->static_data_.set_program_shape(jit->program_shape_.get());
 
   if (cpu_executable->hlo_profiling_enabled()) {
-    jit->static_data_.hlo_profile_printer_data =
-        &cpu_executable->hlo_profile_printer_data();
-    jit->static_data_.profile_counters_size =
-        cpu_executable->hlo_profile_printer_data().profile_counters_size();
+    jit->static_data_.set_hlo_profile_printer_data(
+        &cpu_executable->hlo_profile_printer_data());
+    jit->static_data_.set_profile_counters_size(
+        cpu_executable->hlo_profile_printer_data().profile_counters_size());
   }
 
   return std::move(jit_unique_ptr);
diff --git a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.h b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.h
index af307ae..d3c8f22 100644
--- a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.h
+++ b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.h
@@ -66,9 +66,11 @@
   // The static data is backed by the rest of the state in this class.
   XlaCompiledCpuFunction::StaticData static_data_;
 
-  // The backing arrays of arg and temp buffer sizes.
-  std::vector<intptr_t> arg_sizes_;
-  std::vector<intptr_t> temp_sizes_;
+  // The backing array for buffer infos.
+  std::vector<cpu_function_runtime::BufferInfo> buffer_infos_;
+
+  // The backing array for the arg index table.
+  std::vector<int32> arg_index_table_;
 
   // The backing arrays of arg and result names. We hold the actual strings in
   // nonempty_*_names_, and hold arrays of pointers in *_names_ for the static
diff --git a/tensorflow/compiler/xla/array.h b/tensorflow/compiler/xla/array.h
index ea75ad3..2d5d078 100644
--- a/tensorflow/compiler/xla/array.h
+++ b/tensorflow/compiler/xla/array.h
@@ -409,7 +409,7 @@
 
   // Returns the total number of elements in the array.
   int64 num_elements() const {
-    return std::accumulate(sizes_.begin(), sizes_.end(), 1,
+    return std::accumulate(sizes_.begin(), sizes_.end(), 1LL,
                            std::multiplies<int64>());
   }
 
diff --git a/tensorflow/compiler/xla/client/lib/BUILD b/tensorflow/compiler/xla/client/lib/BUILD
index 8bf0773..a2f32ab 100644
--- a/tensorflow/compiler/xla/client/lib/BUILD
+++ b/tensorflow/compiler/xla/client/lib/BUILD
@@ -129,7 +129,7 @@
         ":arithmetic",
         ":constants",
         "//tensorflow/compiler/tf2xla/lib:util",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/core:lib",
     ],
 )
@@ -168,7 +168,7 @@
         ":numeric",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_builder",
     ],
 )
 
@@ -185,7 +185,7 @@
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
     ],
diff --git a/tensorflow/compiler/xla/client/lib/pooling.h b/tensorflow/compiler/xla/client/lib/pooling.h
index 66d8006..1699c58 100644
--- a/tensorflow/compiler/xla/client/lib/pooling.h
+++ b/tensorflow/compiler/xla/client/lib/pooling.h
@@ -16,7 +16,7 @@
 #ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_POOLING_H_
 #define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_POOLING_H_
 
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 
 namespace xla {
diff --git a/tensorflow/compiler/xla/client/lib/sorting.h b/tensorflow/compiler/xla/client/lib/sorting.h
index 404b478..b9dfafd 100644
--- a/tensorflow/compiler/xla/client/lib/sorting.h
+++ b/tensorflow/compiler/xla/client/lib/sorting.h
@@ -16,7 +16,7 @@
 #ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_SORTING_H_
 #define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_SORTING_H_
 
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
diff --git a/tensorflow/compiler/xla/client/lib/sorting_test.cc b/tensorflow/compiler/xla/client/lib/sorting_test.cc
index b6eee76..fef98c9 100644
--- a/tensorflow/compiler/xla/client/lib/sorting_test.cc
+++ b/tensorflow/compiler/xla/client/lib/sorting_test.cc
@@ -14,7 +14,7 @@
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/client/lib/sorting.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc
index 8a6c5fb..cffb24e 100644
--- a/tensorflow/compiler/xla/client/local_client.cc
+++ b/tensorflow/compiler/xla/client/local_client.cc
@@ -303,7 +303,7 @@
     const Shape& shape, int device_ordinal) {
   TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor,
                       backend().stream_executor(device_ordinal));
-  auto literal = MakeUnique<Literal>();
+  auto literal = Literal::CreateFromShape(shape);
   TF_RETURN_IF_ERROR(backend().transfer_manager()->TransferLiteralFromOutfeed(
       executor, shape, literal.get()));
   return std::move(literal);
diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc
index 1cb61f7..4dffab3 100644
--- a/tensorflow/compiler/xla/client/xla_builder.cc
+++ b/tensorflow/compiler/xla/client/xla_builder.cc
@@ -45,21 +45,6 @@
   return id;
 }
 
-// Returns true if an instruction with the given opcode can be the root of the
-// computation.
-bool CanBeRoot(HloOpcode opcode) {
-  switch (opcode) {
-    case HloOpcode::kAfterAll:
-    case HloOpcode::kSend:
-    case HloOpcode::kSendDone:
-    case HloOpcode::kOutfeed:
-    case HloOpcode::kTrace:
-      return false;
-    default:
-      return true;
-  }
-}
-
 }  // namespace
 
 XlaOp operator-(const XlaOp& x) { return Neg(x); }
@@ -142,28 +127,13 @@
   return ReportErrorOrReturn(op_creator());
 }
 
-StatusOr<ProgramShape> XlaBuilder::GetProgramShape(int64* root_id) const {
+StatusOr<ProgramShape> XlaBuilder::GetProgramShape(int64 root_id) const {
   TF_RETURN_IF_ERROR(first_error_);
-
-  TF_RET_CHECK(root_id != nullptr);
+  TF_RET_CHECK((root_id >= 0) && (root_id < instructions_.size()));
 
   ProgramShape program_shape;
 
-  // Not all instructions can be roots. Walk backwards from the last added
-  // instruction until a valid root is found.
-  int64 index = instructions_.size() - 1;
-  for (; index >= 0; index--) {
-    TF_ASSIGN_OR_RETURN(HloOpcode opcode,
-                        StringToHloOpcode(instructions_[index].opcode()));
-    if (CanBeRoot(opcode)) {
-      break;
-    }
-  }
-  if (index < 0) {
-    return FailedPrecondition("no root instruction was found");
-  }
-  *root_id = instructions_[index].id();
-  *program_shape.mutable_result() = instructions_[index].shape();
+  *program_shape.mutable_result() = instructions_[root_id].shape();
 
   // Check that the parameter numbers are continuous from 0, and add parameter
   // shapes and names to the program shape.
@@ -188,8 +158,15 @@
 }
 
 StatusOr<ProgramShape> XlaBuilder::GetProgramShape() const {
-  int64 root;
-  return GetProgramShape(&root);
+  TF_RET_CHECK(!instructions_.empty());
+  return GetProgramShape(instructions_.back().id());
+}
+
+StatusOr<ProgramShape> XlaBuilder::GetProgramShape(XlaOp root) const {
+  if (root.builder_ != this) {
+    return InvalidArgument("Given root operation is not in this computation.");
+  }
+  return GetProgramShape(root.handle());
 }
 
 void XlaBuilder::IsConstantVisitor(const int64 op_handle,
@@ -257,17 +234,29 @@
     first_error_backtrace_.Dump(tensorflow::DebugWriteToString, &backtrace);
     return AppendStatus(first_error_, backtrace);
   }
+  return Build(instructions_.back().id());
+}
+
+StatusOr<XlaComputation> XlaBuilder::Build(XlaOp root) {
+  if (root.builder_ != this) {
+    return InvalidArgument("Given root operation is not in this computation.");
+  }
+  return Build(root.handle());
+}
+
+StatusOr<XlaComputation> XlaBuilder::Build(int64 root_id) {
+  if (!first_error_.ok()) {
+    string backtrace;
+    first_error_backtrace_.Dump(tensorflow::DebugWriteToString, &backtrace);
+    return AppendStatus(first_error_, backtrace);
+  }
 
   HloComputationProto entry;
   entry.set_id(GetUniqueId());  // Give the computation a global unique id.
   entry.set_name(StrCat(name_, entry.id()));  // Ensure that the name is unique.
 
-  {
-    int64 root_id;
-    TF_ASSIGN_OR_RETURN(*entry.mutable_program_shape(),
-                        GetProgramShape(&root_id));
-    entry.set_root_id(root_id);
-  }
+  TF_ASSIGN_OR_RETURN(*entry.mutable_program_shape(), GetProgramShape(root_id));
+  entry.set_root_id(root_id);
 
   for (auto& instruction : instructions_) {
     // Ensures that the instruction names are unique among the whole graph.
@@ -893,24 +882,28 @@
 
 XlaOp XlaBuilder::Conv(const XlaOp& lhs, const XlaOp& rhs,
                        tensorflow::gtl::ArraySlice<int64> window_strides,
-                       Padding padding) {
+                       Padding padding, int64 feature_group_count) {
   return ConvWithGeneralDimensions(
       lhs, rhs, window_strides, padding,
-      CreateDefaultConvDimensionNumbers(window_strides.size()));
+      CreateDefaultConvDimensionNumbers(window_strides.size()),
+      feature_group_count);
 }
 
 XlaOp XlaBuilder::ConvWithGeneralPadding(
     const XlaOp& lhs, const XlaOp& rhs,
     tensorflow::gtl::ArraySlice<int64> window_strides,
-    tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding) {
+    tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+    int64 feature_group_count) {
   return ConvGeneral(lhs, rhs, window_strides, padding,
-                     CreateDefaultConvDimensionNumbers(window_strides.size()));
+                     CreateDefaultConvDimensionNumbers(window_strides.size()),
+                     feature_group_count);
 }
 
 XlaOp XlaBuilder::ConvWithGeneralDimensions(
     const XlaOp& lhs, const XlaOp& rhs,
     tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding,
-    const ConvolutionDimensionNumbers& dimension_numbers) {
+    const ConvolutionDimensionNumbers& dimension_numbers,
+    int64 feature_group_count) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs));
     TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, GetShape(rhs));
@@ -937,7 +930,7 @@
     return ConvGeneral(lhs, rhs, window_strides,
                        MakePadding(base_area_dimensions, window_dimensions,
                                    window_strides, padding),
-                       dimension_numbers);
+                       dimension_numbers, feature_group_count);
   });
 }
 
@@ -945,9 +938,10 @@
     const XlaOp& lhs, const XlaOp& rhs,
     tensorflow::gtl::ArraySlice<int64> window_strides,
     tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
-    const ConvolutionDimensionNumbers& dimension_numbers) {
+    const ConvolutionDimensionNumbers& dimension_numbers,
+    int64 feature_group_count) {
   return ConvGeneralDilated(lhs, rhs, window_strides, padding, {}, {},
-                            dimension_numbers);
+                            dimension_numbers, feature_group_count);
 }
 
 XlaOp XlaBuilder::ConvGeneralDilated(
@@ -956,7 +950,8 @@
     tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
     tensorflow::gtl::ArraySlice<int64> lhs_dilation,
     tensorflow::gtl::ArraySlice<int64> rhs_dilation,
-    const ConvolutionDimensionNumbers& dimension_numbers) {
+    const ConvolutionDimensionNumbers& dimension_numbers,
+    int64 feature_group_count) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
     TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs));
@@ -975,12 +970,13 @@
                         MakeWindow(window_dimensions, window_strides, padding,
                                    lhs_dilation, rhs_dilation));
 
-    TF_ASSIGN_OR_RETURN(
-        *instr.mutable_shape(),
-        ShapeInference::InferConvolveShape(lhs_shape, rhs_shape, instr.window(),
-                                           dimension_numbers));
+    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+                        ShapeInference::InferConvolveShape(
+                            lhs_shape, rhs_shape, instr.window(),
+                            dimension_numbers, feature_group_count));
 
     *instr.mutable_convolution_dimension_numbers() = dimension_numbers;
+    instr.set_feature_group_count(feature_group_count);
 
     return AddInstruction(std::move(instr), HloOpcode::kConvolution,
                           {lhs, rhs});
@@ -1084,6 +1080,23 @@
           "Replicated sharding is not yet supported for infeeds");
     }
 
+    // Infeed takes a single token operand. Generate the token to pass to the
+    // infeed.
+    XlaOp token;
+    auto make_token = [&]() {
+      HloInstructionProto token_instr;
+      *token_instr.mutable_shape() = ShapeUtil::MakeTokenShape();
+      return AddInstruction(std::move(token_instr), HloOpcode::kAfterAll, {});
+    };
+    if (sharding()) {
+      // Arbitrarily assign token to device 0.
+      OpSharding sharding = sharding_builder::AssignDevice(0);
+      XlaScopedShardingAssignment scoped_sharding(this, sharding);
+      TF_ASSIGN_OR_RETURN(token, make_token());
+    } else {
+      TF_ASSIGN_OR_RETURN(token, make_token());
+    }
+
     // The sharding is set by the client according to the data tuple shape.
     // However, the shape of the infeed instruction is a tuple containing the
     // data and a token. For tuple sharding type, the sharding must be changed
@@ -1099,11 +1112,11 @@
           sharding_builder::AssignDevice(0);
       XlaScopedShardingAssignment scoped_sharding(this,
                                                   infeed_instruction_sharding);
-      TF_ASSIGN_OR_RETURN(infeed,
-                          AddInstruction(std::move(instr), HloOpcode::kInfeed));
+      TF_ASSIGN_OR_RETURN(infeed, AddInstruction(std::move(instr),
+                                                 HloOpcode::kInfeed, {token}));
     } else {
-      TF_ASSIGN_OR_RETURN(infeed,
-                          AddInstruction(std::move(instr), HloOpcode::kInfeed));
+      TF_ASSIGN_OR_RETURN(infeed, AddInstruction(std::move(instr),
+                                                 HloOpcode::kInfeed, {token}));
     }
 
     // The infeed instruction produces a tuple of the infed data and a token
@@ -1169,8 +1182,15 @@
 
     instr.set_outfeed_config(outfeed_config);
 
+    // Outfeed takes a token as its second operand. Generate the token to pass
+    // to the outfeed.
+    HloInstructionProto token_instr;
+    *token_instr.mutable_shape() = ShapeUtil::MakeTokenShape();
+    TF_ASSIGN_OR_RETURN(XlaOp token, AddInstruction(std::move(token_instr),
+                                                    HloOpcode::kAfterAll, {}));
+
     TF_RETURN_IF_ERROR(
-        AddInstruction(std::move(instr), HloOpcode::kOutfeed, {operand})
+        AddInstruction(std::move(instr), HloOpcode::kOutfeed, {operand, token})
             .status());
 
     // The outfeed instruction produces a token. However, existing users expect
@@ -1611,27 +1631,27 @@
   });
 }
 
-XlaOp XlaBuilder::Gather(const XlaOp& input, const XlaOp& gather_indices,
+XlaOp XlaBuilder::Gather(const XlaOp& input, const XlaOp& start_indices,
                          const GatherDimensionNumbers& dimension_numbers,
-                         tensorflow::gtl::ArraySlice<int64> window_bounds) {
+                         tensorflow::gtl::ArraySlice<int64> slice_sizes) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
 
     TF_ASSIGN_OR_RETURN(const Shape& input_shape, GetShape(input));
-    TF_ASSIGN_OR_RETURN(const Shape& gather_indices_shape,
-                        GetShape(gather_indices));
+    TF_ASSIGN_OR_RETURN(const Shape& start_indices_shape,
+                        GetShape(start_indices));
     TF_ASSIGN_OR_RETURN(
         *instr.mutable_shape(),
-        ShapeInference::InferGatherShape(input_shape, gather_indices_shape,
-                                         dimension_numbers, window_bounds));
+        ShapeInference::InferGatherShape(input_shape, start_indices_shape,
+                                         dimension_numbers, slice_sizes));
 
     *instr.mutable_gather_dimension_numbers() = dimension_numbers;
-    for (int64 bound : window_bounds) {
-      instr.add_gather_window_bounds(bound);
+    for (int64 bound : slice_sizes) {
+      instr.add_gather_slice_sizes(bound);
     }
 
     return AddInstruction(std::move(instr), HloOpcode::kGather,
-                          {input, gather_indices});
+                          {input, start_indices});
   });
 }
 
@@ -1892,6 +1912,61 @@
   });
 }
 
+XlaOp XlaBuilder::AllToAll(const XlaOp& operand, int64 split_dimension,
+                           int64 concat_dimension, int64 split_count,
+                           const std::vector<ReplicaGroup>& replica_groups) {
+  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+
+    // The HloInstruction for Alltoall currently only handles the data
+    // communication: it accepts N already split parts and scatters them to N
+    // cores, and each core gathers the N received parts into a tuple as the
+    // output. So here we explicitly split the operand before the hlo alltoall,
+    // and concat the tuple elements.
+    //
+    // First, run shape inference to make sure the shapes are valid.
+    TF_RETURN_IF_ERROR(
+        ShapeInference::InferAllToAllShape(operand_shape, split_dimension,
+                                           concat_dimension, split_count)
+            .status());
+
+    // Split into N parts.
+    std::vector<XlaOp> slices;
+    slices.reserve(split_count);
+    const int64 block_size =
+        operand_shape.dimensions(split_dimension) / split_count;
+    for (int i = 0; i < split_count; i++) {
+      slices.push_back(SliceInDim(operand, /*start_index=*/i * block_size,
+                                  /*limit_index=*/(i + 1) * block_size,
+                                  /*stride=*/1, /*dimno=*/split_dimension));
+    }
+
+    // Handle data communication.
+    HloInstructionProto instr;
+    TF_ASSIGN_OR_RETURN(auto slice_shapes, this->GetOperandShapes(slices));
+    std::vector<const Shape*> slice_shape_ptrs;
+    c_transform(slice_shapes, std::back_inserter(slice_shape_ptrs),
+                [](const Shape& shape) { return &shape; });
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferAllToAllTupleShape(slice_shape_ptrs));
+    for (const ReplicaGroup& group : replica_groups) {
+      *instr.add_replica_groups() = group;
+    }
+    TF_ASSIGN_OR_RETURN(
+        XlaOp alltoall,
+        AddInstruction(std::move(instr), HloOpcode::kAllToAll, slices));
+
+    // Concat the N received parts.
+    std::vector<XlaOp> received;
+    received.reserve(split_count);
+    for (int i = 0; i < split_count; i++) {
+      received.push_back(this->GetTupleElement(alltoall, i));
+    }
+    return this->ConcatInDim(received, concat_dimension);
+  });
+}
+
 XlaOp XlaBuilder::SelectAndScatter(
     const XlaOp& operand, const XlaComputation& select,
     tensorflow::gtl::ArraySlice<int64> window_dimensions,
@@ -2163,11 +2238,6 @@
 
   TF_ASSIGN_OR_RETURN(const HloInstructionProto* root,
                       LookUpInstruction(root_op));
-  TF_ASSIGN_OR_RETURN(HloOpcode opcode, StringToHloOpcode(root->opcode()));
-  if (!CanBeRoot(opcode)) {
-    return InvalidArgument("the operand with opcode %s cannot be root",
-                           root->opcode().c_str());
-  }
 
   HloComputationProto entry;
   entry.set_id(GetUniqueId());  // Give the computation a global unique id.
@@ -2499,32 +2569,38 @@
 }
 
 XlaOp Conv(const XlaOp& lhs, const XlaOp& rhs,
-           tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding) {
-  return lhs.builder()->Conv(lhs, rhs, window_strides, padding);
+           tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding,
+           int64 feature_group_count) {
+  return lhs.builder()->Conv(lhs, rhs, window_strides, padding,
+                             feature_group_count);
 }
 
 XlaOp ConvWithGeneralPadding(
     const XlaOp& lhs, const XlaOp& rhs,
     tensorflow::gtl::ArraySlice<int64> window_strides,
-    tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding) {
+    tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+    int64 feature_group_count) {
   return lhs.builder()->ConvWithGeneralPadding(lhs, rhs, window_strides,
-                                               padding);
+                                               padding, feature_group_count);
 }
 
 XlaOp ConvWithGeneralDimensions(
     const XlaOp& lhs, const XlaOp& rhs,
     tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding,
-    const ConvolutionDimensionNumbers& dimension_numbers) {
+    const ConvolutionDimensionNumbers& dimension_numbers,
+    int64 feature_group_count) {
   return lhs.builder()->ConvWithGeneralDimensions(lhs, rhs, window_strides,
-                                                  padding, dimension_numbers);
+                                                  padding, dimension_numbers,
+                                                  feature_group_count);
 }
 
 XlaOp ConvGeneral(const XlaOp& lhs, const XlaOp& rhs,
                   tensorflow::gtl::ArraySlice<int64> window_strides,
                   tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
-                  const ConvolutionDimensionNumbers& dimension_numbers) {
+                  const ConvolutionDimensionNumbers& dimension_numbers,
+                  int64 feature_group_count) {
   return lhs.builder()->ConvGeneral(lhs, rhs, window_strides, padding,
-                                    dimension_numbers);
+                                    dimension_numbers, feature_group_count);
 }
 
 XlaOp ConvGeneralDilated(
@@ -2533,10 +2609,11 @@
     tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
     tensorflow::gtl::ArraySlice<int64> lhs_dilation,
     tensorflow::gtl::ArraySlice<int64> rhs_dilation,
-    const ConvolutionDimensionNumbers& dimension_numbers) {
-  return lhs.builder()->ConvGeneralDilated(lhs, rhs, window_strides, padding,
-                                           lhs_dilation, rhs_dilation,
-                                           dimension_numbers);
+    const ConvolutionDimensionNumbers& dimension_numbers,
+    int64 feature_group_count) {
+  return lhs.builder()->ConvGeneralDilated(
+      lhs, rhs, window_strides, padding, lhs_dilation, rhs_dilation,
+      dimension_numbers, feature_group_count);
 }
 
 XlaOp Fft(const XlaOp& operand, FftType fft_type,
@@ -2693,6 +2770,13 @@
                                             replica_group_ids, channel_id);
 }
 
+XlaOp AllToAll(const XlaOp& operand, int64 split_dimension,
+               int64 concat_dimension, int64 split_count,
+               const std::vector<ReplicaGroup>& replica_groups) {
+  return operand.builder()->AllToAll(operand, split_dimension, concat_dimension,
+                                     split_count, replica_groups);
+}
+
 XlaOp SelectAndScatter(const XlaOp& operand, const XlaComputation& select,
                        tensorflow::gtl::ArraySlice<int64> window_dimensions,
                        tensorflow::gtl::ArraySlice<int64> window_strides,
@@ -2822,11 +2906,11 @@
                                             mantissa_bits);
 }
 
-XlaOp Gather(const XlaOp& input, const XlaOp& gather_indices,
+XlaOp Gather(const XlaOp& input, const XlaOp& start_indices,
              const GatherDimensionNumbers& dimension_numbers,
-             tensorflow::gtl::ArraySlice<int64> window_bounds) {
-  return input.builder()->Gather(input, gather_indices, dimension_numbers,
-                                 window_bounds);
+             tensorflow::gtl::ArraySlice<int64> slice_sizes) {
+  return input.builder()->Gather(input, start_indices, dimension_numbers,
+                                 slice_sizes);
 }
 
 XlaOp Scatter(const XlaOp& input, const XlaOp& scatter_indices,
diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h
index 8726cc6..469d504 100644
--- a/tensorflow/compiler/xla/client/xla_builder.h
+++ b/tensorflow/compiler/xla/client/xla_builder.h
@@ -195,9 +195,14 @@
 
   // Builds the computation with the requested operations, or returns a non-ok
   // status. Note that all ops that have been enqueued will be moved to the
-  // computation being returned.
+  // computation being returned. The root of the computation will be the last
+  // added operation.
   StatusOr<XlaComputation> Build();
 
+  // Overload of Build which specifies a particular root instruction for the
+  // computation.
+  StatusOr<XlaComputation> Build(XlaOp root);
+
   // Builds the computation with the requested operations, or notes an error in
   // the parent XlaBuilder and returns an empty computation if building failed.
   // This function is intended to be used where the returned XlaComputation is
@@ -225,9 +230,14 @@
   // Returns the shape of the given op.
   StatusOr<Shape> GetShape(const XlaOp& op) const;
 
-  // Returns the (inferred) result for the current computation's shape.
+  // Returns the (inferred) result for the current computation's shape. This
+  // assumes the root instruction is the last added instruction.
   StatusOr<ProgramShape> GetProgramShape() const;
 
+  // Returns the (inferred) result for the current computation's shape using the
+  // given operation as the root.
+  StatusOr<ProgramShape> GetProgramShape(XlaOp root) const;
+
   // Reports an error to the builder, by
   // * storing it internally and capturing a backtrace if it's the first error
   //   (this deferred value will be produced on the call to
@@ -255,6 +265,9 @@
   StatusOr<bool> IsConstant(const XlaOp& operand) const;
 
  private:
+  // Build helper which takes the id of the root operation.
+  StatusOr<XlaComputation> Build(int64 root_id);
+
   // Enqueues a "retrieve parameter value" instruction for a parameter that was
   // passed to the computation.
   XlaOp Parameter(int64 parameter_number, const Shape& shape,
@@ -499,22 +512,24 @@
   // Enqueues a convolution instruction onto the computation, which uses the
   // default convolution dimension numbers.
   XlaOp Conv(const XlaOp& lhs, const XlaOp& rhs,
-             tensorflow::gtl::ArraySlice<int64> window_strides,
-             Padding padding);
+             tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding,
+             int64 feature_group_count = 1);
 
   // Enqueues a convolution instruction onto the computation, with the caller
   // provided padding configuration in the format returned by MakePadding().
   XlaOp ConvWithGeneralPadding(
       const XlaOp& lhs, const XlaOp& rhs,
       tensorflow::gtl::ArraySlice<int64> window_strides,
-      tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding);
+      tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+      int64 feature_group_count = 1);
 
   // Enqueues a convolution instruction onto the computation, with the caller
   // provided dimension numbers configuration.
   XlaOp ConvWithGeneralDimensions(
       const XlaOp& lhs, const XlaOp& rhs,
       tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding,
-      const ConvolutionDimensionNumbers& dimension_numbers);
+      const ConvolutionDimensionNumbers& dimension_numbers,
+      int64 feature_group_count = 1);
 
   // Enqueues a convolution instruction onto the computation, with the caller
   // provided padding configuration as well as the dimension numbers.
@@ -522,7 +537,8 @@
       const XlaOp& lhs, const XlaOp& rhs,
       tensorflow::gtl::ArraySlice<int64> window_strides,
       tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
-      const ConvolutionDimensionNumbers& dimension_numbers);
+      const ConvolutionDimensionNumbers& dimension_numbers,
+      int64 feature_group_count = 1);
 
   // Enqueues a convolution instruction onto the computation, with the caller
   // provided padding configuration, dilation factors and dimension numbers.
@@ -532,7 +548,8 @@
       tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
       tensorflow::gtl::ArraySlice<int64> lhs_dilation,
       tensorflow::gtl::ArraySlice<int64> rhs_dilation,
-      const ConvolutionDimensionNumbers& dimension_numbers);
+      const ConvolutionDimensionNumbers& dimension_numbers,
+      int64 feature_group_count = 1);
 
   // Enqueues an FFT instruction onto the computation, of the given type and
   // with the given FFT length.
@@ -686,9 +703,9 @@
   // For example, we have 4 replicas, then replica_group_ids={0,1,0,1} means,
   // replica 0 and 2 are in subgroup 0, replica 1 and 3 are in subgroup 1.
   //
-  // - `channel_id`: for Allreduce nodes from different models, if they have the
-  // same channel_id, they will be 'Allreduce'd. If empty, Allreduce will not be
-  // applied cross models.
+  // - `channel_id`: for Allreduce nodes from different modules, if they have
+  // the same channel_id, they will be 'Allreduce'd. If empty, Allreduce will
+  // not be applied cross modules.
   //
   // TODO(b/79737069): Rename this to AllReduce when it's ready to use.
   XlaOp CrossReplicaSum(
@@ -697,6 +714,13 @@
       const tensorflow::gtl::optional<ChannelHandle>& channel_id =
           tensorflow::gtl::nullopt);
 
+  // Enqueues an operation that does an AllToAll of the operand across cores.
+  //
+  // TODO(b/110096724): This is NOT YET ready to use.
+  XlaOp AllToAll(const XlaOp& operand, int64 split_dimension,
+                 int64 concat_dimension, int64 split_count,
+                 const std::vector<ReplicaGroup>& replica_groups);
+
   // Enqueues an operation that scatters the `source` array to the selected
   // indices of each window.
   XlaOp SelectAndScatter(const XlaOp& operand, const XlaComputation& select,
@@ -853,9 +877,9 @@
                         const int mantissa_bits);
 
   // Enqueues a Gather node onto the computation.
-  XlaOp Gather(const XlaOp& input, const XlaOp& gather_indices,
+  XlaOp Gather(const XlaOp& input, const XlaOp& start_indices,
                const GatherDimensionNumbers& dimension_numbers,
-               tensorflow::gtl::ArraySlice<int64> window_bounds);
+               tensorflow::gtl::ArraySlice<int64> slice_sizes);
 
   // Enqueues a Scatter node onto the computation.
   XlaOp Scatter(const XlaOp& input, const XlaOp& scatter_indices,
@@ -969,9 +993,8 @@
   // shape.
   StatusOr<XlaOp> Reshape(const Shape& shape, const XlaOp& operand);
 
-  // Returns the (inferred) result for the program shape for the current
-  // computation and fills the root_id in the pointer.
-  StatusOr<ProgramShape> GetProgramShape(int64* root_id) const;
+  // Returns the (inferred) result for the program shape using the given root.
+  StatusOr<ProgramShape> GetProgramShape(int64 root_id) const;
 
   // Returns shapes for the operands.
   StatusOr<std::vector<Shape>> GetOperandShapes(
@@ -1142,27 +1165,31 @@
                           const DotDimensionNumbers& dimension_numbers);
   friend XlaOp Conv(const XlaOp& lhs, const XlaOp& rhs,
                     tensorflow::gtl::ArraySlice<int64> window_strides,
-                    Padding padding);
+                    Padding padding, int64 feature_group_count);
   friend XlaOp ConvWithGeneralPadding(
       const XlaOp& lhs, const XlaOp& rhs,
       tensorflow::gtl::ArraySlice<int64> window_strides,
-      tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding);
+      tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+      int64 feature_group_count);
   friend XlaOp ConvWithGeneralDimensions(
       const XlaOp& lhs, const XlaOp& rhs,
       tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding,
-      const ConvolutionDimensionNumbers& dimension_numbers);
+      const ConvolutionDimensionNumbers& dimension_numbers,
+      int64 feature_group_count);
   friend XlaOp ConvGeneral(
       const XlaOp& lhs, const XlaOp& rhs,
       tensorflow::gtl::ArraySlice<int64> window_strides,
       tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
-      const ConvolutionDimensionNumbers& dimension_numbers);
+      const ConvolutionDimensionNumbers& dimension_numbers,
+      int64 feature_group_count);
   friend XlaOp ConvGeneralDilated(
       const XlaOp& lhs, const XlaOp& rhs,
       tensorflow::gtl::ArraySlice<int64> window_strides,
       tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
       tensorflow::gtl::ArraySlice<int64> lhs_dilation,
       tensorflow::gtl::ArraySlice<int64> rhs_dilation,
-      const ConvolutionDimensionNumbers& dimension_numbers);
+      const ConvolutionDimensionNumbers& dimension_numbers,
+      int64 feature_group_count);
   friend XlaOp Fft(const XlaOp& operand, FftType fft_type,
                    tensorflow::gtl::ArraySlice<int64> fft_length);
   friend XlaOp Infeed(XlaBuilder* builder, const Shape& shape,
@@ -1234,6 +1261,9 @@
       const XlaOp& operand, const XlaComputation& computation,
       tensorflow::gtl::ArraySlice<int64> replica_group_ids,
       const tensorflow::gtl::optional<ChannelHandle>& channel_id);
+  friend XlaOp AllToAll(const XlaOp& operand, int64 split_dimension,
+                        int64 concat_dimension, int64 split_count,
+                        const std::vector<ReplicaGroup>& replica_groups);
   friend XlaOp SelectAndScatter(
       const XlaOp& operand, const XlaComputation& select,
       tensorflow::gtl::ArraySlice<int64> window_dimensions,
@@ -1298,9 +1328,9 @@
                            const XlaComputation& false_computation);
   friend XlaOp ReducePrecision(const XlaOp& operand, const int exponent_bits,
                                const int mantissa_bits);
-  friend XlaOp Gather(const XlaOp& input, const XlaOp& gather_indices,
+  friend XlaOp Gather(const XlaOp& input, const XlaOp& start_indices,
                       const GatherDimensionNumbers& dimension_numbers,
-                      tensorflow::gtl::ArraySlice<int64> window_bounds);
+                      tensorflow::gtl::ArraySlice<int64> slice_sizes);
   friend XlaOp Scatter(const XlaOp& input, const XlaOp& scatter_indices,
                        const XlaOp& updates,
                        const XlaComputation& update_computation,
@@ -1624,28 +1654,32 @@
 // Enqueues a convolution instruction onto the computation, which uses the
 // default convolution dimension numbers.
 XlaOp Conv(const XlaOp& lhs, const XlaOp& rhs,
-           tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding);
+           tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding,
+           int64 feature_group_count = 1);
 
 // Enqueues a convolution instruction onto the computation, with the caller
 // provided padding configuration in the format returned by MakePadding().
 XlaOp ConvWithGeneralPadding(
     const XlaOp& lhs, const XlaOp& rhs,
     tensorflow::gtl::ArraySlice<int64> window_strides,
-    tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding);
+    tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+    int64 feature_group_count = 1);
 
 // Enqueues a convolution instruction onto the computation, with the caller
 // provided dimension numbers configuration.
 XlaOp ConvWithGeneralDimensions(
     const XlaOp& lhs, const XlaOp& rhs,
     tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding,
-    const ConvolutionDimensionNumbers& dimension_numbers);
+    const ConvolutionDimensionNumbers& dimension_numbers,
+    int64 feature_group_count = 1);
 
 // Enqueues a convolution instruction onto the computation, with the caller
 // provided padding configuration as well as the dimension numbers.
 XlaOp ConvGeneral(const XlaOp& lhs, const XlaOp& rhs,
                   tensorflow::gtl::ArraySlice<int64> window_strides,
                   tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
-                  const ConvolutionDimensionNumbers& dimension_numbers);
+                  const ConvolutionDimensionNumbers& dimension_numbers,
+                  int64 feature_group_count = 1);
 
 // Enqueues a convolution instruction onto the computation, with the caller
 // provided padding configuration, dilation factors and dimension numbers.
@@ -1655,7 +1689,8 @@
     tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
     tensorflow::gtl::ArraySlice<int64> lhs_dilation,
     tensorflow::gtl::ArraySlice<int64> rhs_dilation,
-    const ConvolutionDimensionNumbers& dimension_numbers);
+    const ConvolutionDimensionNumbers& dimension_numbers,
+    int64 feature_group_count = 1);
 
 // Enqueues an FFT instruction onto the computation, of the given type and
 // with the given FFT length.
@@ -1820,9 +1855,9 @@
 // For example, we have 4 replicas, then replica_group_ids={0,1,0,1} means,
 // replica 0 and 2 are in subgroup 0, replica 1 and 3 are in subgroup 1.
 //
-// - `channel_id`: for Allreduce nodes from different models, if they have the
+// - `channel_id`: for Allreduce nodes from different modules, if they have the
 // same channel_id, they will be 'Allreduce'd. If empty, Allreduce will not be
-// applied cross models.
+// applied cross modules.
 //
 // TODO(b/79737069): Rename this to AllReduce when it's ready to use.
 XlaOp CrossReplicaSum(const XlaOp& operand, const XlaComputation& computation,
@@ -1830,6 +1865,13 @@
                       const tensorflow::gtl::optional<ChannelHandle>&
                           channel_id = tensorflow::gtl::nullopt);
 
+// Enqueues an operation that does an AllToAll of the operand across cores.
+//
+// TODO(b/110096724): This is NOT YET ready to use.
+XlaOp AllToAll(const XlaOp& operand, int64 split_dimension,
+               int64 concat_dimension, int64 split_count,
+               const std::vector<ReplicaGroup>& replica_groups = {});
+
 // Enqueues an operation that scatters the `source` array to the selected
 // indices of each window.
 XlaOp SelectAndScatter(const XlaOp& operand, const XlaComputation& select,
@@ -1982,9 +2024,9 @@
                       const int mantissa_bits);
 
 // Enqueues a Gather node onto the computation.
-XlaOp Gather(const XlaOp& input, const XlaOp& gather_indices,
+XlaOp Gather(const XlaOp& input, const XlaOp& start_indices,
              const GatherDimensionNumbers& dimension_numbers,
-             tensorflow::gtl::ArraySlice<int64> window_bounds);
+             tensorflow::gtl::ArraySlice<int64> slice_sizes);
 
 // Enqueues a Scatter node onto the computation.
 XlaOp Scatter(const XlaOp& input, const XlaOp& scatter_indices,
diff --git a/tensorflow/compiler/xla/client/xla_builder_test.cc b/tensorflow/compiler/xla/client/xla_builder_test.cc
index 28a207b..49a15ec 100644
--- a/tensorflow/compiler/xla/client/xla_builder_test.cc
+++ b/tensorflow/compiler/xla/client/xla_builder_test.cc
@@ -24,6 +24,7 @@
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
 namespace xla {
@@ -46,6 +47,17 @@
     return HloModule::CreateFromProto(proto, config);
   }
 
+  // Overload which explicitly specifies the root instruction.
+  StatusOr<std::unique_ptr<HloModule>> BuildHloModule(XlaBuilder* b,
+                                                      XlaOp root) {
+    TF_ASSIGN_OR_RETURN(XlaComputation computation, b->Build(root));
+    const HloModuleProto& proto = computation.proto();
+    TF_ASSIGN_OR_RETURN(const auto& config,
+                        HloModule::CreateModuleConfigFromProto(
+                            proto, legacy_flags::GetDebugOptionsFromFlags()));
+    return HloModule::CreateFromProto(proto, config);
+  }
+
   // Returns the name of the test currently being run.
   string TestName() const {
     return ::testing::UnitTest::GetInstance()->current_test_info()->name();
@@ -293,6 +305,21 @@
   EXPECT_THAT(root, op::Transpose(op::Parameter()));
 }
 
+TEST_F(XlaBuilderTest, AllToAll) {
+  XlaBuilder b(TestName());
+  auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {4, 16}), "x");
+  AllToAll(x, /*split_dimension=*/1, /*concat_dimension=*/0,
+           /*split_count=*/2);
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  auto root = module->entry_computation()->root_instruction();
+
+  // AllToAll is decomposed into slices -> all-to-all -> gte -> concat.
+  EXPECT_EQ(root->opcode(), HloOpcode::kConcatenate);
+  EXPECT_EQ(root->operand(0)->operand(0)->opcode(), HloOpcode::kAllToAll);
+  EXPECT_TRUE(
+      ShapeUtil::Equal(root->shape(), ShapeUtil::MakeShape(F32, {8, 8})));
+}
+
 TEST_F(XlaBuilderTest, ReportError) {
   XlaBuilder b(TestName());
   auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {5, 7}), "x");
@@ -320,5 +347,45 @@
   EXPECT_THAT(statusor.status().error_message(), HasSubstr("a test error"));
 }
 
+TEST_F(XlaBuilderTest, BuildWithSpecificRoot) {
+  XlaBuilder b(TestName());
+  XlaOp constant = ConstantR0<float>(&b, 1.0);
+  Add(constant, ConstantR0<float>(&b, 2.0));
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b, /*root=*/constant));
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, op::Constant());
+}
+
+TEST_F(XlaBuilderTest, BuildWithSpecificRootAndMultipleParameters) {
+  // Specifying a particular root in Build should still include all entry
+  // parameters.
+  XlaBuilder b(TestName());
+  const Shape shape = ShapeUtil::MakeShape(F32, {42, 123});
+  XlaOp x = Parameter(&b, 0, shape, "x");
+  XlaOp y = Parameter(&b, 1, shape, "y");
+  XlaOp z = Parameter(&b, 2, shape, "z");
+  Add(x, Sub(y, z));
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b, /*root=*/x));
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, op::Parameter());
+  EXPECT_EQ(module->entry_computation()->num_parameters(), 3);
+  EXPECT_EQ(module->entry_computation()->instruction_count(), 5);
+}
+
+TEST_F(XlaBuilderTest, BuildWithSpecificRootWithWrongBuilder) {
+  XlaBuilder b(TestName());
+  XlaBuilder other_b(TestName());
+  const Shape shape = ShapeUtil::MakeShape(F32, {42, 123});
+
+  Parameter(&b, 0, shape, "param");
+  XlaOp other_param = Parameter(&other_b, 0, shape, "other_param");
+
+  Status status = b.Build(other_param).status();
+  ASSERT_IS_NOT_OK(status);
+  EXPECT_THAT(
+      status.error_message(),
+      ::testing::HasSubstr("root operation is not in this computation"));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/xla_client/BUILD b/tensorflow/compiler/xla/client/xla_client/BUILD
deleted file mode 100644
index 2e131db..0000000
--- a/tensorflow/compiler/xla/client/xla_client/BUILD
+++ /dev/null
@@ -1,33 +0,0 @@
-# Description:
-#   The new XLA client libraries.
-
-licenses(["notice"])  # Apache 2.0
-
-package(default_visibility = [":friends"])
-
-package_group(
-    name = "friends",
-    includes = [
-        "//tensorflow/compiler/xla:friends",
-    ],
-)
-
-# Filegroup used to collect source files for dependency checking.
-filegroup(
-    name = "c_srcs",
-    data = glob([
-        "**/*.cc",
-        "**/*.h",
-    ]),
-)
-
-load("//tensorflow:tensorflow.bzl", "tf_cc_test")
-
-cc_library(
-    name = "xla_builder",
-    hdrs = ["xla_builder.h"],
-    visibility = ["//visibility:public"],
-    deps = [
-        "//tensorflow/compiler/xla/client:xla_builder",
-    ],
-)
diff --git a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
index f42fb92..5d27e4a 100644
--- a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
+++ b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
@@ -31,7 +31,6 @@
 std::once_flag flags_init;
 
 void SetDebugOptionsDefaults(DebugOptions* flags) {
-  flags->set_xla_enable_fast_math(true);
   flags->set_xla_llvm_enable_alias_scope_metadata(true);
   flags->set_xla_llvm_enable_noalias_metadata(true);
   flags->set_xla_llvm_enable_invariant_load_metadata(true);
@@ -53,6 +52,11 @@
   // the heuristics needed to decide when to run on multiple streams.  See
   // b/77879207.
   flags->set_xla_gpu_disable_multi_streaming(true);
+
+  // TODO(jlebar): Disable fastmath once doing so is not a performance
+  // regression.
+  flags->set_xla_cpu_enable_fast_math(true);
+  flags->set_xla_gpu_enable_fast_math(true);
 }
 
 // Allocates flag_values and flag_objects; this function must not be called more
@@ -150,10 +154,16 @@
           flag_values->mutable_xla_generate_hlo_text_to(),
           "Dump all HLO modules as text into the provided directory path."),
       tensorflow::Flag(
-          "xla_enable_fast_math",
-          bool_setter_for(&DebugOptions::set_xla_enable_fast_math),
-          flag_values->xla_enable_fast_math(),
-          "Enable unsafe fast-math optimizations in the compiler; "
+          "xla_cpu_enable_fast_math",
+          bool_setter_for(&DebugOptions::set_xla_cpu_enable_fast_math),
+          flag_values->xla_cpu_enable_fast_math(),
+          "Enable unsafe fast-math optimizations in the CPU compiler; "
+          "this may produce faster code at the expense of some accuracy."),
+      tensorflow::Flag(
+          "xla_gpu_enable_fast_math",
+          bool_setter_for(&DebugOptions::set_xla_gpu_enable_fast_math),
+          flag_values->xla_gpu_enable_fast_math(),
+          "Enable unsafe fast-math optimizations in the GPU compiler; "
           "this may produce faster code at the expense of some accuracy."),
       tensorflow::Flag(
           "xla_llvm_enable_alias_scope_metadata",
@@ -306,6 +316,13 @@
                        bool_setter_for(&DebugOptions::set_xla_cpu_use_mkl_dnn),
                        flag_values->xla_cpu_use_mkl_dnn(),
                        "Generate calls to MKL-DNN in the CPU backend."),
+      tensorflow::Flag(
+          "xla_gpu_crash_on_verification_failures",
+          bool_setter_for(
+              &DebugOptions::set_xla_gpu_crash_on_verification_failures),
+          flag_values->xla_gpu_crash_on_verification_failures(),
+          "Crashes the program on extra verification failures, e.g. cuDNN "
+          "cross checking failures"),
   });
   ParseFlagsFromEnv(*flag_objects);
 }
diff --git a/tensorflow/compiler/xla/literal.cc b/tensorflow/compiler/xla/literal.cc
index 0545deb..36e4725 100644
--- a/tensorflow/compiler/xla/literal.cc
+++ b/tensorflow/compiler/xla/literal.cc
@@ -71,7 +71,7 @@
   return out;
 }
 
-Literal::StrideConfig::StrideConfig(
+MutableLiteralBase::StrideConfig::StrideConfig(
     const Shape& source_shape, const Shape& dest_shape,
     tensorflow::gtl::ArraySlice<int64> dimensions)
     : dimensions(dimensions),
@@ -133,7 +133,8 @@
 }
 
 Literal::Literal(const Shape& shape, bool allocate_arrays)
-    : LiteralBase(), shape_(MakeUnique<Shape>(shape)) {
+    : MutableLiteralBase() {
+  shape_ = MakeUnique<Shape>(shape);
   CHECK(LayoutUtil::HasLayout(*shape_));
   root_piece_ = new Piece();
   root_piece_->set_subshape(shape_.get());
@@ -159,7 +160,9 @@
       });
 }
 
-Literal::Literal(Literal&& other) : LiteralBase() { *this = std::move(other); }
+Literal::Literal(Literal&& other) : MutableLiteralBase() {
+  *this = std::move(other);
+}
 
 Literal& Literal::operator=(Literal&& other) {
   DCHECK(&other.root_piece_->subshape() == other.shape_.get());
@@ -187,12 +190,13 @@
   return piece(shape_index).sparse_indices();
 }
 
-SparseIndexArray* Literal::sparse_indices(const ShapeIndex& shape_index) {
+SparseIndexArray* MutableLiteralBase::sparse_indices(
+    const ShapeIndex& shape_index) {
   return piece(shape_index).sparse_indices();
 }
 
 template <typename NativeT>
-Status Literal::CopySliceFromInternal(
+Status MutableLiteralBase::CopySliceFromInternal(
     const LiteralBase& src_literal, tensorflow::gtl::ArraySlice<int64> src_base,
     tensorflow::gtl::ArraySlice<int64> dest_base,
     tensorflow::gtl::ArraySlice<int64> copy_size) {
@@ -225,8 +229,8 @@
     // proper stride size at the matching dimension.
     DimensionVector src_indexes(src_base.size(), 0);
     DimensionVector dest_indexes(dest_base.size(), 0);
-    Literal::StrideConfig stride_config(src_literal.shape(), shape(),
-                                        copy_size);
+    MutableLiteralBase::StrideConfig stride_config(src_literal.shape(), shape(),
+                                                   copy_size);
 
     auto copy_proc = [&](tensorflow::gtl::ArraySlice<int64> indexes) {
       // Map from multi-dimensional index, to source index.
@@ -253,9 +257,10 @@
   return Status::OK();
 }
 
-Status Literal::CopyElementFrom(const LiteralSlice& src_literal,
-                                tensorflow::gtl::ArraySlice<int64> src_index,
-                                tensorflow::gtl::ArraySlice<int64> dest_index) {
+Status MutableLiteralBase::CopyElementFrom(
+    const LiteralSlice& src_literal,
+    tensorflow::gtl::ArraySlice<int64> src_index,
+    tensorflow::gtl::ArraySlice<int64> dest_index) {
   DCHECK_EQ(shape().element_type(), src_literal.shape().element_type());
   const int64 src_linear_index = IndexUtil::MultidimensionalIndexToLinearIndex(
       src_literal.shape(), src_index);
@@ -275,8 +280,8 @@
   return Status::OK();
 }
 
-/* static */ StatusOr<std::unique_ptr<Literal>> Literal::CreateFromProto(
-    const LiteralProto& proto) {
+/* static */ StatusOr<std::unique_ptr<Literal>>
+MutableLiteralBase::CreateFromProto(const LiteralProto& proto) {
   if (!proto.has_shape()) {
     return InvalidArgument("LiteralProto has no shape");
   }
@@ -405,9 +410,9 @@
   return Status::OK();
 }
 
-Status Literal::CopyFrom(const LiteralSlice& src_literal,
-                         const ShapeIndex& dest_shape_index,
-                         const ShapeIndex& src_shape_index) {
+Status MutableLiteralBase::CopyFrom(const LiteralSlice& src_literal,
+                                    const ShapeIndex& dest_shape_index,
+                                    const ShapeIndex& src_shape_index) {
   const Shape& dest_subshape =
       ShapeUtil::GetSubshape(shape(), dest_shape_index);
   const Shape& src_subshape =
@@ -482,10 +487,11 @@
   return Status::OK();
 }
 
-Status Literal::CopySliceFrom(const LiteralSlice& src_literal,
-                              tensorflow::gtl::ArraySlice<int64> src_base,
-                              tensorflow::gtl::ArraySlice<int64> dest_base,
-                              tensorflow::gtl::ArraySlice<int64> copy_size) {
+Status MutableLiteralBase::CopySliceFrom(
+    const LiteralSlice& src_literal,
+    tensorflow::gtl::ArraySlice<int64> src_base,
+    tensorflow::gtl::ArraySlice<int64> dest_base,
+    tensorflow::gtl::ArraySlice<int64> copy_size) {
   TF_RET_CHECK(ShapeUtil::IsArray(shape())) << ShapeUtil::HumanString(shape());
   TF_RET_CHECK(ShapeUtil::IsArray(src_literal.shape()))
       << ShapeUtil::HumanString(src_literal.shape());
@@ -543,7 +549,7 @@
       shape().element_type());
 }
 
-void Literal::PopulateR1(const tensorflow::core::Bitmap& values) {
+void MutableLiteralBase::PopulateR1(const tensorflow::core::Bitmap& values) {
   CHECK(ShapeUtil::IsArray(shape()));
   CHECK_EQ(ShapeUtil::Rank(shape()), 1);
   CHECK_EQ(element_count(), values.bits());
@@ -895,8 +901,8 @@
   return hash_value;
 }
 
-Status Literal::SetIntegralAsS64(tensorflow::gtl::ArraySlice<int64> multi_index,
-                                 int64 value) {
+Status MutableLiteralBase::SetIntegralAsS64(
+    tensorflow::gtl::ArraySlice<int64> multi_index, int64 value) {
   CHECK(LayoutUtil::IsDenseArray(shape()));
   switch (shape().element_type()) {
     case PRED:
@@ -933,7 +939,7 @@
   return p.sparse_indices()->At(sparse_element_number);
 }
 
-void Literal::SortSparseElements(const ShapeIndex& shape_index) {
+void MutableLiteralBase::SortSparseElements(const ShapeIndex& shape_index) {
   piece(shape_index).SortSparseElements();
 }
 
@@ -1391,11 +1397,11 @@
     elements.push_back(std::move(*new_element));
   }
   auto converted = MakeUnique<Literal>();
-  *converted = Literal::MoveIntoTuple(&elements);
+  *converted = MutableLiteralBase::MoveIntoTuple(&elements);
   return std::move(converted);
 }
 
-/* static */ Literal Literal::MoveIntoTuple(
+/* static */ Literal MutableLiteralBase::MoveIntoTuple(
     tensorflow::gtl::MutableArraySlice<Literal> elements) {
   std::vector<Shape> element_shapes;
   for (const Literal& element : elements) {
@@ -1808,7 +1814,8 @@
 }  // namespace
 
 Status LiteralBase::Piece::CopyFromProto(const LiteralProto& proto) {
-  // These conditions should have been checked in Literal::CreateFromProto.
+  // These conditions should have been checked in
+  // MutableLiteralBase::CreateFromProto.
   TF_RET_CHECK(proto.has_shape());
   TF_RET_CHECK(LayoutUtil::HasLayout(proto.shape()));
   TF_RET_CHECK(ShapeUtil::Equal(proto.shape(), subshape()));
@@ -1900,7 +1907,7 @@
   return piece(shape_index).untyped_data();
 }
 
-void* Literal::untyped_data(const ShapeIndex& shape_index) {
+void* MutableLiteralBase::untyped_data(const ShapeIndex& shape_index) {
   return piece(shape_index).untyped_data();
 }
 
@@ -1916,6 +1923,127 @@
                 ShapeUtil::ElementsIn(shape()));
 }
 
+void MutableBorrowingLiteral::CopyPieceSubtree(const Shape& shape,
+                                               Piece* src_piece,
+                                               Piece* dest_piece) {
+  DCHECK(ShapeUtil::Equal(src_piece->subshape(), dest_piece->subshape()))
+      << "src_piece has shape: "
+      << ShapeUtil::HumanString(src_piece->subshape())
+      << "dest_piece has shape: "
+      << ShapeUtil::HumanString(dest_piece->subshape());
+  if (ShapeUtil::IsTuple(shape)) {
+    for (int i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) {
+      const Shape& subshape = shape.tuple_shapes(i);
+
+      auto child_piece = Piece();
+      child_piece.set_subshape(&subshape);
+
+      CopyPieceSubtree(subshape, &src_piece->child(i), &child_piece);
+
+      dest_piece->emplace_back(std::move(child_piece));
+    }
+  } else if (ShapeUtil::IsArray(shape)) {
+    dest_piece->set_buffer(src_piece->buffer());
+  } else {
+    // If the shape is neither an array nor tuple, then it must be
+    // zero-sized. Otherwise, some memory needs to be allocated for it.
+    CHECK_EQ(dest_piece->size_bytes(), 0);
+  }
+}
+
+MutableLiteralBase::~MutableLiteralBase() {}
+
+MutableBorrowingLiteral::MutableBorrowingLiteral(
+    const MutableBorrowingLiteral& literal)
+    : MutableLiteralBase() {
+  shape_ = MakeUnique<Shape>(literal.shape());
+  CHECK(LayoutUtil::HasLayout(*shape_));
+
+  root_piece_ = new Piece();
+  root_piece_->set_subshape(shape_.get());
+
+  CopyPieceSubtree(*shape_, &literal.root_piece(), root_piece_);
+}
+
+MutableBorrowingLiteral& MutableBorrowingLiteral::operator=(
+    const MutableBorrowingLiteral& literal) {
+  shape_ = MakeUnique<Shape>(literal.shape());
+  CHECK(LayoutUtil::HasLayout(*shape_));
+
+  root_piece_ = new Piece();
+  root_piece_->set_subshape(shape_.get());
+
+  CopyPieceSubtree(*shape_, &literal.root_piece(), root_piece_);
+
+  return *this;
+}
+
+MutableBorrowingLiteral::MutableBorrowingLiteral(
+    const MutableLiteralBase& literal)
+    : MutableLiteralBase() {
+  shape_ = MakeUnique<Shape>(literal.shape());
+  CHECK(LayoutUtil::HasLayout(*shape_));
+
+  root_piece_ = new Piece();
+  root_piece_->set_subshape(shape_.get());
+
+  CopyPieceSubtree(*shape_, &literal.root_piece(), root_piece_);
+}
+
+MutableBorrowingLiteral::MutableBorrowingLiteral(MutableLiteralBase* literal)
+    : MutableLiteralBase() {
+  shape_ = MakeUnique<Shape>(literal->shape());
+  CHECK(LayoutUtil::HasLayout(*shape_));
+
+  root_piece_ = new Piece();
+  root_piece_->set_subshape(shape_.get());
+
+  CopyPieceSubtree(*shape_, &literal->root_piece(), root_piece_);
+}
+
+MutableBorrowingLiteral::MutableBorrowingLiteral(
+    MutableBorrowingLiteral literal, const ShapeIndex& view_root)
+    : MutableLiteralBase() {
+  shape_ = MakeUnique<Shape>(literal.piece(view_root).subshape());
+  CHECK(LayoutUtil::HasLayout(*shape_));
+
+  root_piece_ = new Piece();
+  root_piece_->set_subshape(shape_.get());
+
+  CopyPieceSubtree(*shape_, &literal.piece(view_root), root_piece_);
+}
+
+MutableBorrowingLiteral::MutableBorrowingLiteral(const char* src_buf_ptr,
+                                                 const Shape& shape)
+    : MutableLiteralBase() {
+  shape_ = MakeUnique<Shape>(shape);
+  CHECK(LayoutUtil::HasLayout(*shape_));
+  CHECK(!ShapeUtil::IsTuple(*shape_));
+
+  root_piece_ = new Piece();
+  root_piece_->set_buffer(const_cast<char*>(src_buf_ptr));
+  root_piece_->set_subshape(shape_.get());
+}
+
+MutableBorrowingLiteral::~MutableBorrowingLiteral() {
+  if (root_piece_ != nullptr) {
+    root_piece_->ForEachMutableSubpiece(
+        [&](const ShapeIndex& index, Piece* piece) {
+          if (piece->buffer() != nullptr) {
+            delete piece->sparse_indices();
+          }
+        });
+    delete root_piece_;
+  }
+}
+
+LiteralSlice::LiteralSlice(const LiteralBase& literal)
+    : LiteralBase(), root_piece_(&literal.root_piece()) {}
+
+LiteralSlice::LiteralSlice(const LiteralBase& literal,
+                           const ShapeIndex& view_root)
+    : LiteralBase(), root_piece_(&literal.piece(view_root)) {}
+
 void BorrowingLiteral::BuildPieceSubtree(const Shape& shape, Piece* piece) {
   CHECK(ShapeUtil::IsTuple(shape));
   for (int i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) {
@@ -1932,13 +2060,6 @@
   }
 }
 
-LiteralSlice::LiteralSlice(const LiteralBase& literal)
-    : LiteralBase(), root_piece_(&literal.root_piece()) {}
-
-LiteralSlice::LiteralSlice(const LiteralBase& literal,
-                           const ShapeIndex& view_root)
-    : LiteralBase(), root_piece_(&literal.piece(view_root)) {}
-
 BorrowingLiteral::BorrowingLiteral(const char* src_buf_ptr, const Shape& shape)
     : LiteralBase(), shape_(MakeUnique<Shape>(shape)) {
   CHECK(ShapeUtil::IsArray(*shape_));
diff --git a/tensorflow/compiler/xla/literal.h b/tensorflow/compiler/xla/literal.h
index dd67dfa..92c0f90 100644
--- a/tensorflow/compiler/xla/literal.h
+++ b/tensorflow/compiler/xla/literal.h
@@ -310,9 +310,10 @@
   // type of literal itself (0 for numeric types, and false for predicates).
   //
   // Note: It's an antipattern to use this method then immediately call
-  // Literal::Populate on the result (since that results in zero initialization,
-  // then reinitialization. Conside if a call to MakeUnique<Literal>(shape),
-  // followed by the call to Literal::Populate can be used instead.
+  // MutableLiteralBase::Populate on the result (since that results in zero
+  // initialization, then reinitialization. Conside if a call to
+  // MakeUnique<Literal>(shape), followed by the call to
+  // MutableLiteralBase::Populate can be used instead.
   static std::unique_ptr<Literal> CreateFromShape(const Shape& shape);
 
  protected:
@@ -534,7 +535,7 @@
   virtual const Piece& root_piece() const = 0;
 
   // LiteralSlice and Literal must access Pieces of other Literals.
-  friend class Literal;
+  friend class MutableLiteralBase;
   friend class LiteralSlice;
   friend class BorrowingLiteral;
 
@@ -545,33 +546,10 @@
       tensorflow::gtl::ArraySlice<int64> start_indices) const;
 };
 
-// Class representing literal values in XLA.
-//
-// The underlying buffer and shape is always owned by this class.
-class Literal : public LiteralBase {
+// Abstract base class representing a mutable literal in XLA.
+class MutableLiteralBase : public LiteralBase {
  public:
-  Literal() : Literal(ShapeUtil::MakeNil()) {}
-
-  // Create a literal of the given shape. The literal is allocated sufficient
-  // memory to hold the shape. Memory is uninitialized.
-  explicit Literal(const Shape& shape);
-  virtual ~Literal();
-
-  // Literals are moveable, but not copyable. To copy a literal use
-  // Literal::Clone or Literal::CloneToUnique. This prevents inadvertent copies
-  // of literals which can be expensive.
-  Literal(const Literal& other) = delete;
-  Literal& operator=(const Literal& other) = delete;
-  Literal(Literal&& other);
-  // 'allocate_arrays' indicates whether to allocate memory for the arrays in
-  // the shape. If false, buffer pointers inside of the Literal::Pieces are set
-  // to nullptr.
-  Literal(const Shape& shape, bool allocate_arrays);
-  Literal& operator=(Literal&& other);
-
-  // TODO(b/67651157): Remove this accessor. Literal users should not be able to
-  // mutate the shape as this can produce malformed Literals.
-  Shape* mutable_shape_do_not_use() { return shape_.get(); }
+  virtual ~MutableLiteralBase() = 0;
 
   // Returns a MutableArraySlice view of the array for this literal for the
   // given NativeT (e.g., float). CHECKs if the subshape of the literal at the
@@ -587,6 +565,10 @@
   // is not a sparse array.
   SparseIndexArray* sparse_indices(const ShapeIndex& shape_index = {});
 
+  // TODO(b/67651157): Remove this accessor. Literal users should not be able to
+  // mutate the shape as this can produce malformed Literals.
+  Shape* mutable_shape_do_not_use() { return shape_.get(); }
+
   // Returns a pointer to the underlying buffer holding the array at the given
   // shape index. CHECKs if the subshape of the literal at the given ShapeIndex
   // is not array.
@@ -613,21 +595,6 @@
                   const ShapeIndex& dest_shape_index = {},
                   const ShapeIndex& src_shape_index = {});
 
-  // Returns a vector containing the tuple elements of this Literal as separate
-  // Literals. This Literal must be tuple-shaped and can be a nested tuple. The
-  // elements are moved into the new Literals; no data is copied. Upon return
-  // this Literal is set to a nil shape (empty tuple)
-  std::vector<Literal> DecomposeTuple();
-
-  // Similar to CopyFrom, but with move semantincs. The subshape of this literal
-  // rooted at 'dest_shape_index' must be *equal* to the shape 'src_literal'
-  // (layouts and shapes must match), but need not be arrays. The memory
-  // allocated in this literal for the subshape at dest_shape_index is
-  // deallocated, and the respective buffers are replaced with those in
-  // src_literal. Upon return, src_literal is set to a nil shape (empty tuple).
-  Status MoveFrom(Literal&& src_literal,
-                  const ShapeIndex& dest_shape_index = {});
-
   // Copies the values from src_literal, starting at src_base shape indexes,
   // to this literal, starting at dest_base, where the copy size in each
   // dimension is specified by copy_size.
@@ -730,12 +697,7 @@
   static StatusOr<std::unique_ptr<Literal>> CreateFromProto(
       const LiteralProto& proto);
 
- private:
-  // Recursively sets the subshapes and buffers of all subpieces rooted at
-  // 'piece'. If 'allocate_array' is true, memory is allocated for the arrays in
-  // the shape.
-  void SetPiece(const Shape& shape, Piece* piece, bool allocate_arrays);
-
+ protected:
   // Returns the piece at the given ShapeIndex.
   Piece& piece(const ShapeIndex& shape_index) {
     return const_cast<Piece&>(LiteralBase::piece(shape_index));
@@ -783,12 +745,83 @@
   template <typename NativeT, typename FnType>
   Status PopulateInternal(const FnType& generator, bool parallel);
 
+  friend class LiteralBase;
+  friend class MutableBorrowingLiteral;
+};
+std::ostream& operator<<(std::ostream& out, const Literal& literal);
+
+// The underlying buffer and shape is always owned by this class.
+class Literal : public MutableLiteralBase {
+ public:
+  Literal() : Literal(ShapeUtil::MakeNil()) {}
+
+  // Create a literal of the given shape. The literal is allocated sufficient
+  // memory to hold the shape. Memory is uninitialized.
+  explicit Literal(const Shape& shape);
+  virtual ~Literal();
+
+  // Literals are moveable, but not copyable. To copy a literal use
+  // Literal::Clone or Literal::CloneToUnique. This prevents inadvertent copies
+  // of literals which can be expensive.
+  Literal(const Literal& other) = delete;
+  Literal& operator=(const Literal& other) = delete;
+  Literal(Literal&& other);
+  // 'allocate_arrays' indicates whether to allocate memory for the arrays in
+  // the shape. If false, buffer pointers inside of the Literal::Pieces are set
+  // to nullptr.
+  Literal(const Shape& shape, bool allocate_arrays);
+  Literal& operator=(Literal&& other);
+
+  // Similar to CopyFrom, but with move semantincs. The subshape of this literal
+  // rooted at 'dest_shape_index' must be *equal* to the shape 'src_literal'
+  // (layouts and shapes must match), but need not be arrays. The memory
+  // allocated in this literal for the subshape at dest_shape_index is
+  // deallocated, and the respective buffers are replaced with those in
+  // src_literal. Upon return, src_literal is set to a nil shape (empty tuple).
+  virtual Status MoveFrom(Literal&& src_literal,
+                          const ShapeIndex& dest_shape_index = {});
+
+  // Returns a vector containing the tuple elements of this Literal as separate
+  // Literals. This Literal must be tuple-shaped and can be a nested tuple. The
+  // elements are moved into the new Literals; no data is copied. Upon return
+  // this Literal is set to a nil shape (empty tuple)
+  std::vector<Literal> DecomposeTuple();
+
+ private:
   // Deallocate the buffers held by this literal.
   void DeallocateBuffers();
 
-  friend class LiteralBase;
+  // Recursively sets the subshapes and buffers of all subpieces rooted at
+  // 'piece'. If 'allocate_array' is true, memory is allocated for the arrays in
+  // the shape.
+  void SetPiece(const Shape& shape, Piece* piece, bool allocate_arrays);
 };
-std::ostream& operator<<(std::ostream& out, const Literal& literal);
+
+// The underlying buffer is not owned by this class and is always owned by
+// others. The shape is not owned by this class and not mutable.
+class MutableBorrowingLiteral : public MutableLiteralBase {
+ public:
+  virtual ~MutableBorrowingLiteral();
+
+  MutableBorrowingLiteral() : MutableLiteralBase() {}
+
+  MutableBorrowingLiteral(const MutableBorrowingLiteral& literal);
+  MutableBorrowingLiteral& operator=(const MutableBorrowingLiteral& literal);
+
+  // Implicit conversion constructors.
+  MutableBorrowingLiteral(const MutableLiteralBase& literal);
+  MutableBorrowingLiteral(MutableLiteralBase* literal);
+  MutableBorrowingLiteral(MutableBorrowingLiteral literal,
+                          const ShapeIndex& view_root);
+  MutableBorrowingLiteral(const char* src_buf_ptr, const Shape& shape);
+
+ private:
+  // Recursively copies the subtree from the `src_piece` at the given child
+  // index to the `dest_piece`. For buffers only the pointers are copied, but
+  // not the content.
+  void CopyPieceSubtree(const Shape& shape, Piece* src_piece,
+                        Piece* dest_piece);
+};
 
 // A read-only view of a Literal. A LiteralSlice contains pointers to shape and
 // literal buffers always owned by others.
@@ -831,9 +864,9 @@
   const Piece& root_piece() const override { return root_piece_; };
   Piece root_piece_;
 
-  // Shape of this literal. Stored as unique_ptr so such that the (default)
-  // move construction of this class would be trivially correct: the pointer to
-  // Shape root_piece_ stores will still point to the correct address.
+  // Shape of this literal. Stored as unique_ptr such that the (default) move
+  // construction of this class would be trivially correct: the pointer to Shape
+  // root_piece_ stores will still point to the correct address.
   std::unique_ptr<Shape> shape_;
 };
 
@@ -886,7 +919,7 @@
 }
 
 template <typename NativeT>
-tensorflow::gtl::MutableArraySlice<NativeT> Literal::data(
+tensorflow::gtl::MutableArraySlice<NativeT> MutableLiteralBase::data(
     const ShapeIndex& shape_index) {
   return piece(shape_index).data<NativeT>();
 }
@@ -904,14 +937,15 @@
 }
 
 template <typename NativeT>
-inline void Literal::Set(tensorflow::gtl::ArraySlice<int64> multi_index,
-                         const ShapeIndex& shape_index, NativeT value) {
+inline void MutableLiteralBase::Set(
+    tensorflow::gtl::ArraySlice<int64> multi_index,
+    const ShapeIndex& shape_index, NativeT value) {
   return piece(shape_index).Set<NativeT>(multi_index, value);
 }
 
 template <typename NativeT>
-inline void Literal::Set(tensorflow::gtl::ArraySlice<int64> multi_index,
-                         NativeT value) {
+inline void MutableLiteralBase::Set(
+    tensorflow::gtl::ArraySlice<int64> multi_index, NativeT value) {
   return root_piece().Set<NativeT>(multi_index, value);
 }
 
@@ -929,7 +963,7 @@
 }
 
 template <typename NativeT>
-void Literal::AppendSparseElement(
+void MutableLiteralBase::AppendSparseElement(
     tensorflow::gtl::ArraySlice<int64> multi_index, NativeT value,
     const ShapeIndex& shape_index) {
   Piece& p = piece(shape_index);
@@ -959,7 +993,8 @@
 }
 
 template <typename NativeT>
-inline void Literal::PopulateR1(tensorflow::gtl::ArraySlice<NativeT> values) {
+inline void MutableLiteralBase::PopulateR1(
+    tensorflow::gtl::ArraySlice<NativeT> values) {
   CHECK(ShapeUtil::IsArray(shape()));
   CHECK_EQ(ShapeUtil::Rank(shape()), 1);
   CHECK_EQ(ShapeUtil::ElementsIn(shape()), values.size());
@@ -971,7 +1006,7 @@
 }
 
 template <typename NativeT>
-void Literal::PopulateR2(
+void MutableLiteralBase::PopulateR2(
     std::initializer_list<std::initializer_list<NativeT>> values) {
   CHECK(ShapeUtil::IsArray(shape()));
   CHECK_EQ(ShapeUtil::Rank(shape()), 2);
@@ -996,7 +1031,7 @@
 }
 
 template <typename NativeT>
-void Literal::PopulateFromArray(const Array<NativeT>& values) {
+void MutableLiteralBase::PopulateFromArray(const Array<NativeT>& values) {
   CHECK(ShapeUtil::IsArray(shape()));
   CHECK_EQ(shape().element_type(),
            primitive_util::NativeToPrimitiveType<NativeT>());
@@ -1009,24 +1044,24 @@
 }
 
 template <typename NativeT>
-void Literal::PopulateR2FromArray2D(const Array2D<NativeT>& values) {
+void MutableLiteralBase::PopulateR2FromArray2D(const Array2D<NativeT>& values) {
   PopulateFromArray(values);
 }
 
 template <typename NativeT>
-void Literal::PopulateR3FromArray3D(const Array3D<NativeT>& values) {
+void MutableLiteralBase::PopulateR3FromArray3D(const Array3D<NativeT>& values) {
   PopulateFromArray(values);
 }
 
 template <typename NativeT>
-void Literal::PopulateR4FromArray4D(const Array4D<NativeT>& values) {
+void MutableLiteralBase::PopulateR4FromArray4D(const Array4D<NativeT>& values) {
   PopulateFromArray(values);
 }
 
 template <typename NativeT>
-void Literal::PopulateSparse(SparseIndexArray indices,
-                             tensorflow::gtl::ArraySlice<NativeT> values,
-                             bool sort) {
+void MutableLiteralBase::PopulateSparse(
+    SparseIndexArray indices, tensorflow::gtl::ArraySlice<NativeT> values,
+    bool sort) {
   CHECK(LayoutUtil::IsSparseArray(shape()));
   int rank = ShapeUtil::Rank(shape());
   CHECK_EQ(indices.rank(), rank);
@@ -1049,7 +1084,8 @@
 }
 
 template <typename NativeT, typename FnType>
-Status Literal::PopulateInternal(const FnType& generator, bool parallel) {
+Status MutableLiteralBase::PopulateInternal(const FnType& generator,
+                                            bool parallel) {
   const Shape& this_shape = shape();
   const int64 rank = ShapeUtil::Rank(this_shape);
   TF_RET_CHECK(LayoutUtil::IsDenseArray(this_shape));
@@ -1092,17 +1128,17 @@
   return Status::OK();
 }
 template <typename NativeT, typename FnType>
-Status Literal::Populate(const FnType& generator) {
+Status MutableLiteralBase::Populate(const FnType& generator) {
   return PopulateInternal<NativeT>(generator, /*parallel=*/false);
 }
 
 template <typename NativeT, typename FnType>
-Status Literal::PopulateParallel(const FnType& generator) {
+Status MutableLiteralBase::PopulateParallel(const FnType& generator) {
   return PopulateInternal<NativeT>(generator, /*parallel=*/true);
 }
 
 template <typename NativeT>
-void Literal::PopulateWithValue(NativeT value) {
+void MutableLiteralBase::PopulateWithValue(NativeT value) {
   CHECK(ShapeUtil::IsArray(shape()));
   CHECK_EQ(shape().element_type(),
            primitive_util::NativeToPrimitiveType<NativeT>());
diff --git a/tensorflow/compiler/xla/literal_comparison.cc b/tensorflow/compiler/xla/literal_comparison.cc
index 94993cc..6883a6b 100644
--- a/tensorflow/compiler/xla/literal_comparison.cc
+++ b/tensorflow/compiler/xla/literal_comparison.cc
@@ -38,7 +38,8 @@
 // between the left-hand-side and right-hand-side, by bit-casting to UnsignedT
 // -- on miscompare, a nice error message is given in the AssertionFailure.
 template <typename FloatT, typename UnsignedT>
-Status CompareFloatsBitwiseEqual(FloatT lhs, FloatT rhs) {
+Status CompareFloatsBitwiseEqual(
+    FloatT lhs, FloatT rhs, tensorflow::gtl::ArraySlice<int64> multi_index) {
   auto ulhs = tensorflow::bit_cast<UnsignedT>(lhs);
   auto urhs = tensorflow::bit_cast<UnsignedT>(rhs);
   auto lhs_double = static_cast<double>(lhs);
@@ -46,9 +47,10 @@
   if (ulhs != urhs) {
     return InvalidArgument(
         "floating values are not bitwise-equal; and equality testing "
-        "was requested: %s=%g=%a vs %s=%g=%a",
+        "was requested: %s=%g=%a vs %s=%g=%a at index %s",
         StrCat(tensorflow::strings::Hex(ulhs)).c_str(), lhs_double, lhs_double,
-        StrCat(tensorflow::strings::Hex(urhs)).c_str(), rhs_double, rhs_double);
+        StrCat(tensorflow::strings::Hex(urhs)).c_str(), rhs_double, rhs_double,
+        LiteralUtil::MultiIndexAsString(multi_index).c_str());
   }
   return Status::OK();
 }
@@ -57,39 +59,48 @@
 // bitwise helper above (this is the un-specialized fallback, to just use the
 // default gunit implementation).
 template <typename NativeT>
-Status CompareEqual(NativeT lhs, NativeT rhs) {
+Status CompareEqual(NativeT lhs, NativeT rhs,
+                    tensorflow::gtl::ArraySlice<int64> multi_index) {
   if (lhs == rhs) {
     return Status::OK();
   }
-  return InvalidArgument("Expected equality of these values:\n  %s\n  %s",
-                         StrCat(lhs).c_str(), StrCat(rhs).c_str());
+  return InvalidArgument(
+      "Expected equality of these values:\n  %s\n  %s\nat index %s",
+      StrCat(lhs).c_str(), StrCat(rhs).c_str(),
+      LiteralUtil::MultiIndexAsString(multi_index).c_str());
 }
 
 // Specializations for floating types that do bitwise comparisons when equality
 // comparison is requested.
 template <>
-Status CompareEqual<bfloat16>(bfloat16 lhs, bfloat16 rhs) {
-  return CompareFloatsBitwiseEqual<bfloat16, uint16>(lhs, rhs);
+Status CompareEqual<bfloat16>(bfloat16 lhs, bfloat16 rhs,
+                              tensorflow::gtl::ArraySlice<int64> multi_index) {
+  return CompareFloatsBitwiseEqual<bfloat16, uint16>(lhs, rhs, multi_index);
 }
 template <>
-Status CompareEqual<Eigen::half>(Eigen::half lhs, Eigen::half rhs) {
-  return CompareFloatsBitwiseEqual<Eigen::half, uint16>(lhs, rhs);
+Status CompareEqual<Eigen::half>(
+    Eigen::half lhs, Eigen::half rhs,
+    tensorflow::gtl::ArraySlice<int64> multi_index) {
+  return CompareFloatsBitwiseEqual<Eigen::half, uint16>(lhs, rhs, multi_index);
 }
 template <>
-Status CompareEqual<float>(float lhs, float rhs) {
-  return CompareFloatsBitwiseEqual<float, uint32>(lhs, rhs);
+Status CompareEqual<float>(float lhs, float rhs,
+                           tensorflow::gtl::ArraySlice<int64> multi_index) {
+  return CompareFloatsBitwiseEqual<float, uint32>(lhs, rhs, multi_index);
 }
 template <>
-Status CompareEqual<double>(double lhs, double rhs) {
-  return CompareFloatsBitwiseEqual<double, uint64>(lhs, rhs);
+Status CompareEqual<double>(double lhs, double rhs,
+                            tensorflow::gtl::ArraySlice<int64> multi_index) {
+  return CompareFloatsBitwiseEqual<double, uint64>(lhs, rhs, multi_index);
 }
 template <>
-Status CompareEqual<complex64>(complex64 lhs, complex64 rhs) {
-  auto res = CompareEqual<float>(lhs.real(), rhs.real());
+Status CompareEqual<complex64>(complex64 lhs, complex64 rhs,
+                               tensorflow::gtl::ArraySlice<int64> multi_index) {
+  auto res = CompareEqual<float>(lhs.real(), rhs.real(), multi_index);
   if (!res.ok()) {
     return res;
   }
-  return CompareEqual<float>(lhs.imag(), rhs.imag());
+  return CompareEqual<float>(lhs.imag(), rhs.imag(), multi_index);
 }
 
 // A recursive function which iterates through every index of expected and
@@ -102,7 +113,7 @@
   if (dimension == expected.shape().dimensions_size()) {
     NativeT expected_value = expected.Get<NativeT>(multi_index);
     NativeT actual_value = actual.Get<NativeT>(multi_index);
-    return CompareEqual<NativeT>(expected_value, actual_value);
+    return CompareEqual<NativeT>(expected_value, actual_value, multi_index);
   }
 
   Status result;
@@ -720,12 +731,10 @@
     return Status::OK();
   }
 
-  return AppendStatus(result,
-                      tensorflow::strings::Printf(
-                          "\nat index: %s\nexpected: %s\nactual:   %s",
-                          LiteralUtil::MultiIndexAsString(multi_index).c_str(),
-                          ToStringTruncated(expected).c_str(),
-                          ToStringTruncated(actual).c_str()));
+  return AppendStatus(
+      result, tensorflow::strings::Printf("\nexpected: %s\nactual:   %s",
+                                          ToStringTruncated(expected).c_str(),
+                                          ToStringTruncated(actual).c_str()));
 }
 
 Status Near(const LiteralSlice& expected, const LiteralSlice& actual,
diff --git a/tensorflow/compiler/xla/literal_util.cc b/tensorflow/compiler/xla/literal_util.cc
index 356f12e..5d33df7 100644
--- a/tensorflow/compiler/xla/literal_util.cc
+++ b/tensorflow/compiler/xla/literal_util.cc
@@ -34,6 +34,7 @@
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/platform/types.h"
 
 using tensorflow::strings::StrCat;
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 528b7fd..a65bdeb 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -570,7 +570,7 @@
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
-        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:core_cpu_lib",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "//third_party/eigen3",
@@ -613,6 +613,7 @@
         "//tensorflow/compiler/xla:xla_proto",
         "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/core:lib",
+        "//tensorflow/core:ptr_util",
         "//tensorflow/core:stream_executor_no_cuda",
     ],
     alwayslink = 1,
@@ -1233,6 +1234,20 @@
     ],
 )
 
+cc_library(
+    name = "scatter_expander",
+    srcs = ["scatter_expander.cc"],
+    hdrs = ["scatter_expander.h"],
+    deps = [
+        ":hlo",
+        ":hlo_creation_utils",
+        ":hlo_pass",
+        ":while_util",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:statusor",
+    ],
+)
+
 tf_cc_test(
     name = "batchnorm_expander_test",
     size = "small",
@@ -1385,14 +1400,59 @@
 )
 
 cc_library(
+    name = "convolution_feature_group_converter",
+    srcs = ["convolution_feature_group_converter.cc"],
+    hdrs = ["convolution_feature_group_converter.h"],
+    deps = [
+        ":hlo",
+        ":hlo_pass",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "convolution_feature_group_converter_test",
+    size = "small",
+    srcs = ["convolution_feature_group_converter_test.cc"],
+    deps = [
+        ":convolution_feature_group_converter",
+        ":hlo",
+        ":hlo_matchers",
+        ":hlo_parser",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+    ],
+)
+
+cc_library(
+    name = "while_loop_analysis",
+    srcs = ["while_loop_analysis.cc"],
+    hdrs = ["while_loop_analysis.h"],
+    deps = [
+        ":hlo",
+        ":hlo_evaluator",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/core:lib",
+    ],
+)
+
+cc_library(
     name = "while_loop_simplifier",
     srcs = ["while_loop_simplifier.cc"],
     hdrs = ["while_loop_simplifier.h"],
     deps = [
         ":call_inliner",
         ":hlo",
-        ":hlo_evaluator",
         ":hlo_pass",
+        ":while_loop_analysis",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/core:lib",
     ],
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
index 946ef6f..f7812d9 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
@@ -1705,6 +1705,10 @@
         reshape, HloInstruction::CreateReshape(reshape->shape(),
                                                operand->mutable_operand(0)));
   }
+  if (operand->opcode() == HloOpcode::kRng && operand->user_count() == 1) {
+    *operand->mutable_shape() = reshape->shape();
+    return ReplaceInstruction(reshape, operand);
+  }
 
   if (HloOpcode::kBroadcast == reshape->operand(0)->opcode()) {
     auto opt_dims = ReshapeLeavesDimensionsUnmodified(
@@ -1803,6 +1807,12 @@
 }
 
 Status AlgebraicSimplifierVisitor::HandleReduce(HloInstruction* reduce) {
+  // TODO(b/112040122): Most of those optimizations can be done for multi-output
+  // reduces.
+  if (ShapeUtil::IsTuple(reduce->shape())) {
+    return Status::OK();
+  }
+
   auto arg = reduce->mutable_operand(0);
   auto init_value = reduce->mutable_operand(1);
   tensorflow::gtl::ArraySlice<int64> dimensions(reduce->dimensions());
@@ -2138,6 +2148,11 @@
                                            transpose->dimensions())));
   }
 
+  if (operand->opcode() == HloOpcode::kRng && operand->user_count() == 1) {
+    *operand->mutable_shape() = transpose->shape();
+    return ReplaceInstruction(transpose, operand);
+  }
+
   if (is_layout_sensitive_ && TransposeIsBitcast(transpose)) {
     ReplaceWithBitcast(transpose);
     return Status::OK();
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
index 862cbee..5837391 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
@@ -1428,6 +1428,37 @@
   EXPECT_THAT(computation->root_instruction(), op::Reshape(param0));
 }
 
+// Test transforming reshapes and transposes of rng.
+TEST_F(AlgebraicSimplifierTest, ReshapeOfTransposeOfRngToRng) {
+  HloComputation::Builder builder(TestName());
+  HloInstruction* zero = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0f)));
+  HloInstruction* one = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0f)));
+  HloInstruction* rng0 = builder.AddInstruction(
+      HloInstruction::CreateRng(ShapeUtil::MakeShape(F32, {2, 2}),
+                                RandomDistribution::RNG_UNIFORM, {zero, one}));
+
+  HloInstruction* transpose = builder.AddInstruction(
+      HloInstruction::CreateTranspose(rng0->shape(), rng0, {1, 0}));
+  Shape reshape_shape = builder
+                            .AddInstruction(HloInstruction::CreateReshape(
+                                ShapeUtil::MakeShape(F32, {4}), transpose))
+                            ->shape();
+
+  auto computation = module().AddEntryComputation(builder.Build());
+
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 bitcasting_callback());
+  EXPECT_TRUE(simplifier.Run(&module()).ValueOrDie());
+
+  // Verify that that reshape(transpose(rng)) is replace by a single rng of the
+  // same shape as the reshape.
+  EXPECT_THAT(computation->root_instruction(), op::Rng());
+  EXPECT_TRUE(ShapeUtil::Equal(computation->root_instruction()->shape(),
+                               reshape_shape));
+}
+
 // Test transforming reshapes to bitcasts under various conditions.
 TEST_F(AlgebraicSimplifierTest, ReshapeReplacedWithBitcast) {
   HloComputation::Builder builder(TestName());
diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc
index 118a11c..cfd26fc 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment.cc
@@ -139,6 +139,7 @@
           case HloOpcode::kMap:
           case HloOpcode::kReduce:
           case HloOpcode::kReduceWindow:
+          case HloOpcode::kScatter:
           case HloOpcode::kSelectAndScatter:
           case HloOpcode::kFusion:
             // Map/reduce etc computations are always thread-local.
diff --git a/tensorflow/compiler/xla/service/call_graph.cc b/tensorflow/compiler/xla/service/call_graph.cc
index a23427f..985ff30 100644
--- a/tensorflow/compiler/xla/service/call_graph.cc
+++ b/tensorflow/compiler/xla/service/call_graph.cc
@@ -61,6 +61,7 @@
     case HloOpcode::kMap:
     case HloOpcode::kReduce:
     case HloOpcode::kReduceWindow:
+    case HloOpcode::kScatter:
     case HloOpcode::kSelectAndScatter:
     case HloOpcode::kFusion:
       return CallContext::kParallel;
diff --git a/tensorflow/compiler/xla/service/compiler.h b/tensorflow/compiler/xla/service/compiler.h
index 99abb9b..34f7fe1 100644
--- a/tensorflow/compiler/xla/service/compiler.h
+++ b/tensorflow/compiler/xla/service/compiler.h
@@ -48,11 +48,6 @@
 // compuation.
 using ObjectFileData = std::vector<char>;
 
-// Contains the buffer sizes information needed to allocate buffers to execute
-// an ahead-of-time computation.  Entries which contain -1 designate a parameter
-// which should be skipped over during allocation.
-using BufferSizes = std::vector<int64>;
-
 // Abstract superclass describing the result of an ahead-of-time compilation.
 class AotCompilationResult {
  public:
diff --git a/tensorflow/compiler/xla/service/convolution_feature_group_converter.cc b/tensorflow/compiler/xla/service/convolution_feature_group_converter.cc
new file mode 100644
index 0000000..45252fc
--- /dev/null
+++ b/tensorflow/compiler/xla/service/convolution_feature_group_converter.cc
@@ -0,0 +1,248 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/convolution_feature_group_converter.h"
+
+#include <memory>
+#include <vector>
+
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace xla {
+
+namespace {
+
+// ConvolutionVisitor traverses the HLO computation and rewrites Convolution
+// operations with feature_group_count > 1 into convolutions with
+// feature_group_count = 1.
+class ConvolutionVisitor : public DfsHloVisitorWithDefault {
+ public:
+  // Default visitor action is to do nothing and return OK.
+  Status DefaultAction(HloInstruction* /*hlo_instruction*/) override {
+    return Status::OK();
+  }
+
+  Status HandleConvolution(HloInstruction* convolution) override;
+
+  // Runs the visitor on a computation.
+  static bool Run(HloComputation* computation);
+
+  // Returns whether any convolution ops were rewritten.
+  const bool changed() const { return changed_; }
+
+  ~ConvolutionVisitor() override = default;
+
+ private:
+  explicit ConvolutionVisitor(HloComputation* computation)
+      : computation_(computation) {}
+
+  // Current HloComputation instance the ConvolutionVisitor is traversing.
+  HloComputation* computation_;
+
+  // Whether rewrite has occurred.
+  bool changed_ = false;
+};
+
+bool ConvolutionVisitor::Run(HloComputation* computation) {
+  ConvolutionVisitor visitor(computation);
+  TF_CHECK_OK(computation->Accept(&visitor));
+  return visitor.changed_;
+}
+
+Shape ExpandedFilterShape(const Shape& shape, int64 group_count,
+                          int64 input_feature_dim) {
+  int64 num_dims = shape.dimensions_size();
+  CHECK_GE(num_dims, 2);
+  Shape expanded_shape = shape;
+  expanded_shape.set_dimensions(
+      input_feature_dim, shape.dimensions(input_feature_dim) * group_count);
+  return expanded_shape;
+}
+
+// Returns a vector with 'group_count' many groups, where the i-th group
+// consists of the value i repeated 'group_size' times.
+std::vector<int32> GetMaskIds(int64 group_size, int64 group_count) {
+  std::vector<int32> values;
+  for (int i = 0; i < group_count; ++i) {
+    for (int j = 0; j < group_size; ++j) {
+      values.push_back(i);
+    }
+  }
+  return values;
+}
+
+// Create a mask for grouped convolution that will make a normal convolution
+// produce the same results as a grouped convolution. For a [2, 1, 6]
+// filter this returns a [2, 3, 6] mask
+//   1 1 0 0 0 0
+//   0 0 1 1 0 0
+//   0 0 0 0 1 1
+//
+//   1 1 0 0 0 0
+//   0 0 1 1 0 0
+//   0 0 0 0 1 1
+//
+// The first step is to create a rank 1 constant:
+//   0 1 2
+//
+// This is broadcasted to
+//   0 0 0 0 0 0
+//   1 1 1 1 1 1
+//   2 2 2 2 2 2
+//
+//   0 0 0 0 0 0
+//   1 1 1 1 1 1
+//   2 2 2 2 2 2
+//
+// Then we create another rank 1 constant
+//   0 0 1 1 2 2
+//
+// This is broadcasted to
+//   0 0 1 1 2 2
+//   0 0 1 1 2 2
+//   0 0 1 1 2 2
+//
+//   0 0 1 1 2 2
+//   0 0 1 1 2 2
+//   0 0 1 1 2 2
+//
+// Finally we use the Eq op of these two broadcasted constants and get the
+// desired mask.
+HloInstruction* GetExpandedFilterMask(
+    const Shape& filter_shape, int64 input_feature_dim,
+    int64 output_feature_dim, int64 group_count,
+    const std::function<HloInstruction*(std::unique_ptr<HloInstruction>)>&
+        add_instruction) {
+  Shape expanded_filter_shape =
+      ExpandedFilterShape(filter_shape, group_count, input_feature_dim);
+  Shape mask_shape = ShapeUtil::MakeShape(
+      S32, AsInt64Slice(expanded_filter_shape.dimensions()));
+  int64 output_feature = filter_shape.dimensions(output_feature_dim);
+  int64 group_size = filter_shape.dimensions(input_feature_dim);
+
+  // Create an 'input_feature'-sized linspace and an 'output_feature'-sized
+  // linspace that will be broadcast into perpendicular dimensions and compared.
+  const std::vector<int32> input_feature_filter_mask =
+      GetMaskIds(group_size, group_count);
+  const std::vector<int32> output_feature_filter_mask =
+      GetMaskIds(output_feature / group_count, group_count);
+
+  auto mask1 = add_instruction(HloInstruction::CreateConstant(
+      LiteralUtil::CreateR1<int32>(input_feature_filter_mask)));
+  auto broadcasted_mask1 = add_instruction(
+      HloInstruction::CreateBroadcast(mask_shape, mask1, {input_feature_dim}));
+  auto mask2 = add_instruction(HloInstruction::CreateConstant(
+      LiteralUtil::CreateR1<int32>(output_feature_filter_mask)));
+  auto broadcasted_mask2 = add_instruction(
+      HloInstruction::CreateBroadcast(mask_shape, mask2, {output_feature_dim}));
+
+  // Compare the broadcasted output feature linspace to the input feature
+  // linspace to create a diagonal predicate.
+  Shape predicate_shape = ShapeUtil::MakeShape(
+      PRED, AsInt64Slice(expanded_filter_shape.dimensions()));
+  return add_instruction(HloInstruction::CreateBinary(
+      predicate_shape, HloOpcode::kEq, broadcasted_mask1, broadcasted_mask2));
+}
+
+Status ConvolutionVisitor::HandleConvolution(HloInstruction* convolution) {
+  int64 group_count = convolution->feature_group_count();
+  if (group_count == 1) {
+    return Status::OK();
+  }
+  auto filter = convolution->mutable_operand(1);
+  changed_ = true;
+  auto add = [&](std::unique_ptr<HloInstruction> inst) {
+    return computation_->AddInstruction(std::move(inst));
+  };
+
+  auto dim_numbers = convolution->convolution_dimension_numbers();
+  int64 input_feature_dim = dim_numbers.kernel_input_feature_dimension();
+  int64 group_size = filter->shape().dimensions(input_feature_dim);
+  int64 output_feature_dim = dim_numbers.kernel_output_feature_dimension();
+  auto expanded_filter_shape =
+      ExpandedFilterShape(filter->shape(), group_count, input_feature_dim);
+  HloInstruction* filter_mask = GetExpandedFilterMask(
+      filter->shape(), input_feature_dim, output_feature_dim, group_count, add);
+  HloInstruction* expanded_filter;
+  // We want to repeat 'filter' in the 'input_feature_dim' dimension
+  // 'group_count' times.
+  if (group_size == 1) {
+    Shape reshaped_filter_shape =
+        ShapeUtil::DeleteDimension(input_feature_dim, filter->shape());
+    auto reshaped_filter =
+        add(HloInstruction::CreateReshape(reshaped_filter_shape, filter));
+    std::vector<int64> broadcast_dims;
+    for (int64 i = 0; i < filter->shape().dimensions_size(); ++i) {
+      if (i == input_feature_dim) {
+        continue;
+      }
+      broadcast_dims.push_back(i);
+    }
+    expanded_filter = add(HloInstruction::CreateBroadcast(
+        expanded_filter_shape, reshaped_filter, broadcast_dims));
+  } else {
+    // We could possibly also use reshape, broadcast, reshape instead of concat
+    // here, but it would require more complex code, and for depthwise
+    // convolution we would never end up in this branch.
+    std::vector<HloInstruction*> concat_operands(group_count, filter);
+    expanded_filter = add(HloInstruction::CreateConcatenate(
+        expanded_filter_shape, concat_operands, input_feature_dim));
+  }
+  auto zero = add(HloInstruction::CreateConstant(MakeUnique<Literal>(
+      LiteralUtil::Zero(expanded_filter_shape.element_type()))));
+  auto zero_filter =
+      add(HloInstruction::CreateBroadcast(expanded_filter_shape, zero, {}));
+  auto new_filter = add(
+      HloInstruction::CreateTernary(expanded_filter_shape, HloOpcode::kSelect,
+                                    filter_mask, expanded_filter, zero_filter));
+  auto new_convolution = HloInstruction::CreateConvolve(
+      convolution->shape(), convolution->mutable_operand(0), new_filter,
+      convolution->window(), dim_numbers, /*feature_group_count=*/1);
+  TF_RETURN_IF_ERROR(computation_->ReplaceWithNewInstruction(
+      convolution, std::move(new_convolution)));
+  return Status::OK();
+}
+
+}  // namespace
+
+StatusOr<bool> ConvolutionFeatureGroupConverter::Run(HloModule* module) {
+  XLA_VLOG_LINES(2, "ConvolutionFeatureGroupConverter::Run(), before:\n" +
+                        module->ToString());
+  bool changed = false;
+  for (auto* comp : module->MakeNonfusionComputations()) {
+    if (ConvolutionVisitor::Run(comp)) {
+      changed = true;
+    }
+  }
+  XLA_VLOG_LINES(2, "ConvolutionFeatureGroupConverter::Run(), after:\n" +
+                        module->ToString());
+  return changed;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/convolution_feature_group_converter.h b/tensorflow/compiler/xla/service/convolution_feature_group_converter.h
new file mode 100644
index 0000000..f213cc8
--- /dev/null
+++ b/tensorflow/compiler/xla/service/convolution_feature_group_converter.h
@@ -0,0 +1,43 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CONVOLUTION_FEATURE_GROUP_CONVERTER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CONVOLUTION_FEATURE_GROUP_CONVERTER_H_
+
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+
+namespace xla {
+
+// A pass which rewrites convolutions with feature_group_count > 1 into
+// convolutions with feature_group_count = 1.
+class ConvolutionFeatureGroupConverter : public HloPassInterface {
+ public:
+  ConvolutionFeatureGroupConverter() {}
+
+  tensorflow::StringPiece name() const override {
+    return "convolution-feature-group-converter";
+  }
+
+  // Runs convolution rewriting on the given module. Returns whether the
+  // module was changed.
+  StatusOr<bool> Run(HloModule* module) override;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CONVOLUTION_FEATURE_GROUP_CONVERTER_H_
diff --git a/tensorflow/compiler/xla/service/convolution_feature_group_converter_test.cc b/tensorflow/compiler/xla/service/convolution_feature_group_converter_test.cc
new file mode 100644
index 0000000..28373eb
--- /dev/null
+++ b/tensorflow/compiler/xla/service/convolution_feature_group_converter_test.cc
@@ -0,0 +1,100 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/convolution_feature_group_converter.h"
+
+#include <memory>
+#include <string>
+
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/types.h"
+
+namespace xla {
+namespace {
+
+using ConvolutionFeatureGroupConverterTest = HloTestBase;
+namespace op = testing::opcode_matchers;
+
+TEST_F(ConvolutionFeatureGroupConverterTest,
+       ConvertFeatureGroupCountEqualToInputFeatureDim) {
+  string hlo_string = R"(HloModule Convolve1D1Window_0_module
+
+ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,2], filter: f32[1,1,2]) -> f32[1,2,2] {
+  %input = f32[1,2,2]{2,1,0} parameter(0)
+  %copy = f32[1,2,2]{2,0,1} copy(f32[1,2,2]{2,1,0} %input)
+  %filter = f32[1,1,2]{2,1,0} parameter(1)
+  ROOT %convolution = f32[1,2,2]{2,0,1} convolution(f32[1,2,2]{2,0,1} %copy, f32[1,1,2]{2,1,0} %filter), window={size=1}, dim_labels=b0f_0io->b0f, feature_group_count=2
+})";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(hlo_string));
+
+  auto computation = module->entry_computation();
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_EQ(root->opcode(), HloOpcode::kConvolution);
+  ConvolutionFeatureGroupConverter converter;
+  ASSERT_TRUE(converter.Run(module.get()).ValueOrDie());
+  root = computation->root_instruction();
+  // Make sure the convolution is converted to one with feature_group_count = 1.
+  EXPECT_EQ(root->opcode(), HloOpcode::kConvolution);
+  EXPECT_EQ(root->feature_group_count(), 1);
+  // Verify that the filter operand has been replaced.
+  EXPECT_THAT(root->operand(1),
+              op::Select(op::Eq(op::Broadcast(op::Constant()),
+                                op::Broadcast(op::Constant())),
+                         op::Broadcast(op::Reshape(op::Parameter())),
+                         op::Broadcast(op::Constant())));
+}
+
+TEST_F(ConvolutionFeatureGroupConverterTest,
+       ConvertFeatureGroupCountDivisorOfInputFeatureDim) {
+  string hlo_string = R"(HloModule Convolve1D1Window_0_module
+
+ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,4], filter: f32[1,2,2]) -> f32[1,2,2] {
+  %input = f32[1,2,4]{2,1,0} parameter(0)
+  %copy = f32[1,2,4]{2,0,1} copy(f32[1,2,4]{2,1,0} %input)
+  %filter = f32[1,2,2]{2,1,0} parameter(1)
+  ROOT %convolution = f32[1,2,2]{2,0,1} convolution(f32[1,2,4]{2,0,1} %copy, f32[1,2,2]{2,1,0} %filter), window={size=1}, dim_labels=b0f_0io->b0f, feature_group_count=2
+})";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(hlo_string));
+
+  auto computation = module->entry_computation();
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_EQ(root->opcode(), HloOpcode::kConvolution);
+  ConvolutionFeatureGroupConverter converter;
+  ASSERT_TRUE(converter.Run(module.get()).ValueOrDie());
+  root = computation->root_instruction();
+  // Make sure the convolution is converted to one with feature_group_count = 1.
+  EXPECT_EQ(root->opcode(), HloOpcode::kConvolution);
+  EXPECT_EQ(root->feature_group_count(), 1);
+  // Verify that the filter operand has been replaced.
+  EXPECT_THAT(root->operand(1),
+              op::Select(op::Eq(op::Broadcast(op::Constant()),
+                                op::Broadcast(op::Constant())),
+                         // We expect to see Concatenate here instead of
+                         // Broadcast, because feature_group_count < input
+                         // feature dimension.
+                         op::Concatenate(op::Parameter(), op::Parameter()),
+                         op::Broadcast(op::Constant())));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/copy_insertion.cc b/tensorflow/compiler/xla/service/copy_insertion.cc
index 36fb9b4..3e39c1b 100644
--- a/tensorflow/compiler/xla/service/copy_insertion.cc
+++ b/tensorflow/compiler/xla/service/copy_insertion.cc
@@ -312,7 +312,7 @@
   return Status::OK();
 }
 
-// We add copies for all the indices of the true and false computaiton roots,
+// We add copies for all the indices of the true and false computation roots,
 // in order to resolve interference. We later rely on the CopyRemover to drop
 // the unnecessary ones.
 Status AddCopiesForConditional(const HloAliasAnalysis& alias_analysis,
@@ -648,7 +648,12 @@
       //  We can only perform copy elision if the resulting merged values have
       //  totally ordered live ranges; otherwise the merged buffer would have
       //  live range interference.
-      if (IsHead(*dest)) {
+      if (src->next == dest) {
+        // In the process of eliding copies, it's possible for a copy to have the
+        // same source and destination buffer. In this case, the copy can be
+        // safely removed.
+        VLOG(2) << copy->name() << " source and destination buffers are same.";
+      } else if (IsHead(*dest)) {
         // The copy copies an arbitrary value in the source buffer (call it s_x)
         // and defines d_0, the first value in the destination buffer. After
         // merging, the values in the combined buffer must be strictly ordered
diff --git a/tensorflow/compiler/xla/service/copy_insertion_test.cc b/tensorflow/compiler/xla/service/copy_insertion_test.cc
index cd73525..892d0d7 100644
--- a/tensorflow/compiler/xla/service/copy_insertion_test.cc
+++ b/tensorflow/compiler/xla/service/copy_insertion_test.cc
@@ -2007,5 +2007,46 @@
   InsertCopies(module.get());
 }
 
+TEST_F(CopyInsertionTest, NestedWhiles) {
+  // Verify that no unnecessary copies remain after copy insertion for
+  // trivial nested whiles (b/112472605).
+  const string& hlo_string = R"(
+HloModule TestModule
+
+cond.inner {
+  ROOT param.cond.inner = pred[] parameter(0)
+}
+
+body.inner {
+  param.body.inner = pred[] parameter(0)
+  ROOT neg = pred[] negate(param.body.inner)
+}
+
+cond.outer {
+  ROOT param.cond.outer = pred[] parameter(0)
+}
+
+body.outer {
+  param.cond.outer = pred[] parameter(0)
+  ROOT while = pred[] while(param.cond.outer), condition=cond.inner, body=body.inner
+}
+
+ENTRY TestComputation {
+  entry_param = pred[] parameter(0)
+  ROOT while = pred[] while(entry_param), condition=cond.outer, body=body.outer
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<HloModule> module,
+      HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest()));
+  InsertCopies(module.get());
+
+  // There should only be a single copy inserted, and it's in the entry
+  // computation.
+  EXPECT_EQ(CountCopies(*module), 1);
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              op::While(op::Copy(op::Parameter())));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD
index 504b61d..fe1ef78 100644
--- a/tensorflow/compiler/xla/service/cpu/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/BUILD
@@ -20,7 +20,7 @@
 load("//tensorflow/compiler/xla:xla.bzl", "ORC_JIT_MEMORY_MAPPER_TARGETS")
 load(
     "//third_party/mkl:build_defs.bzl",
-    "if_mkl",
+    "mkl_deps",
 )
 
 # Filegroup used to collect source files for dependency checking.
@@ -55,11 +55,23 @@
 )
 
 cc_library(
+    name = "buffer_info_util",
+    srcs = ["buffer_info_util.cc"],
+    hdrs = ["buffer_info_util.h"],
+    deps = [
+        "//tensorflow/compiler/tf2xla:cpu_function_runtime",
+        "//tensorflow/compiler/xla/service:buffer_assignment",
+        "//tensorflow/core:lib",
+    ],
+)
+
+cc_library(
     name = "cpu_compiler",
     srcs = ["cpu_compiler.cc"],
     hdrs = ["cpu_compiler.h"],
     deps = [
         ":compiler_functor",
+        ":buffer_info_util",
         ":conv_canonicalization",
         ":cpu_copy_insertion",
         ":cpu_executable",
@@ -73,6 +85,8 @@
         ":ir_emitter",
         ":parallel_task_assignment",
         ":simple_orc_jit",
+        "//tensorflow/compiler/tf2xla:cpu_function_runtime",
+        "//tensorflow/compiler/xla/service:scatter_expander",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:protobuf_util",
         "//tensorflow/compiler/xla:status_macros",
@@ -87,6 +101,7 @@
         "//tensorflow/compiler/xla/service:buffer_liveness",
         "//tensorflow/compiler/xla/service:call_inliner",
         "//tensorflow/compiler/xla/service:conditional_simplifier",
+        "//tensorflow/compiler/xla/service:convolution_feature_group_converter",
         "//tensorflow/compiler/xla/service:dot_decomposer",
         "//tensorflow/compiler/xla/service:executable",
         "//tensorflow/compiler/xla/service:flatten_call_graph",
@@ -484,10 +499,7 @@
         "//tensorflow/core:framework_lite",
         "//tensorflow/core/kernels:eigen_helpers",
         "//third_party/eigen3",
-    ] + if_mkl([
-        "@mkl_dnn",
-        "//third_party/mkl:intel_binary_blob",
-    ]),
+    ] + mkl_deps(),
 )
 
 cc_library(
@@ -541,10 +553,7 @@
         "//tensorflow/compiler/xla:executable_run_options",
         "//tensorflow/core:framework_lite",
         "//third_party/eigen3",
-    ] + if_mkl([
-        "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ]),
+    ] + mkl_deps(),
 )
 
 cc_library(
diff --git a/tensorflow/compiler/xla/service/cpu/buffer_info_util.cc b/tensorflow/compiler/xla/service/cpu/buffer_info_util.cc
new file mode 100644
index 0000000..408fe0f
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/buffer_info_util.cc
@@ -0,0 +1,57 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/cpu/buffer_info_util.h"
+
+namespace xla {
+namespace cpu {
+
+using BufferInfo = ::tensorflow::cpu_function_runtime::BufferInfo;
+
+std::vector<BufferInfo> CreateBufferInfosFromBufferAssignment(
+    const BufferAssignment& buffer_assignment) {
+  std::vector<BufferInfo> buffer_infos;
+  for (const BufferAllocation& allocation : buffer_assignment.Allocations()) {
+    if (allocation.is_thread_local()) {
+      buffer_infos.push_back(BufferInfo::MakeOnStackBuffer(allocation.size()));
+    } else if (allocation.is_constant()) {
+      buffer_infos.push_back(BufferInfo::MakeConstant(allocation.size()));
+    } else if (allocation.is_entry_computation_parameter()) {
+      buffer_infos.push_back(BufferInfo::MakeEntryParameter(
+          /*size=*/allocation.size(),
+          /*param_number=*/allocation.parameter_number()));
+    } else {
+      buffer_infos.push_back(BufferInfo::MakeTempBuffer(allocation.size()));
+    }
+  }
+  return buffer_infos;
+}
+
+std::vector<int32> CreateArgIndexTableFromBufferInfos(
+    tensorflow::gtl::ArraySlice<BufferInfo> buffer_infos) {
+  std::vector<int32> result;
+  for (int64 i = 0; i < buffer_infos.size(); i++) {
+    if (buffer_infos[i].is_entry_parameter()) {
+      if (buffer_infos[i].entry_parameter_number() >= result.size()) {
+        result.resize(buffer_infos[i].entry_parameter_number() + 1);
+      }
+      result[buffer_infos[i].entry_parameter_number()] = i;
+    }
+  }
+  return result;
+}
+
+}  // namespace cpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/buffer_info_util.h b/tensorflow/compiler/xla/service/cpu/buffer_info_util.h
new file mode 100644
index 0000000..05de70c
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/buffer_info_util.h
@@ -0,0 +1,42 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_BUFFER_INFO_UTIL_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_BUFFER_INFO_UTIL_H_
+
+#include "tensorflow/compiler/tf2xla/cpu_function_runtime.h"
+#include "tensorflow/compiler/xla/service/buffer_assignment.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+
+namespace xla {
+namespace cpu {
+// Creates and returns a list of BufferInfo instances containing relevant
+// information from `buffer_assignment`.
+std::vector<::tensorflow::cpu_function_runtime::BufferInfo>
+CreateBufferInfosFromBufferAssignment(
+    const BufferAssignment& buffer_assignment);
+
+// Creates and returns a table containing the mapping from entry computation
+// parameters to buffer allocation indices.
+//
+// If this function returns V then entry parameter i has buffer allocation index
+// V[i].
+std::vector<int32> CreateArgIndexTableFromBufferInfos(
+    tensorflow::gtl::ArraySlice<::tensorflow::cpu_function_runtime::BufferInfo>
+        buffer_infos);
+}  // namespace cpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_BUFFER_INFO_UTIL_H_
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
index 8cbe9a1..fde8fbd 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
@@ -50,6 +50,8 @@
 #include "tensorflow/compiler/xla/service/buffer_liveness.h"
 #include "tensorflow/compiler/xla/service/call_inliner.h"
 #include "tensorflow/compiler/xla/service/conditional_simplifier.h"
+#include "tensorflow/compiler/xla/service/convolution_feature_group_converter.h"
+#include "tensorflow/compiler/xla/service/cpu/buffer_info_util.h"
 #include "tensorflow/compiler/xla/service/cpu/compiler_functor.h"
 #include "tensorflow/compiler/xla/service/cpu/conv_canonicalization.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_copy_insertion.h"
@@ -87,6 +89,7 @@
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
 #include "tensorflow/compiler/xla/service/reduce_precision_insertion.h"
 #include "tensorflow/compiler/xla/service/reshape_mover.h"
+#include "tensorflow/compiler/xla/service/scatter_expander.h"
 #include "tensorflow/compiler/xla/service/transpose_folding.h"
 #include "tensorflow/compiler/xla/service/tuple_simplifier.h"
 #include "tensorflow/compiler/xla/service/while_loop_constant_sinking.h"
@@ -103,6 +106,7 @@
 
 namespace xla {
 namespace cpu {
+using BufferInfo = ::tensorflow::cpu_function_runtime::BufferInfo;
 
 CpuAotCompilationOptions::CpuAotCompilationOptions(
     string triple, string cpu_name, string features, string entry_point_name,
@@ -120,11 +124,11 @@
 }
 
 CpuAotCompilationResult::CpuAotCompilationResult(
-    ObjectFileData object_file_data, BufferSizes buffer_sizes,
+    ObjectFileData object_file_data, std::vector<BufferInfo> buffer_infos,
     int64 result_buffer_index,
     std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data)
     : object_file_data_(std::move(object_file_data)),
-      buffer_sizes_(std::move(buffer_sizes)),
+      buffer_infos_(std::move(buffer_infos)),
       result_buffer_index_(result_buffer_index),
       hlo_profile_printer_data_(std::move(hlo_profile_printer_data)) {}
 
@@ -255,6 +259,7 @@
   pipeline.AddPass<CallInliner>();
   pipeline.AddPass<BatchDotSimplification>();
   pipeline.AddPass<DotDecomposer>();
+  pipeline.AddPass<ConvolutionFeatureGroupConverter>();
   pipeline.AddPass<ConvCanonicalization>(&target_machine_features);
   {
     auto& pass =
@@ -273,7 +278,7 @@
 
     // BatchNormExpander can create zero-sized ops, so zero-sized HLO
     // elimination has to come after that pass.
-    pipeline.AddPass<ZeroSizedHloElimination>();
+    pass.AddPass<ZeroSizedHloElimination>();
 
     pass.AddPass<WhileLoopInvariantCodeMotion>();
     pass.AddPass<TupleSimplifier>();
@@ -297,6 +302,8 @@
   pipeline.AddPass<HloCSE>(/*is_layout_sensitive=*/false);
   pipeline.AddPass<CpuInstructionFusion>();
 
+  pipeline.AddPass<ScatterExpander>();
+
   ReducePrecisionInsertion::AddPasses(
       &pipeline, module->config().debug_options(),
       ReducePrecisionInsertion::PassTiming::AFTER_FUSION);
@@ -354,7 +361,7 @@
   llvm::TargetOptions target_options;
   llvm_ir::SetTargetOptions(
       /*fast_math_enabled=*/module_config.debug_options()
-          .xla_enable_fast_math(),
+          .xla_cpu_enable_fast_math(),
       &target_options);
   return target_options;
 }
@@ -521,7 +528,7 @@
       CompilerTargetOptions(module->config()),
       CodeGenOptLevel(module->config()),
       options::OptimizeForSizeRequested(module->config()),
-      module->config().debug_options().xla_enable_fast_math(),
+      module->config().debug_options().xla_cpu_enable_fast_math(),
       module->config().debug_options().xla_llvm_disable_expensive_passes(),
       pre_optimization_ir_hook, post_optimization_ir_hook);
   llvm_module->setDataLayout(jit->data_layout());
@@ -651,9 +658,9 @@
   // so we bail if the configs have conflicting flags. At the moment, the only
   // flag that needs to be consistent is fast-math.
   const bool fast_math_enabled =
-      modules[0]->config().debug_options().xla_enable_fast_math();
+      modules[0]->config().debug_options().xla_cpu_enable_fast_math();
   for (const auto& module : modules) {
-    if (module->config().debug_options().xla_enable_fast_math() !=
+    if (module->config().debug_options().xla_cpu_enable_fast_math() !=
         fast_math_enabled) {
       return InvalidArgument(
           "All HLO module configs must have the same value for "
@@ -830,7 +837,7 @@
     CompilerFunctor compiler_functor(
         target_machine.get(), &disassembler, opt_level,
         options::OptimizeForSizeRequested(module->config()),
-        module->config().debug_options().xla_enable_fast_math(),
+        module->config().debug_options().xla_cpu_enable_fast_math(),
         module->config().debug_options().xla_llvm_disable_expensive_passes(),
         pre_optimization_ir_dump_hook, post_optimization_ir_dump_hook);
     std::unique_ptr<llvm::MemoryBuffer> object_file =
@@ -838,39 +845,14 @@
     ObjectFileData object_file_data(object_file->getBufferStart(),
                                     object_file->getBufferEnd());
 
-    BufferSizes buffer_sizes;
-    for (const BufferAllocation& allocation : assignment->Allocations()) {
-      // Callers don't need to allocate anything for thread-local temporary
-      // buffers.  They are lowered to allocas.
-      if (allocation.is_thread_local()) {
-        buffer_sizes.push_back(-1);
-        continue;
-      }
-
-      // Callers don't need to allocate anything for constant buffers.  They are
-      // lowered to globals.
-      if (allocation.is_constant()) {
-        buffer_sizes.push_back(-1);
-        continue;
-      }
-
-      // Callers don't need to allocate anything for entry computation buffers,
-      // but they do need to stash the pointer to the entry computation buffer
-      // in the temp buffer table.  See the comment on
-      // XlaCompiledCpuFunction::StaticData::temp_sizes.
-      if (allocation.is_entry_computation_parameter()) {
-        buffer_sizes.push_back(-allocation.parameter_number() - 2);
-        continue;
-      }
-
-      buffer_sizes.push_back(allocation.size());
-    }
+    std::vector<BufferInfo> buffer_infos =
+        CreateBufferInfosFromBufferAssignment(*assignment);
 
     TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice result_slice,
                         assignment->GetUniqueTopLevelOutputSlice());
 
     results.emplace_back(MakeUnique<CpuAotCompilationResult>(
-        std::move(object_file_data), std::move(buffer_sizes),
+        std::move(object_file_data), std::move(buffer_infos),
         result_slice.index(), std::move(hlo_profile_printer_data)));
   }
 
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h
index e56f9f0..04e1c48 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h
@@ -19,6 +19,7 @@
 #include <memory>
 
 #include "llvm/Target/TargetMachine.h"
+#include "tensorflow/compiler/tf2xla/cpu_function_runtime.h"
 #include "tensorflow/compiler/xla/service/executable.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/llvm_compiler.h"
@@ -78,7 +79,8 @@
 class CpuAotCompilationResult : public AotCompilationResult {
  public:
   CpuAotCompilationResult(
-      ObjectFileData object_file_data, BufferSizes buffer_sizes,
+      ObjectFileData object_file_data,
+      std::vector<::tensorflow::cpu_function_runtime::BufferInfo> buffer_infos,
       int64 result_buffer_index,
       std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data);
   ~CpuAotCompilationResult();
@@ -88,17 +90,20 @@
   }
 
   const ObjectFileData& object_file_data() const { return object_file_data_; }
-  const BufferSizes& buffer_sizes() const { return buffer_sizes_; }
+  const std::vector<::tensorflow::cpu_function_runtime::BufferInfo>&
+  buffer_infos() const {
+    return buffer_infos_;
+  }
   int64 result_buffer_index() const { return result_buffer_index_; }
 
  private:
   // Contains the compiled computation: an object file.
   const ObjectFileData object_file_data_;
 
-  // The list of buffer sizes which should be allocated in order to execute the
-  // compiled computation.  These buffers are used for temporary buffers used
-  // ephemerally during computation as well as the output result.
-  const BufferSizes buffer_sizes_;
+  // A list of BufferInfo objects describing the buffers used by the XLA
+  // computation.
+  const std::vector<::tensorflow::cpu_function_runtime::BufferInfo>
+      buffer_infos_;
 
   // Contains which buffer index into |buffer_sizes| was designated to the
   // result of the computation.  This buffer should be passed into the output
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
index 946f512..c376864 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
@@ -249,24 +249,11 @@
     const ServiceExecutableRunOptions* run_options,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
     HloExecutionProfile* hlo_execution_profile) {
-  if (GetRootPointsToSet().IsAmbiguous()) {
-    return Unimplemented("Points-to set of root instruction is ambiguous");
-  }
-
-  se::Stream* stream = run_options->stream();
-  DeviceMemoryAllocator* memory_allocator = run_options->allocator();
-
-  std::vector<OwningDeviceMemory> owning_buffers;
-  std::vector<se::DeviceMemoryBase> unowning_buffers;
   TF_ASSIGN_OR_RETURN(
-      std::tie(unowning_buffers, owning_buffers),
-      CreateTempArray(memory_allocator, stream->parent()->device_ordinal(),
-                      arguments));
-
-  TF_RETURN_IF_ERROR(ExecuteComputeFunction(
-      &run_options->run_options(), unowning_buffers, hlo_execution_profile));
-
-  return CreateResultShapedBuffer(run_options, &owning_buffers);
+      auto result,
+      ExecuteAsyncOnStreamImpl(run_options, arguments, hlo_execution_profile));
+  TF_RETURN_IF_ERROR(run_options->stream()->BlockHostUntilDone());
+  return std::move(result);
 }
 
 StatusOr<ScopedShapedBuffer> CpuExecutable::ExecuteAsyncOnStream(
@@ -277,6 +264,16 @@
         "Asynchronous execution on stream with hlo profiling is not yet "
         "supported on CPU.");
   }
+  return ExecuteAsyncOnStreamImpl(run_options, arguments, nullptr);
+}
+
+StatusOr<ScopedShapedBuffer> CpuExecutable::ExecuteAsyncOnStreamImpl(
+    const ServiceExecutableRunOptions* run_options,
+    tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
+    HloExecutionProfile* hlo_execution_profile) {
+  if (GetRootPointsToSet().IsAmbiguous()) {
+    return Unimplemented("Points-to set of root instruction is ambiguous");
+  }
 
   auto* host_stream = dynamic_cast<se::host::HostStream*>(
       run_options->stream()->implementation());
@@ -310,19 +307,20 @@
     ServiceExecutableRunOptions run_options;
     std::vector<se::DeviceMemoryBase> unowning_buffers;
     std::shared_ptr<std::vector<OwningDeviceMemory>> buffers;
+    HloExecutionProfile* hlo_execution_profile;
 
     void operator()() {
       // Failing a CHECK here is not great, but I don't see an obvious way to
       // return a failed Status asynchronously.
       TF_CHECK_OK(executable->ExecuteComputeFunction(
-          &run_options.run_options(), unowning_buffers,
-          /*hlo_execution_profile=*/nullptr));
+          &run_options.run_options(), unowning_buffers, hlo_execution_profile));
     }
   };
   host_stream->EnqueueTask(
       AsyncRunTask{this, *run_options, std::move(unowning_buffers),
                    std::make_shared<std::vector<OwningDeviceMemory>>(
-                       std::move(owning_buffers))});
+                       std::move(owning_buffers)),
+                   hlo_execution_profile});
 
   return std::move(result);
 }
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.h b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
index 8af8a5d..96e53de 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
@@ -85,6 +85,16 @@
   const BufferAssignment& buffer_assignment() const { return *assignment_; }
 
  private:
+  // This is for sharing the code between ExecuteOnStream and
+  // ExecuteAsyncOnStream.
+  //
+  // Notice that it's tricky to use correctly, as the profile object (when it
+  // exists) must out-live the task.
+  StatusOr<ScopedShapedBuffer> ExecuteAsyncOnStreamImpl(
+      const ServiceExecutableRunOptions* run_options,
+      tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
+      HloExecutionProfile* hlo_execution_profile);
+
   // Creates an array suitable for passing as the "temps" argument to the JIT
   // compiled function pointer.
   //
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc
index 991b14f..e6130c7 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc
@@ -697,8 +697,9 @@
       HloInstruction::CreateBinary(dot_shape, HloOpcode::kAdd, dot, addend));
 
   if (add_extra_use_for_dot) {
+    auto* token = builder.AddInstruction(HloInstruction::CreateToken());
     builder.AddInstruction(
-        HloInstruction::CreateOutfeed(dot_shape, dot, "no_config"));
+        HloInstruction::CreateOutfeed(dot_shape, dot, token, "no_config"));
   }
 
   module->AddEntryComputation(builder.Build());
@@ -791,11 +792,11 @@
   operand = s32[3,3] parameter(0)
   indices = s32[2] parameter(1)
   gather = s32[3,2] gather(operand, indices),
-      output_window_dims={0},
-      elided_window_dims={1},
-      gather_dims_to_operand_dims={1},
+      offset_dims={0},
+      collapsed_slice_dims={1},
+      start_index_map={1},
       index_vector_dim=1,
-      window_bounds={3, 1}
+      slice_sizes={3, 1}
   one = s32[] constant(1)
   one_broadcasted = s32[3,2] broadcast(one), dimensions={}
   ROOT result = s32[3,2]{1,0} add(gather, one_broadcasted)
@@ -807,11 +808,11 @@
   operand = s32[3,3] parameter(0)
   indices = s32[2,2] parameter(1)
   gather = s32[2,3,2] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={1},
-      gather_dims_to_operand_dims={1},
+      offset_dims={1},
+      collapsed_slice_dims={1},
+      start_index_map={1},
       index_vector_dim=2,
-      window_bounds={3, 1}
+      slice_sizes={3, 1}
   one = s32[] constant(1)
   one_broadcasted = s32[2,3,2] broadcast(one), dimensions={}
   ROOT result = s32[2,3,2]{2,1,0} add(gather, one_broadcasted)
@@ -823,11 +824,11 @@
   operand = s32[3,3] parameter(0)
   indices = s32[2,2,2] parameter(1)
   gather = s32[2,2] gather(operand, indices),
-      output_window_dims={},
-      elided_window_dims={0,1},
-      gather_dims_to_operand_dims={0,1},
+      offset_dims={},
+      collapsed_slice_dims={0,1},
+      start_index_map={0,1},
       index_vector_dim=2,
-      window_bounds={1, 1}
+      slice_sizes={1, 1}
   one = s32[] constant(1)
   one_broadcasted = s32[2,2] broadcast(one), dimensions={}
   ROOT result = s32[2,2]{1,0} add(gather, one_broadcasted)
@@ -839,11 +840,11 @@
   operand = s32[3,3,2] parameter(0)
   indices = s32[2,2] parameter(1)
   gather = s32[2,2] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0,1},
-      gather_dims_to_operand_dims={0,1},
+      offset_dims={1},
+      collapsed_slice_dims={0,1},
+      start_index_map={0,1},
       index_vector_dim=1,
-      window_bounds={1,1,2}
+      slice_sizes={1,1,2}
   one = s32[] constant(1)
   one_broadcasted = s32[2,2] broadcast(one), dimensions={}
   ROOT result = s32[2,2]{1,0} add(gather, one_broadcasted)
@@ -855,11 +856,11 @@
   operand = s32[3,3,2] parameter(0)
   indices = s32[2,2] parameter(1)
   gather = s32[2,2] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0,1},
-      gather_dims_to_operand_dims={0,1},
+      offset_dims={1},
+      collapsed_slice_dims={0,1},
+      start_index_map={0,1},
       index_vector_dim=0,
-      window_bounds={1,1,2}
+      slice_sizes={1,1,2}
   one = s32[] constant(1)
   one_broadcasted = s32[2,2] broadcast(one), dimensions={}
   ROOT result = s32[2,2]{1,0} add(gather, one_broadcasted)
@@ -871,11 +872,11 @@
   operand = s32[3,3] parameter(0)
   indices = s32[2] parameter(1)
   gather = s32[1,1] gather(operand, indices),
-      output_window_dims={0,1},
-      elided_window_dims={},
-      gather_dims_to_operand_dims={0,1},
+      offset_dims={0,1},
+      collapsed_slice_dims={},
+      start_index_map={0,1},
       index_vector_dim=0,
-      window_bounds={1,1}
+      slice_sizes={1,1}
   one = s32[] constant(1)
   one_broadcasted = s32[1,1] broadcast(one), dimensions={}
   ROOT result = s32[1,1]{1,0} add(gather, one_broadcasted)
@@ -887,11 +888,11 @@
   operand = s32[3,3] parameter(0)
   indices = s32[2,2] parameter(1)
   gather = s32[2,1,1] gather(operand, indices),
-      output_window_dims={1,2},
-      elided_window_dims={},
-      gather_dims_to_operand_dims={0,1},
+      offset_dims={1,2},
+      collapsed_slice_dims={},
+      start_index_map={0,1},
       index_vector_dim=0,
-      window_bounds={1,1}
+      slice_sizes={1,1}
   one = s32[] constant(1)
   one_broadcasted = s32[2,1,1] broadcast(one), dimensions={}
   ROOT result = s32[2,1,1]{2,1,0} add(gather, one_broadcasted)
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc
index 156166b..59bc7e0 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc
@@ -173,7 +173,7 @@
 
 Status CpuTransferManager::TransferLiteralFromOutfeed(
     se::StreamExecutor* executor, const Shape& literal_shape,
-    Literal* literal) {
+    MutableBorrowingLiteral literal) {
   if (!ShapeUtil::IsTuple(literal_shape)) {
     int64 size = GetByteSizeRequirement(literal_shape);
     // Note: OSS build didn't like implicit conversion from
@@ -181,18 +181,16 @@
     tensorflow::gtl::ArraySlice<int64> dimensions(
         tensorflow::bit_cast<const int64*>(literal_shape.dimensions().data()),
         literal_shape.dimensions().size());
-    *literal = std::move(*LiteralUtil::CreateFromDimensions(
-        literal_shape.element_type(), dimensions));
-    TF_ASSIGN_OR_RETURN(Shape received_shape,
-                        TransferArrayBufferFromOutfeed(
-                            executor, literal->untyped_data(), size));
-    TF_RET_CHECK(ShapeUtil::Compatible(received_shape, literal->shape()))
+    TF_ASSIGN_OR_RETURN(
+        Shape received_shape,
+        TransferArrayBufferFromOutfeed(executor, literal.untyped_data(), size));
+    TF_RET_CHECK(ShapeUtil::Compatible(received_shape, literal.shape()))
         << "Shape received from outfeed "
         << ShapeUtil::HumanString(received_shape)
         << " did not match the shape that was requested for outfeed: "
         << ShapeUtil::HumanString(literal_shape);
     TF_RET_CHECK(size == GetByteSizeRequirement(received_shape));
-    *literal->mutable_shape_do_not_use() = received_shape;
+    *literal.mutable_shape_do_not_use() = received_shape;
     return Status::OK();
   }
 
@@ -201,22 +199,12 @@
         "Nested tuple outfeeds are not yet implemented on CPU.");
   }
 
-  std::vector<std::unique_ptr<Literal>> elements;
   std::vector<std::pair<void*, int64>> buffer_data;
   for (int64 i = 0; i < literal_shape.tuple_shapes_size(); ++i) {
     const Shape& tuple_element_shape =
         ShapeUtil::GetTupleElementShape(literal_shape, i);
-    // Note: OSS build didn't like implicit conversion from
-    // literal_shape.dimensions() to the array slice on 2017-07-10.
-    tensorflow::gtl::ArraySlice<int64> dimensions(
-        tensorflow::bit_cast<const int64*>(
-            tuple_element_shape.dimensions().data()),
-        tuple_element_shape.dimensions().size());
-    auto empty = LiteralUtil::CreateFromDimensions(
-        tuple_element_shape.element_type(), dimensions);
     int64 size = GetByteSizeRequirement(tuple_element_shape);
-    buffer_data.push_back({empty->untyped_data(), size});
-    elements.push_back(std::move(empty));
+    buffer_data.push_back({literal.untyped_data({i}), size});
   }
 
   TF_ASSIGN_OR_RETURN(Shape received_shape,
@@ -230,11 +218,7 @@
   TF_RET_CHECK(GetByteSizeRequirement(literal_shape) ==
                GetByteSizeRequirement(received_shape));
 
-  for (int64 i = 0; i < literal_shape.tuple_shapes_size(); ++i) {
-    *elements[i]->mutable_shape_do_not_use() = received_shape.tuple_shapes(i);
-  }
-  *literal = std::move(*LiteralUtil::MakeTupleOwned(std::move(elements)));
-  TF_RET_CHECK(ShapeUtil::Equal(literal->shape(), literal_shape));
+  TF_RET_CHECK(ShapeUtil::Equal(literal.shape(), literal_shape));
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.h b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.h
index 593575c..80ef953 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.h
@@ -18,6 +18,7 @@
 
 #include <vector>
 
+#include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/cpu/xfeed_manager.h"
 #include "tensorflow/compiler/xla/service/generic_transfer_manager.h"
 #include "tensorflow/compiler/xla/service/transfer_manager.h"
@@ -41,7 +42,7 @@
                                  const LiteralSlice& literal) override;
   Status TransferLiteralFromOutfeed(se::StreamExecutor* executor,
                                     const Shape& literal_shape,
-                                    Literal* literal) override;
+                                    MutableBorrowingLiteral literal) override;
 
  private:
   Status TransferBufferToInfeed(se::StreamExecutor* executor, int64 size,
diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
index 645888d..f2ac742 100644
--- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
@@ -1066,7 +1066,7 @@
           << config.GetCacheKey();
 
   const bool enable_fast_math =
-      hlo_module_config_.debug_options().xla_enable_fast_math();
+      hlo_module_config_.debug_options().xla_cpu_enable_fast_math();
   const bool optimize_for_size =
       options::OptimizeForSizeRequested(hlo_module_config_);
 
@@ -1149,7 +1149,7 @@
       swap_operands ? lhs_array_.GetBasePointer() : rhs_array_.GetBasePointer();
 
   const bool enable_fast_math =
-      hlo_module_config_.debug_options().xla_enable_fast_math();
+      hlo_module_config_.debug_options().xla_cpu_enable_fast_math();
   const bool optimize_for_size =
       options::OptimizeForSizeRequested(hlo_module_config_);
 
diff --git a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc
index c13d367..db54454 100644
--- a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc
@@ -30,47 +30,6 @@
 namespace xla {
 namespace cpu {
 
-StatusOr<llvm::Value*> CpuElementalIrEmitter::EmitFloatUnaryOp(
-    const HloInstruction* op, llvm::Value* operand_value) const {
-  switch (op->opcode()) {
-    case HloOpcode::kTanh: {
-      PrimitiveType element_type = op->shape().element_type();
-      bool cast_result_to_fp16 = false;
-      string function_name;
-      switch (element_type) {
-        case F16:
-          cast_result_to_fp16 = true;
-          operand_value = b_->CreateFPCast(operand_value, b_->getFloatTy());
-          TF_FALLTHROUGH_INTENDED;
-        case F32:
-          function_name = "tanhf";
-          break;
-        case F64:
-          function_name = "tanh";
-          break;
-        default:
-          return Unimplemented("tanh");
-      }
-      // Create a function declaration.
-      llvm::Function* function =
-          llvm::cast<llvm::Function>(module_->getOrInsertFunction(
-              llvm_ir::AsStringRef(function_name), operand_value->getType(),
-              operand_value->getType()));
-      function->setCallingConv(llvm::CallingConv::C);
-      function->setDoesNotThrow();
-      function->setDoesNotAccessMemory();
-      // Create an instruction to call the function.
-      llvm::Value* result = b_->CreateCall(function, operand_value);
-      if (cast_result_to_fp16) {
-        result = b_->CreateFPCast(result, b_->getHalfTy());
-      }
-      return result;
-    }
-    default:
-      return ElementalIrEmitter::EmitFloatUnaryOp(op, operand_value);
-  }
-}
-
 StatusOr<llvm::Value*> CpuElementalIrEmitter::EmitAtan2(
     PrimitiveType prim_type, llvm::Value* lhs, llvm::Value* rhs) const {
   string function_name;
@@ -106,6 +65,39 @@
   return result;
 }
 
+StatusOr<llvm::Value*> CpuElementalIrEmitter::EmitTanh(
+    PrimitiveType prim_type, llvm::Value* value) const {
+  bool cast_result_to_fp16 = false;
+  string function_name;
+  switch (prim_type) {
+    case F16:
+      cast_result_to_fp16 = true;
+      value = b_->CreateFPCast(value, b_->getFloatTy());
+      TF_FALLTHROUGH_INTENDED;
+    case F32:
+      function_name = "tanhf";
+      break;
+    case F64:
+      function_name = "tanh";
+      break;
+    default:
+      return Unimplemented("tanh");
+  }
+  // Create a function declaration.
+  llvm::Function* function = llvm::cast<llvm::Function>(
+      module_->getOrInsertFunction(llvm_ir::AsStringRef(function_name),
+                                   value->getType(), value->getType()));
+  function->setCallingConv(llvm::CallingConv::C);
+  function->setDoesNotThrow();
+  function->setDoesNotAccessMemory();
+  // Create an instruction to call the function.
+  llvm::Value* result = b_->CreateCall(function, value);
+  if (cast_result_to_fp16) {
+    result = b_->CreateFPCast(result, b_->getHalfTy());
+  }
+  return result;
+}
+
 llvm_ir::ElementGenerator CpuElementalIrEmitter::MakeElementGenerator(
     const HloInstruction* hlo,
     const HloToElementGeneratorMap& operand_to_generator) const {
diff --git a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h
index 9598a88..76833e7 100644
--- a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h
@@ -39,10 +39,10 @@
       const HloToElementGeneratorMap& operand_to_generator) const override;
 
  protected:
-  StatusOr<llvm::Value*> EmitFloatUnaryOp(
-      const HloInstruction* op, llvm::Value* operand_value) const override;
   StatusOr<llvm::Value*> EmitAtan2(PrimitiveType prim_type, llvm::Value* lhs,
                                    llvm::Value* rhs) const override;
+  StatusOr<llvm::Value*> EmitTanh(PrimitiveType prim_type,
+                                  llvm::Value* value) const override;
 
   IrEmitter* ir_emitter_;
 };
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
index ca645d3..6f433b4 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
@@ -99,7 +99,7 @@
       target_machine_features_(*target_machine_features) {
   b_.setFastMathFlags(llvm_ir::GetFastMathFlags(
       /*fast_math_enabled=*/hlo_module_config_.debug_options()
-          .xla_enable_fast_math()));
+          .xla_cpu_enable_fast_math()));
 }
 
 StatusOr<llvm::Function*> IrEmitter::EmitComputation(
@@ -158,11 +158,11 @@
       is_top_level_computation_ ? llvm::GlobalValue::ExternalLinkage
                                 : llvm::GlobalValue::InternalLinkage;
   // Create and initialize new IrFunction.
-  compute_function_.reset(
-      new IrFunction(function_name, linkage,
-                     options::OptimizeForSizeRequested(hlo_module_config_),
-                     hlo_module_config_.debug_options().xla_enable_fast_math(),
-                     module_, &b_, num_dynamic_loop_bounds_));
+  compute_function_.reset(new IrFunction(
+      function_name, linkage,
+      options::OptimizeForSizeRequested(hlo_module_config_),
+      hlo_module_config_.debug_options().xla_cpu_enable_fast_math(), module_,
+      &b_, num_dynamic_loop_bounds_));
 }
 
 IrEmitter::~IrEmitter() {}
@@ -577,7 +577,7 @@
   TF_RETURN_IF_ERROR(ElementTypesSameAndSupported(
       /*instruction=*/*reduce_window,
       /*operands=*/{reduce_window->operand(0)},
-      /*supported_types=*/{F32, BF16, S32}));
+      /*supported_types=*/{F32, BF16, S32, F16}));
 
   // TODO(b/31410564): Implement dilation for reduce-window.
   if (window_util::HasDilation(reduce_window->window())) {
@@ -1756,6 +1756,10 @@
 }
 
 Status IrEmitter::HandleReduce(HloInstruction* reduce) {
+  // TODO(b/112040122): Support variadic reduce.
+  if (!ShapeUtil::IsArray(reduce->shape())) {
+    return Unimplemented("Variadic reduce is not supported on CPU");
+  }
   auto arg = reduce->mutable_operand(0);
   auto init_value = reduce->mutable_operand(1);
   gtl::ArraySlice<int64> dimensions(reduce->dimensions());
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc
index 36c9f74..ee272b5 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc
@@ -110,9 +110,10 @@
   const string hlo_string = R"(
     HloModule TestTaskParallel_infeed_outfeed
     ENTRY InfeedOutfeed {
-      infeed0 = (u32[12345678,2]{1,0}, token[]) infeed()
+      token = token[] after-all()
+      infeed0 = (u32[12345678,2]{1,0}, token[]) infeed(token)
       infeed0.data = u32[12345678,2]{1,0} get-tuple-element((u32[12345678,2]{1,0}, token[]) infeed0), index=0
-      ROOT outfeed0 = token[] outfeed(infeed0.data)
+      ROOT outfeed0 = token[] outfeed(infeed0.data, token)
     }
   )";
 
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.cc b/tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.cc
index 997fdd2..8dc5f3c 100644
--- a/tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.cc
+++ b/tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.cc
@@ -13,7 +13,7 @@
 limitations under the License.
 ==============================================================================*/
 
-#if defined(INTEL_MKL) && !defined(DO_NOT_USE_ML)
+#if defined(INTEL_MKL) && !defined(INTEL_MKL_DNN_ONLY)
 #include "tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h"
 #include "third_party/intel_mkl_ml/include/mkl_cblas.h"
 #include "third_party/intel_mkl_ml/include/mkl_service.h"
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc
index 90b99c8..3b87683 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc
@@ -38,7 +38,8 @@
 
 while_cond {
   arg_cond = f32[2,3,2] parameter(0)
-  infeed = (pred[], token[]) infeed()
+  token = token[] after-all()
+  infeed = (pred[], token[]) infeed(token)
   ROOT unknown = pred[] get-tuple-element((pred[], token[]) infeed), index=0
 }
 
@@ -50,8 +51,9 @@
      {{2, 1}, {2001, 3002}, {2001, 2002}}})
   const_b = f32[2,3,2] while(f32[2,3,2] const_a), condition=while_cond, body=while_body
 
-  out0 = token[] outfeed(f32[2,3,2] const_a)
-  ROOT out1 = token[] outfeed(f32[2,3,2] const_b)
+  token = token[] after-all()
+  out0 = token[] outfeed(f32[2,3,2] const_a, token[] token)
+  ROOT out1 = token[] outfeed(f32[2,3,2] const_b, token[] token)
 }
 )";
 
@@ -85,7 +87,8 @@
 
 while_cond {
   arg_cond = (f32[2,1]{1,0}, f32[1]{0}) parameter(0)
-  infeed = (pred[], token[]) infeed()
+  token = token[] after-all()
+  infeed = (pred[], token[]) infeed(token)
   ROOT unknown = pred[] get-tuple-element((pred[], token[]) infeed), index=0
 }
 
@@ -94,8 +97,9 @@
   const_a = (f32[2,1]{1,0}, f32[1]{0}) constant((f32[2,1], f32[1]) ( f32[2,1] { { 1 }, { 2 } }, {2} ))
   const_b = (f32[2,1]{1,0}, f32[1]{0}) while((f32[2,1]{1,0}, f32[1]{0}) const_a), condition=while_cond, body=while_body
 
-  out0 = () outfeed((f32[2,1]{1,0}, f32[1]{0}) const_a)
-  ROOT out1 = () outfeed((f32[2,1]{1,0}, f32[1]{0}) const_b)
+  token = token[] after-all()
+  out0 = () outfeed((f32[2,1]{1,0}, f32[1]{0}) const_a, token[] token)
+  ROOT out1 = () outfeed((f32[2,1]{1,0}, f32[1]{0}) const_b, token[] token)
 }
 )";
 
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_outfeed_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_outfeed_test.cc
index dac416e..780c07f 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_outfeed_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_outfeed_test.cc
@@ -32,7 +32,8 @@
     {{{1, 2}, {1001, 1002}, {2001, 2002}},
      {{2, 1}, {2001, 3002}, {2001, 2002}}})
 
-  outfeed = token[] outfeed(f32[2,3,2] const_a)
+  token = token[] after-all()
+  outfeed = token[] outfeed(f32[2,3,2] const_a, token)
   ROOT root = () tuple()
 }
 )";
diff --git a/tensorflow/compiler/xla/service/despecializer.cc b/tensorflow/compiler/xla/service/despecializer.cc
index d938f3a..48e4471 100644
--- a/tensorflow/compiler/xla/service/despecializer.cc
+++ b/tensorflow/compiler/xla/service/despecializer.cc
@@ -21,8 +21,33 @@
 
 namespace xla {
 
+namespace {
+
+// Pass which strips control dependencies from all instructions in the module.
+class ControlDepRemover : public HloPassInterface {
+ public:
+  ControlDepRemover() = default;
+  tensorflow::StringPiece name() const override {
+    return "control-dep-remover";
+  }
+
+  StatusOr<bool> Run(HloModule* module) override {
+    bool changed = false;
+    for (HloComputation* computation : module->computations()) {
+      for (HloInstruction* instruction : computation->instructions()) {
+        changed = changed || !instruction->control_predecessors().empty();
+        TF_RETURN_IF_ERROR(instruction->DropAllControlDeps());
+      }
+    }
+    return changed;
+  }
+};
+
+}  // namespace
+
 Despecializer::Despecializer() : pipeline_("despecializer") {
   // TODO(b/70588125): Also deal with window reversal in a fast way.
+  pipeline_.AddPass<ControlDepRemover>();
   pipeline_.AddPass<Defuser>();
   pipeline_.AddPass<ImplicitBroadcastRemover>();
   pipeline_.AddPass<BFloat16MixedPrecisionRemoval>();
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
index 9f86749..86d5758 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
@@ -106,6 +106,7 @@
   virtual Status HandleConvolution(HloInstructionPtr hlo) = 0;
   virtual Status HandleFft(HloInstructionPtr fft) = 0;
   virtual Status HandleCrossReplicaSum(HloInstructionPtr hlo) = 0;
+  virtual Status HandleAllToAll(HloInstructionPtr hlo) = 0;
   virtual Status HandleCompare(HloInstructionPtr hlo) {
     return HandleElementwiseBinary(hlo);
   }
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
index ae8a066..617a5a2 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
@@ -94,6 +94,9 @@
   Status HandleCrossReplicaSum(HloInstructionPtr crs) override {
     return DefaultAction(crs);
   }
+  Status HandleAllToAll(HloInstructionPtr crs) override {
+    return DefaultAction(crs);
+  }
   Status HandleRng(HloInstructionPtr random) override {
     return DefaultAction(random);
   }
diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
index 6aab317..891ae42 100644
--- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
@@ -431,6 +431,8 @@
       return EmitCos(op->shape().element_type(), operand_value);
     case HloOpcode::kSin:
       return EmitSin(op->shape().element_type(), operand_value);
+    case HloOpcode::kTanh:
+      return EmitTanh(op->shape().element_type(), operand_value);
     case HloOpcode::kFloor:
       return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::floor,
                                           {operand_value},
@@ -1060,6 +1062,11 @@
   return Unimplemented("atan2");
 }
 
+StatusOr<llvm::Value*> ElementalIrEmitter::EmitTanh(PrimitiveType prim_type,
+                                                    llvm::Value* value) const {
+  return Unimplemented("tanh");
+}
+
 StatusOr<llvm::Value*> ElementalIrEmitter::EmitReducePrecision(
     const HloInstruction* hlo, llvm::Value* x) const {
   if (hlo->operand(0)->shape().element_type() != F32) {
@@ -1665,22 +1672,21 @@
   std::vector<int64> operand_to_output_dim(operand_shape.dimensions_size(), -1);
   for (int64 i = 0, e = operand_shape.dimensions_size(), operand_index_dim = 0;
        i < e; i++) {
-    if (c_binary_search(dim_numbers.elided_window_dims(), i)) {
+    if (c_binary_search(dim_numbers.collapsed_slice_dims(), i)) {
       operand_index.push_back(index.GetConstantWithIndexType(0));
     } else {
-      int64 output_window_dim =
-          dim_numbers.output_window_dims(operand_index_dim++);
+      int64 output_window_dim = dim_numbers.offset_dims(operand_index_dim++);
       operand_to_output_dim[i] = output_window_dim;
       operand_index.push_back(index[output_window_dim]);
     }
   }
 
-  // This is the index of the index vector in the gather_indices tensor.
+  // This is the index of the index vector in the start_indices tensor.
   IrArray::Index gather_index_index(index_type);
   {
     std::vector<llvm::Value*> gather_index_index_components;
     for (int64 i = 0, e = output_shape.dimensions_size(); i < e; i++) {
-      if (!c_binary_search(dim_numbers.output_window_dims(), i)) {
+      if (!c_binary_search(dim_numbers.offset_dims(), i)) {
         gather_index_index.push_back(index[i]);
       }
     }
@@ -1693,7 +1699,7 @@
   auto add_to_operand_index = [&](llvm::Value* index_component, int64 dim) {
     llvm::Value* gather_dim_component_extended =
         b_->CreateSExtOrTrunc(index_component, index_type);
-    int64 operand_dim = dim_numbers.gather_dims_to_operand_dims(dim);
+    int64 operand_dim = dim_numbers.start_index_map(dim);
     int64 output_dim = operand_to_output_dim[operand_dim];
     // If 'output_dim' is -1, it means 'operand_dim' is an elided window dim.
     // This means we set the iteration index to 0, so for the purpose of the
diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/elemental_ir_emitter.h
index fcb3455..1598a4d 100644
--- a/tensorflow/compiler/xla/service/elemental_ir_emitter.h
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.h
@@ -122,6 +122,9 @@
                                          llvm::Value* lhs,
                                          llvm::Value* rhs) const;
 
+  virtual StatusOr<llvm::Value*> EmitTanh(PrimitiveType prim_type,
+                                          llvm::Value* value) const;
+
   virtual StatusOr<llvm::Value*> EmitReducePrecision(const HloInstruction* hlo,
                                                      llvm::Value* x) const;
 
diff --git a/tensorflow/compiler/xla/service/gather_expander.cc b/tensorflow/compiler/xla/service/gather_expander.cc
index e3a42d0..9370c88 100644
--- a/tensorflow/compiler/xla/service/gather_expander.cc
+++ b/tensorflow/compiler/xla/service/gather_expander.cc
@@ -27,85 +27,85 @@
 using tensorflow::gtl::ArraySlice;
 
 static StatusOr<HloInstruction*> TransposeIndexVectorDimToLast(
-    HloInstruction* gather_indices, int64 index_vector_dim) {
-  const Shape& gather_indices_shape = gather_indices->shape();
+    HloInstruction* start_indices, int64 index_vector_dim) {
+  const Shape& start_indices_shape = start_indices->shape();
 
-  if (gather_indices_shape.dimensions_size() == index_vector_dim) {
-    return gather_indices;
+  if (start_indices_shape.dimensions_size() == index_vector_dim) {
+    return start_indices;
   }
 
-  if (index_vector_dim == (gather_indices_shape.dimensions_size() - 1)) {
-    return gather_indices;
+  if (index_vector_dim == (start_indices_shape.dimensions_size() - 1)) {
+    return start_indices;
   }
 
   std::vector<int64> permutation;
-  permutation.reserve(gather_indices_shape.dimensions_size());
-  for (int64 i = 0, e = gather_indices_shape.dimensions_size(); i < e; i++) {
+  permutation.reserve(start_indices_shape.dimensions_size());
+  for (int64 i = 0, e = start_indices_shape.dimensions_size(); i < e; i++) {
     if (i != index_vector_dim) {
       permutation.push_back(i);
     }
   }
   permutation.push_back(index_vector_dim);
-  return MakeTransposeHlo(gather_indices, permutation);
+  return MakeTransposeHlo(start_indices, permutation);
 }
 
-// Canonicalizes the gather_indices tensors so that we only have deal with some
+// Canonicalizes the start_indices tensors so that we only have to deal with
 // specific cases in the while loop that does the heavy lifting.
 //
 // See the "High Level Algorithm" section for a broader picture.
 static StatusOr<HloInstruction*> CanonicalizeGatherIndices(
-    HloInstruction* gather_indices, int64 index_vector_dim) {
+    HloInstruction* start_indices, int64 index_vector_dim) {
   // Transpose the non-index-vector dimensions to the front.
   TF_ASSIGN_OR_RETURN(
-      HloInstruction * transposed_gather_indices,
-      TransposeIndexVectorDimToLast(gather_indices, index_vector_dim));
+      HloInstruction * transposed_start_indices,
+      TransposeIndexVectorDimToLast(start_indices, index_vector_dim));
   bool indices_are_scalar =
-      index_vector_dim == gather_indices->shape().dimensions_size();
+      index_vector_dim == start_indices->shape().dimensions_size();
 
-  // The number of dimensions in gather_indices that are index dimensions.
-  const int64 index_dims_in_gather_indices = indices_are_scalar ? 0 : 1;
+  // The number of dimensions in start_indices that are index dimensions.
+  const int64 index_dims_in_start_indices = indices_are_scalar ? 0 : 1;
 
-  // If there is only one index (i.e. gather_indices has rank 1 and this gather
+  // If there is only one index (i.e. start_indices has rank 1 and this gather
   // is really just a dynamic slice) add a leading degenerate dimension for
   // uniformity.  Otherwise create a "collapsed" leading dimension that subsumes
   // all of the non-index-vector dimensions.
-  const Shape& shape = transposed_gather_indices->shape();
-  if (shape.dimensions_size() == index_dims_in_gather_indices) {
-    return PrependDegenerateDims(transposed_gather_indices, 1);
+  const Shape& shape = transposed_start_indices->shape();
+  if (shape.dimensions_size() == index_dims_in_start_indices) {
+    return PrependDegenerateDims(transposed_start_indices, 1);
   } else {
-    // Collapse all but the dimensions (0 or 1) in gather_indices containing the
+    // Collapse all but the dimensions (0 or 1) in start_indices containing the
     // index vectors.
     return CollapseFirstNDims(
-        transposed_gather_indices,
-        shape.dimensions_size() - index_dims_in_gather_indices);
+        transposed_start_indices,
+        shape.dimensions_size() - index_dims_in_start_indices);
   }
 }
 
 // Expands out or contracts away the gather dimensions in the accumulator
 // produced by the while loop.
-static StatusOr<HloInstruction*> AdjustGatherDimsInAccumulator(
-    const Shape& gather_indices_shape, HloInstruction* accumulator,
+static StatusOr<HloInstruction*> AdjustBatchDimsInAccumulator(
+    const Shape& start_indices_shape, HloInstruction* accumulator,
     int64 index_vector_dim) {
-  std::vector<int64> output_gather_dim_bounds;
-  output_gather_dim_bounds.reserve(gather_indices_shape.dimensions_size());
-  for (int64 i = 0, e = gather_indices_shape.dimensions_size(); i < e; i++) {
+  std::vector<int64> batch_dim_bounds;
+  batch_dim_bounds.reserve(start_indices_shape.dimensions_size());
+  for (int64 i = 0, e = start_indices_shape.dimensions_size(); i < e; i++) {
     if (i != index_vector_dim) {
-      output_gather_dim_bounds.push_back(gather_indices_shape.dimensions(i));
+      batch_dim_bounds.push_back(start_indices_shape.dimensions(i));
     }
   }
 
-  if (output_gather_dim_bounds.empty()) {
-    // If output_gather_dim_bounds is empty we must be lowering a (effectively)
+  if (batch_dim_bounds.empty()) {
+    // If batch_dim_bounds is empty we must be lowering an (effectively)
     // dynamic-slice.  In that case, there is a leading degenerate gather
     // dimension that we added to make this special case play well with the
     // general while loop which we need to remove now.
     return ElideDegenerateDims(accumulator, {0});
   }
 
-  return ExpandFirstDimIntoNDims(accumulator, output_gather_dim_bounds);
+  return ExpandFirstDimIntoNDims(accumulator, batch_dim_bounds);
 }
 
-// Expand an index vector from the gather_indices tensor into a vector that can
+// Expand an index vector from the start_indices tensor into a vector that can
 // be used to dynamic-slice out of the gather operand.
 static StatusOr<HloInstruction*> ExpandIndexVectorIntoOperandSpace(
     HloInstruction* index_vector, const GatherDimensionNumbers& dim_numbers,
@@ -121,10 +121,8 @@
   std::vector<HloInstruction*> expanded_index_components;
 
   for (int i = 0; i < operand_rank; i++) {
-    int64 index_vector_dim_index =
-        FindIndex(dim_numbers.gather_dims_to_operand_dims(), i);
-    if (index_vector_dim_index !=
-        dim_numbers.gather_dims_to_operand_dims_size()) {
+    int64 index_vector_dim_index = FindIndex(dim_numbers.start_index_map(), i);
+    if (index_vector_dim_index != dim_numbers.start_index_map_size()) {
       TF_ASSIGN_OR_RETURN(
           HloInstruction * component_to_concat,
           MakeSliceHlo(index_vector, /*start_indices=*/{index_vector_dim_index},
@@ -147,10 +145,10 @@
   const GatherDimensionNumbers& dim_numbers = gather.gather_dimension_numbers();
   CHECK_EQ(incoming_loop_state.size(), 3);
   HloInstruction* const operand = incoming_loop_state[0];
-  HloInstruction* const gather_indices = incoming_loop_state[1];
+  HloInstruction* const start_indices = incoming_loop_state[1];
   HloInstruction* const output_accumulator = incoming_loop_state[2];
 
-  bool has_scalar_indices = gather_indices->shape().dimensions_size() == 1;
+  bool has_scalar_indices = start_indices->shape().dimensions_size() == 1;
   CHECK_EQ(has_scalar_indices,
            dim_numbers.index_vector_dim() ==
                gather.operand(1)->shape().dimensions_size());
@@ -163,24 +161,24 @@
   HloInstruction* index_vector;
 
   if (has_scalar_indices) {
-    // In this case gather_indices has rank 1 and induction_var_as_vector (of
+    // In this case start_indices has rank 1 and induction_var_as_vector (of
     // shape {1}) is an index into this rank 1 tensor.
     TF_ASSIGN_OR_RETURN(
         index_vector,
-        MakeDynamicSliceHlo(gather_indices, induction_var_as_vector, {1}));
+        MakeDynamicSliceHlo(start_indices, induction_var_as_vector, {1}));
   } else {
-    // In this case gather_indices has rank 2 and induction_var_as_vector (of
+    // In this case start_indices has rank 2 and induction_var_as_vector (of
     // shape {1}) is an index into just the first dimension of this rank 2
     // tensor.
     TF_ASSIGN_OR_RETURN(
-        HloInstruction * index_into_gather_indices,
+        HloInstruction * index_into_start_indices,
         PadVectorWithZeros(induction_var_as_vector,
                            /*zeros_to_prepend=*/0, /*zeros_to_append=*/1));
 
-    int64 index_vector_size = gather_indices->shape().dimensions(1);
+    int64 index_vector_size = start_indices->shape().dimensions(1);
     TF_ASSIGN_OR_RETURN(
         HloInstruction * index_vector_2d,
-        MakeDynamicSliceHlo(gather_indices, index_into_gather_indices,
+        MakeDynamicSliceHlo(start_indices, index_into_start_indices,
                             {1, index_vector_size}));
 
     TF_ASSIGN_OR_RETURN(index_vector,
@@ -194,26 +192,26 @@
 
   TF_ASSIGN_OR_RETURN(HloInstruction * gathered_slice,
                       MakeDynamicSliceHlo(operand, gathered_slice_start,
-                                          gather.gather_window_bounds()));
+                                          gather.gather_slice_sizes()));
 
   TF_ASSIGN_OR_RETURN(
-      HloInstruction * gathered_slice_with_dims_elided,
+      HloInstruction* const gathered_slice_with_dims_collapsed,
       ElideDegenerateDims(gathered_slice,
-                          AsInt64Slice(dim_numbers.elided_window_dims())));
+                          AsInt64Slice(dim_numbers.collapsed_slice_dims())));
 
   TF_ASSIGN_OR_RETURN(
-      HloInstruction * gathered_slice_for_update,
-      PrependDegenerateDims(gathered_slice_with_dims_elided, 1));
+      HloInstruction* const gathered_slice_for_update,
+      PrependDegenerateDims(gathered_slice_with_dims_collapsed, 1));
 
   TF_ASSIGN_OR_RETURN(
-      HloInstruction * index_vector_into_accumulator,
+      HloInstruction* const index_vector_into_accumulator,
       PadVectorWithZeros(
           induction_var_as_vector, /*zeros_to_prepend=*/0,
           /*zeros_to_append=*/
-          gathered_slice_with_dims_elided->shape().dimensions_size()));
+          gathered_slice_with_dims_collapsed->shape().dimensions_size()));
 
   TF_ASSIGN_OR_RETURN(
-      HloInstruction * updated_accumulator,
+      HloInstruction* const updated_accumulator,
       MakeDynamicUpdateSliceHlo(output_accumulator, gathered_slice_for_update,
                                 index_vector_into_accumulator));
 
@@ -221,19 +219,19 @@
   // WhileUtil::MakeCountedLoop functions takes care of the induction variable
   // and the while loop exit condition.
   return StatusOr<std::vector<HloInstruction*>>{
-      {operand, gather_indices, updated_accumulator}};
+      {operand, start_indices, updated_accumulator}};
 }
 
 static StatusOr<HloInstruction*> CreateGatherLoopAccumulatorInitValue(
     HloComputation* computation, PrimitiveType element_type,
-    ArraySlice<int64> window_bounds, int64 gather_loop_trip_count,
+    ArraySlice<int64> slice_sizes, int64 gather_loop_trip_count,
     const GatherDimensionNumbers& dim_numbers) {
   std::vector<int64> accumulator_state_shape_dims;
-  accumulator_state_shape_dims.reserve(1 + window_bounds.size());
+  accumulator_state_shape_dims.reserve(1 + slice_sizes.size());
   accumulator_state_shape_dims.push_back(gather_loop_trip_count);
-  for (int64 i = 0; i < window_bounds.size(); i++) {
-    if (!c_binary_search(dim_numbers.elided_window_dims(), i)) {
-      accumulator_state_shape_dims.push_back(window_bounds[i]);
+  for (int64 i = 0; i < slice_sizes.size(); i++) {
+    if (!c_binary_search(dim_numbers.collapsed_slice_dims(), i)) {
+      accumulator_state_shape_dims.push_back(slice_sizes[i]);
     }
   }
   return BroadcastZeros(computation, element_type,
@@ -241,23 +239,23 @@
 }
 
 // `accumulator` is almost the tensor the gather operation would have produced,
-// except that it has the dimensions in the wrong order -- the gather dimensions
-// are the major dimensions and the window dimensions are the minor dimensions.
+// except that it has the dimensions in the wrong order -- the batch dimensions
+// are the major dimensions and the offset dimensions are the minor dimensions.
 // Fix this up with a transpose.
-static StatusOr<HloInstruction*> PermuteGatherAndWindowDims(
-    HloInstruction* accumulator, ArraySlice<int64> output_window_dims,
+static StatusOr<HloInstruction*> PermuteBatchAndOffsetDims(
+    HloInstruction* accumulator, ArraySlice<int64> offset_dims,
     int64 output_rank) {
   std::vector<int64> permutation;
   permutation.reserve(output_rank);
 
-  int64 gather_idx_counter = 0;
-  int64 window_idx_counter = output_rank - output_window_dims.size();
+  int64 batch_idx_counter = 0;
+  int64 offset_idx_counter = output_rank - offset_dims.size();
   for (int64 i = 0; i < output_rank; i++) {
-    bool is_window_dim = c_binary_search(output_window_dims, i);
-    if (is_window_dim) {
-      permutation.push_back(window_idx_counter++);
+    bool is_offset_dim = c_binary_search(offset_dims, i);
+    if (is_offset_dim) {
+      permutation.push_back(offset_idx_counter++);
     } else {
-      permutation.push_back(gather_idx_counter++);
+      permutation.push_back(batch_idx_counter++);
     }
   }
 
@@ -268,11 +266,11 @@
 //
 // We follow the following steps in sequence:
 //
-//  1. We canonicalize the gather_indices tensor such that it has rank
+//  1. We canonicalize the start_indices tensor such that it has rank
 //     2 (i.e. is a matrix) where each row is an index vector into the
 //     operand.
 //  2. We iterate over the set of indices in the canonicalized
-//     gather_indices tensor using a while loop, accumulating slices
+//     start_indices tensor using a while loop, accumulating slices
 //     of the operand tensor into an accumulator using
 //     DynamicUpdateSlice.
 //  3. The accumulator result from the while loop from (2) is then
@@ -287,11 +285,11 @@
 //     operand = s32[3,3] parameter(0)
 //     indices = s32[2,2] parameter(1)
 //     ROOT gather = s32[2,3,2] gather(operand, indices),
-//         output_window_dims={1},
-//         elided_window_dims={1},
-//         gather_dims_to_operand_dims={1},
+//         offset_dims={1},
+//         collapsed_slice_dims={1},
+//         start_index_map={1},
 //         index_vector_dim=2,
-//         window_bounds={3, 1}
+//         slice_sizes={3, 1}
 //   }
 //
 // We'd first reshape indices to s32[4,1], where each row is an index
@@ -305,8 +303,8 @@
 
   HloComputation* computation = gather_instr->parent();
   HloInstruction* operand = gather_instr->mutable_operand(0);
-  HloInstruction* gather_indices = gather_instr->mutable_operand(1);
-  const Shape& gather_indices_shape = gather_indices->shape();
+  HloInstruction* start_indices = gather_instr->mutable_operand(1);
+  const Shape& start_indices_shape = start_indices->shape();
   const Shape& output_shape = gather_instr->shape();
   int64 output_rank = output_shape.dimensions_size();
 
@@ -314,9 +312,9 @@
       gather_instr->gather_dimension_numbers();
 
   int64 gather_loop_trip_count = 1;
-  for (int64 i = 0, e = gather_indices_shape.dimensions_size(); i < e; i++) {
+  for (int64 i = 0, e = start_indices_shape.dimensions_size(); i < e; i++) {
     if (i != dim_numbers.index_vector_dim()) {
-      gather_loop_trip_count *= gather_indices_shape.dimensions(i);
+      gather_loop_trip_count *= start_indices_shape.dimensions(i);
     }
   }
 
@@ -327,24 +325,24 @@
         gather_instr->ToString().c_str());
   }
 
-  TF_ASSIGN_OR_RETURN(HloInstruction * canonical_gather_indices,
-                      CanonicalizeGatherIndices(
-                          gather_indices, dim_numbers.index_vector_dim()));
+  TF_ASSIGN_OR_RETURN(
+      HloInstruction * canonical_start_indices,
+      CanonicalizeGatherIndices(start_indices, dim_numbers.index_vector_dim()));
 
   CHECK_EQ(gather_loop_trip_count,
-           canonical_gather_indices->shape().dimensions(0));
+           canonical_start_indices->shape().dimensions(0));
 
   TF_ASSIGN_OR_RETURN(
       HloInstruction * accumulator_init,
       CreateGatherLoopAccumulatorInitValue(
           computation, output_shape.element_type(),
-          gather_instr->gather_window_bounds(), gather_loop_trip_count,
+          gather_instr->gather_slice_sizes(), gather_loop_trip_count,
           gather_instr->gather_dimension_numbers()));
 
   StatusOr<std::vector<HloInstruction*>> gather_loop_result_or_error =
       WhileUtil::MakeCountedLoop(
           computation, gather_loop_trip_count,
-          {operand, canonical_gather_indices, accumulator_init},
+          {operand, canonical_start_indices, accumulator_init},
           [&](HloInstruction* indvar,
               const std::vector<HloInstruction*>& loop_state) {
             return GatherLoopBody(*gather_instr, indvar, loop_state);
@@ -356,13 +354,13 @@
   HloInstruction* accumulator_result = gather_loop_result.back();
 
   TF_ASSIGN_OR_RETURN(
-      HloInstruction * accumulator_with_output_gather_dims_decanonicalized,
-      AdjustGatherDimsInAccumulator(gather_indices->shape(), accumulator_result,
-                                    dim_numbers.index_vector_dim()));
+      HloInstruction* const accumulator_with_batch_dims_decanonicalized,
+      AdjustBatchDimsInAccumulator(start_indices->shape(), accumulator_result,
+                                   dim_numbers.index_vector_dim()));
 
-  return PermuteGatherAndWindowDims(
-      accumulator_with_output_gather_dims_decanonicalized,
-      AsInt64Slice(dim_numbers.output_window_dims()), output_rank);
+  return PermuteBatchAndOffsetDims(accumulator_with_batch_dims_decanonicalized,
+                                   AsInt64Slice(dim_numbers.offset_dims()),
+                                   output_rank);
 }
 
 StatusOr<bool> GatherExpander::Run(HloModule* module) {
diff --git a/tensorflow/compiler/xla/service/gather_expander_test.cc b/tensorflow/compiler/xla/service/gather_expander_test.cc
index 020ffcd..141dd4d 100644
--- a/tensorflow/compiler/xla/service/gather_expander_test.cc
+++ b/tensorflow/compiler/xla/service/gather_expander_test.cc
@@ -28,11 +28,11 @@
   operand = s32[3,3] parameter(0)
   indices = s32[2147483647,5] parameter(1)
   ROOT gather = s32[2147483647,3,5] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={1},
-      gather_dims_to_operand_dims={1},
+      offset_dims={1},
+      collapsed_slice_dims={1},
+      start_index_map={1},
       index_vector_dim=2,
-      window_bounds={3, 1}
+      slice_sizes={3, 1}
 }
 )";
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
@@ -55,11 +55,11 @@
   operand = s32[3,3] parameter(0)
   indices = s32[2] parameter(1)
   ROOT gather = s32[3,2] gather(operand, indices),
-      output_window_dims={0},
-      elided_window_dims={1},
-      gather_dims_to_operand_dims={1},
+      offset_dims={0},
+      collapsed_slice_dims={1},
+      start_index_map={1},
       index_vector_dim=1,
-      window_bounds={3, 1}
+      slice_sizes={3, 1}
 }
 )";
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
diff --git a/tensorflow/compiler/xla/service/generic_transfer_manager.cc b/tensorflow/compiler/xla/service/generic_transfer_manager.cc
index e314a46..0ce2db9 100644
--- a/tensorflow/compiler/xla/service/generic_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/generic_transfer_manager.cc
@@ -24,7 +24,6 @@
 #include "tensorflow/compiler/xla/service/interpreter/platform_id.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
-#include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -60,17 +59,19 @@
 
 void GenericTransferManager::TransferLiteralFromDevice(
     se::Stream* stream, const ShapedBuffer& device_buffer,
-    std::function<void(StatusOr<std::unique_ptr<Literal>>)> done) {
+    MutableBorrowingLiteral literal, std::function<void(Status)> done) {
   Status status = stream->BlockHostUntilDone();
   if (!status.ok()) {
     return done(status);
   }
-  done(TransferLiteralFromDeviceInternal(stream->parent(), device_buffer));
+
+  done(TransferLiteralFromDeviceInternal(stream->parent(), device_buffer,
+                                         literal));
 }
 
-StatusOr<std::unique_ptr<Literal>>
-GenericTransferManager::TransferLiteralFromDeviceInternal(
-    se::StreamExecutor* executor, const ShapedBuffer& device_buffer) {
+Status GenericTransferManager::TransferLiteralFromDeviceInternal(
+    se::StreamExecutor* executor, const ShapedBuffer& device_buffer,
+    MutableBorrowingLiteral literal) {
   VLOG(2) << "transferring literal from device ordinal "
           << executor->device_ordinal() << "; device buffer: " << device_buffer;
   TF_RET_CHECK(executor->device_ordinal() == device_buffer.device_ordinal());
@@ -80,9 +81,6 @@
   TF_RET_CHECK(ShapeUtil::Equal(device_buffer.on_device_shape(),
                                 device_buffer.on_host_shape()));
 
-  std::unique_ptr<Literal> literal =
-      Literal::CreateFromShape(device_buffer.on_host_shape());
-
   TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus(
       device_buffer.on_host_shape(),
       [&](const Shape& subshape, const ShapeIndex& index) -> Status {
@@ -91,12 +89,12 @@
               /*source=*/device_buffer.buffer(index),
               /*size=*/GetByteSizeRequirement(subshape),
               /*destination=*/
-              literal->untyped_data(index)));
+              literal.untyped_data(index)));
         }
 
         return Status::OK();
       }));
-  return std::move(literal);
+  return Status::OK();
 }
 
 Status GenericTransferManager::TransferLiteralToDeviceAsync(
@@ -160,7 +158,7 @@
 
 Status GenericTransferManager::TransferLiteralFromOutfeed(
     se::StreamExecutor* executor, const Shape& literal_shape,
-    Literal* literal) {
+    MutableBorrowingLiteral literal) {
   return Unimplemented("Generic transfer from Outfeed");
 }
 
diff --git a/tensorflow/compiler/xla/service/generic_transfer_manager.h b/tensorflow/compiler/xla/service/generic_transfer_manager.h
index 3cd002c..6c1a215 100644
--- a/tensorflow/compiler/xla/service/generic_transfer_manager.h
+++ b/tensorflow/compiler/xla/service/generic_transfer_manager.h
@@ -19,7 +19,6 @@
 #include <vector>
 
 #include "tensorflow/compiler/xla/service/transfer_manager.h"
-#include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
@@ -41,9 +40,10 @@
 
   se::Platform::Id PlatformId() const override;
 
-  void TransferLiteralFromDevice(
-      se::Stream* stream, const ShapedBuffer& device_buffer,
-      std::function<void(StatusOr<std::unique_ptr<Literal>>)> done) override;
+  void TransferLiteralFromDevice(se::Stream* stream,
+                                 const ShapedBuffer& device_buffer,
+                                 MutableBorrowingLiteral literal,
+                                 std::function<void(Status)> done) override;
 
   Status TransferLiteralToDeviceAsync(
       se::Stream* stream, const LiteralSlice& literal,
@@ -53,7 +53,7 @@
                                  const LiteralSlice& literal) override;
   Status TransferLiteralFromOutfeed(se::StreamExecutor* executor,
                                     const Shape& literal_shape,
-                                    Literal* literal) override;
+                                    MutableBorrowingLiteral literal) override;
 
   Status ResetDevices(
       tensorflow::gtl::ArraySlice<se::StreamExecutor*> executors) override;
@@ -67,8 +67,9 @@
       const Shape& shape, se::DeviceMemoryBase* region) override;
 
  private:
-  StatusOr<std::unique_ptr<Literal>> TransferLiteralFromDeviceInternal(
-      se::StreamExecutor* executor, const ShapedBuffer& device_buffer);
+  Status TransferLiteralFromDeviceInternal(se::StreamExecutor* executor,
+                                           const ShapedBuffer& device_buffer,
+                                           MutableBorrowingLiteral literal);
 
   // The platform this transfer manager targets.
   const se::Platform::Id platform_id_;
diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index 6a0aedc..8ef7285 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -1,6 +1,7 @@
 # Description:
 #   GPU-specific components in XLA service implementation.
 
+load("//tensorflow/compiler/xla/tests:build_defs.bzl", "xla_test")
 load("//tensorflow/compiler/xla:xla.bzl", "xla_proto_library")
 
 licenses(["notice"])  # Apache 2.0
@@ -153,7 +154,6 @@
         ":ir_emission_utils",
         ":parallel_loop_emitter",
         ":partition_assignment",
-        ":while_transformer",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
@@ -166,6 +166,7 @@
         "//tensorflow/compiler/xla/service:elemental_ir_emitter",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:name_uniquer",
+        "//tensorflow/compiler/xla/service:while_loop_analysis",
         "//tensorflow/compiler/xla/service/llvm_ir:buffer_assignment_util",
         "//tensorflow/compiler/xla/service/llvm_ir:dynamic_update_slice_util",
         "//tensorflow/compiler/xla/service/llvm_ir:fused_ir_emitter",
@@ -361,10 +362,12 @@
     hdrs = ["cudnn_convolution_algorithm_picker.h"],
     deps = [
         ":backend_configs",
+        ":buffer_comparator",
         ":cudnn_convolution_runner",
         ":gpu_executable",
         ":ir_emission_utils",
         "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla/service:compiler",
         "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_pass",
@@ -636,6 +639,7 @@
         "//tensorflow/compiler/xla/service:buffer_liveness",
         "//tensorflow/compiler/xla/service:call_inliner",
         "//tensorflow/compiler/xla/service:conditional_simplifier",
+        "//tensorflow/compiler/xla/service:convolution_feature_group_converter",
         "//tensorflow/compiler/xla/service:executable",
         "//tensorflow/compiler/xla/service:flatten_call_graph",
         "//tensorflow/compiler/xla/service:hlo",
@@ -652,6 +656,7 @@
         "//tensorflow/compiler/xla/service:llvm_compiler",
         "//tensorflow/compiler/xla/service:reduce_precision_insertion",
         "//tensorflow/compiler/xla/service:reshape_mover",
+        "//tensorflow/compiler/xla/service:scatter_expander",
         "//tensorflow/compiler/xla/service:transpose_folding",
         "//tensorflow/compiler/xla/service:tuple_simplifier",
         "//tensorflow/compiler/xla/service:while_loop_constant_sinking",
@@ -787,32 +792,17 @@
     ],
 )
 
-cc_library(
-    name = "while_transformer",
-    srcs = ["while_transformer.cc"],
-    hdrs = ["while_transformer.h"],
-    deps = [
-        "//tensorflow/compiler/xla:literal",
-        "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla:status_macros",
-        "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla/service:hlo",
-        "//tensorflow/core:lib",
-    ],
-)
-
 tf_cc_test(
     name = "while_transformer_test",
     srcs = ["while_transformer_test.cc"],
     deps = [
         ":instruction_fusion",
-        ":while_transformer",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla/service:copy_insertion",
         "//tensorflow/compiler/xla/service:hlo_verifier",
+        "//tensorflow/compiler/xla/service:while_loop_analysis",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
@@ -867,3 +857,35 @@
         "//tensorflow/core:test",
     ],
 )
+
+cc_library(
+    name = "buffer_comparator",
+    srcs = ["buffer_comparator.cc"],
+    hdrs = ["buffer_comparator.h"],
+    deps = [
+        ":gpu_executable",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla/service:compiler",
+        "//tensorflow/compiler/xla/service:device_memory_allocator",
+        "//tensorflow/compiler/xla/service:hlo_parser",
+        "//tensorflow/compiler/xla/service:hlo_runner",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:stream_executor_no_cuda",
+    ],
+)
+
+xla_test(
+    name = "buffer_comparator_test",
+    srcs = ["buffer_comparator_test.cc"],
+    backends = [
+        "cpu",
+        "gpu",
+    ],
+    deps = [
+        ":buffer_comparator",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla/service:backend",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
diff --git a/tensorflow/compiler/xla/service/gpu/buffer_comparator.cc b/tensorflow/compiler/xla/service/gpu/buffer_comparator.cc
new file mode 100644
index 0000000..6a285a6
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/buffer_comparator.cc
@@ -0,0 +1,205 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/buffer_comparator.h"
+
+#include <cmath>
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+
+namespace xla {
+namespace gpu {
+
+static constexpr float kTolerance = 0.1f;
+
+static string GetCompHloText(size_t num_elements) {
+  // Implements the comparison routine in HLO text format, as it's more
+  // readable than building the computation programmatically.
+  static constexpr char kF16CompHloText[] = R"(
+HloModule CompareF16
+
+MaxF32 {
+  %lhs = f32[] parameter(0)
+  %rhs = f32[] parameter(1)
+  ROOT %max = f32[] maximum(%lhs, %rhs)
+}
+
+Canonicalize (aparam: f16[SIZE]) -> f32[SIZE] {
+  %min_constant = f32[] constant(-65505)
+  %max_constant = f32[] constant(65505)
+  %large_constant = f32[] constant(1048576)
+  %min_values = f32[SIZE] broadcast(%min_constant), dimensions={}
+  %max_values = f32[SIZE] broadcast(%max_constant), dimensions={}
+  %large_values = f32[SIZE] broadcast(%large_constant), dimensions={}
+
+  %a = f16[SIZE] parameter(0)
+  %converted = f32[SIZE] convert(%a)
+  %clamped = f32[SIZE] clamp(%min_values, %converted, %max_values)
+
+  // Since the clamp() above already took care of infs, only NaNs will cause
+  // is-finite() to return false.
+  %is_finite = pred[SIZE] is-finite(%clamped)
+  ROOT %result = f32[SIZE] select(%is_finite, %clamped, %large_values)
+}
+
+ENTRY MaxDifference {
+  %one_constant = f32[] constant(1.0)
+  %zero_constant = f32[] constant(0.0)
+
+  %ones = f32[SIZE] broadcast(%one_constant), dimensions={}
+
+  %lhs = f16[SIZE] parameter(0)
+  %rhs = f16[SIZE] parameter(1)
+  %lhs_canonical = f32[SIZE] call(%lhs), to_apply=Canonicalize
+  %rhs_canonical = f32[SIZE] call(%rhs), to_apply=Canonicalize
+  %sub = f32[SIZE] subtract(%lhs_canonical, %rhs_canonical)
+  %sub_abs = f32[SIZE] abs(%sub)
+  %lhs_abs = f32[SIZE] abs(%lhs_canonical)
+  %rhs_abs = f32[SIZE] abs(%rhs_canonical)
+  %max = f32[SIZE] maximum(%lhs_abs, %rhs_abs)
+  %denominator = f32[SIZE] add(%max, %ones)
+  %error = f32[SIZE] divide(%sub_abs, %denominator)
+  ROOT %max_diff = f32[] reduce(%error, %zero_constant), dimensions={0}, to_apply=MaxF32
+})";
+  auto size_string = std::to_string(num_elements);
+  return tensorflow::str_util::StringReplace(
+      kF16CompHloText, "SIZE", {size_string.data(), size_string.size()}, true);
+}
+
+StatusOr<F16BufferComparator> F16BufferComparator::Create(
+    se::DeviceMemory<Eigen::half> ref_buffer, Compiler* compiler,
+    DeviceMemoryAllocator* allocator, se::Stream* stream) {
+  auto stream_exec = stream->parent();
+  int64 num_elements = ref_buffer.ElementCount();
+
+  // One may consider using hlo_runner to do all the compilation and execution.
+  // However, as of now hlo_runner doesn't support injection of a Compiler*, a
+  // Stream*, or even the allocator. We may revisit this in the future if it
+  // proves to be a maintenance burden.
+  TF_ASSIGN_OR_RETURN(
+      auto exec, ([&]() -> StatusOr<std::unique_ptr<Executable>> {
+        HloModuleConfig config;
+        DebugOptions debug_options;
+        debug_options.set_xla_backend_optimization_level(2);
+        config.set_debug_options(debug_options);
+        TF_ASSIGN_OR_RETURN(
+            auto module, ParseHloString(GetCompHloText(num_elements), config));
+        TF_ASSIGN_OR_RETURN(
+            module,
+            compiler->RunHloPasses(std::move(module), stream_exec, nullptr));
+        return compiler->RunBackend(std::move(module), stream_exec, nullptr);
+      }()));
+
+  TF_ASSIGN_OR_RETURN(
+      auto shaped_buffer, ([&]() -> StatusOr<ScopedShapedBuffer> {
+        auto device_ordinal = stream_exec->device_ordinal();
+        TF_ASSIGN_OR_RETURN(
+            auto owning_buffer,
+            allocator->Allocate(device_ordinal, ref_buffer.size()));
+        se::DeviceMemory<Eigen::half> buffer(
+            owning_buffer.AsDeviceMemoryBase());
+        stream->ThenMemcpy(&buffer, ref_buffer, ref_buffer.size());
+        Shape shape = ShapeUtil::MakeShape(xla::F16, {num_elements});
+        ScopedShapedBuffer ret(shape, shape, allocator, device_ordinal);
+        ret.set_buffer(std::move(owning_buffer), {});
+        return std::move(ret);
+      }()));
+
+  return F16BufferComparator(stream, allocator, std::move(exec),
+                             std::move(shaped_buffer));
+}
+
+StatusOr<bool> F16BufferComparator::CompareEqualImpl(
+    se::DeviceMemory<Eigen::half> test_buffer) {
+  if (ref_buffer_.root_buffer().size() != test_buffer.size()) {
+    return InternalError("Mismatched buffer size: %lld vs %lld",
+                         ref_buffer_.root_buffer().size(), test_buffer.size());
+  }
+
+  int64 num_elements = test_buffer.ElementCount();
+
+  TF_ASSIGN_OR_RETURN(
+      auto result_buffer, ([&]() -> StatusOr<ScopedShapedBuffer> {
+        auto stream_exec = stream_->parent();
+        Shape shape = ShapeUtil::MakeShape(xla::F16, {num_elements});
+        auto device_ordinal = stream_exec->device_ordinal();
+        ShapedBuffer shaped_test_buffer(shape, shape, stream_exec->platform(),
+                                        device_ordinal);
+        shaped_test_buffer.set_buffer(test_buffer, {});
+        ExecutableRunOptions run_options;
+        run_options.set_device_ordinal(stream_exec->device_ordinal());
+        run_options.set_stream(stream_);
+        run_options.set_allocator(allocator_);
+        ServiceExecutableRunOptions service_run_options(run_options);
+        return exec_->ExecuteOnStream(
+            &service_run_options, {&ref_buffer_, &shaped_test_buffer}, nullptr);
+      }()));
+
+  float result;
+  CHECK(result_buffer.root_buffer().size() == sizeof(result));
+  stream_->ThenMemcpy(&result, result_buffer.root_buffer(), sizeof(result));
+  TF_RETURN_IF_ERROR(stream_->BlockHostUntilDone());
+  return result < kTolerance;
+}
+
+StatusOr<bool> F16BufferComparator::CompareEqual(
+    se::DeviceMemory<Eigen::half> test_buffer) {
+  TF_ASSIGN_OR_RETURN(auto result, CompareEqualImpl(test_buffer));
+  if (result) {
+    return true;
+  }
+  // Host-side code that does the same comparison, but also reports some of
+  // the differences it finds.
+  int64 n = test_buffer.ElementCount();
+  std::vector<half> host_ref_buffer(n), host_test_buffer(n);
+  stream_->ThenMemcpy(host_ref_buffer.data(), ref_buffer_.root_buffer(),
+                      ref_buffer_.root_buffer().size());
+  stream_->ThenMemcpy(host_test_buffer.data(), test_buffer, test_buffer.size());
+  TF_RETURN_IF_ERROR(stream_->BlockHostUntilDone());
+
+  const auto canonicalize = [](float a) -> float {
+    constexpr float kBigNumer = 1048576.;
+    constexpr float kMaxFp16Value = 65504.;
+    if (std::isnan(a)) {
+      return kBigNumer;
+    }
+    if (std::isinf(a)) {
+      if (a < 0) {
+        return -(kMaxFp16Value + 1);
+      }
+      return kMaxFp16Value + 1;
+    }
+    return a;
+  };
+  int differences_seen = 0;
+  for (int64 i = 0; i < n && differences_seen < 10; i++) {
+    float original_ref = static_cast<float>(host_ref_buffer[i]);
+    float original_test = static_cast<float>(host_test_buffer[i]);
+    float ref = canonicalize(original_ref);
+    float test = canonicalize(original_test);
+    if (!(std::abs(ref - test) / (std::max(std::abs(ref), std::abs(test)) + 1) <
+          kTolerance)) {
+      differences_seen++;
+      LOG(ERROR) << "Difference at " << i << ": " << original_ref << " vs "
+                 << original_test;
+    }
+  }
+
+  return false;
+}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/buffer_comparator.h b/tensorflow/compiler/xla/service/gpu/buffer_comparator.h
new file mode 100644
index 0000000..bf2ba78
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/buffer_comparator.h
@@ -0,0 +1,71 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_BUFFER_COMPARATOR_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_BUFFER_COMPARATOR_H_
+
+#include "tensorflow/compiler/xla/service/compiler.h"
+#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
+#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+
+namespace xla {
+namespace gpu {
+
+// A fp16 comparator that internally keeps a reference buffer, and compares it
+// against other test buffers.
+class F16BufferComparator {
+ public:
+  F16BufferComparator(const F16BufferComparator&) = delete;
+  F16BufferComparator(F16BufferComparator&&) = default;
+
+  // Creates a new comparator. It internally allocates a buffer initialized by
+  // ref_buffer.
+  static StatusOr<F16BufferComparator> Create(
+      se::DeviceMemory<Eigen::half> ref_buffer, Compiler* compiler,
+      DeviceMemoryAllocator* allocator, se::Stream* stream);
+
+  // Returns true if the internally allocated buffer "compares equal" to
+  // test_buffer. The definition of "equal" is:
+  // * All NaNs equal.
+  // * All infs are treated as 65505 or -65505, so that this checker is tolerant
+  //   to fp16 overflows.
+  // * With NaNs and infs taken care of, a and b compare equal iff:
+  //     abs(a - b) / (max(abs(a), abs(b)) + 1) < tolerance
+  //
+  // See the implementation for the tolerance value.
+  StatusOr<bool> CompareEqual(se::DeviceMemory<Eigen::half> test_buffer);
+
+ private:
+  F16BufferComparator(se::Stream* stream, DeviceMemoryAllocator* allocator,
+                      std::unique_ptr<Executable> exec,
+                      ScopedShapedBuffer ref_buffer)
+      : stream_(stream),
+        allocator_(allocator),
+        exec_(std::move(exec)),
+        ref_buffer_(std::move(ref_buffer)) {}
+
+  StatusOr<bool> CompareEqualImpl(se::DeviceMemory<Eigen::half> test_buffer);
+
+  se::Stream* stream_;
+  DeviceMemoryAllocator* allocator_;
+  std::unique_ptr<Executable> exec_;
+  ScopedShapedBuffer ref_buffer_;
+};
+
+}  // namespace gpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_BUFFER_COMPARATOR_H_
diff --git a/tensorflow/compiler/xla/service/gpu/buffer_comparator_test.cc b/tensorflow/compiler/xla/service/gpu/buffer_comparator_test.cc
new file mode 100644
index 0000000..33761d1
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/buffer_comparator_test.cc
@@ -0,0 +1,126 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/buffer_comparator.h"
+
+#include <limits>
+#include "tensorflow/compiler/xla/service/backend.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace gpu {
+namespace {
+
+class BufferComparatorTest : public testing::Test {
+ protected:
+  BufferComparatorTest()
+      : backend_(Backend::CreateDefaultBackend().ConsumeValueOrDie()),
+        stream_exec_(backend_->default_stream_executor()),
+        allocator_(stream_exec_->platform(), {stream_exec_}),
+        compiler_(Compiler::GetForPlatform(stream_exec_->platform())
+                      .ConsumeValueOrDie()) {}
+
+  // Take floats only for convenience. Still uses half internally.
+  bool CompareEqualFloatBuffers(const std::vector<float>& lhs_float,
+                                const std::vector<float>& rhs_float) {
+    std::vector<half> lhs(lhs_float.begin(), lhs_float.end());
+    std::vector<half> rhs(rhs_float.begin(), rhs_float.end());
+    se::Stream stream(stream_exec_);
+    stream.Init();
+
+    auto owning_lhs_buffer =
+        allocator_
+            .Allocate(stream_exec_->device_ordinal(), lhs.size() * sizeof(half))
+            .ConsumeValueOrDie();
+
+    auto owning_rhs_buffer =
+        allocator_
+            .Allocate(stream_exec_->device_ordinal(), rhs.size() * sizeof(half))
+            .ConsumeValueOrDie();
+
+    auto lhs_buffer =
+        se::DeviceMemory<Eigen::half>(owning_lhs_buffer.AsDeviceMemoryBase());
+    auto rhs_buffer =
+        se::DeviceMemory<Eigen::half>(owning_rhs_buffer.AsDeviceMemoryBase());
+
+    stream.ThenMemcpy(&lhs_buffer, lhs.data(), lhs_buffer.size());
+    stream.ThenMemcpy(&rhs_buffer, rhs.data(), rhs_buffer.size());
+
+    TF_CHECK_OK(stream.BlockHostUntilDone());
+
+    return F16BufferComparator::Create(lhs_buffer, compiler_, &allocator_,
+                                       &stream)
+        .ConsumeValueOrDie()
+        .CompareEqual(rhs_buffer)
+        .ConsumeValueOrDie();
+  }
+
+  std::unique_ptr<Backend> backend_;
+  se::StreamExecutor* stream_exec_;
+  StreamExecutorMemoryAllocator allocator_;
+  Compiler* compiler_;
+};
+
+TEST_F(BufferComparatorTest, TestNaNs) {
+  EXPECT_TRUE(CompareEqualFloatBuffers({std::nanf("")}, {std::nanf("")}));
+  // NaN values with different bit patterns should compare equal.
+  EXPECT_TRUE(CompareEqualFloatBuffers({std::nanf("")}, {std::nanf("1234")}));
+  EXPECT_FALSE(CompareEqualFloatBuffers({std::nanf("")}, {1.}));
+}
+
+TEST_F(BufferComparatorTest, TestInfs) {
+  const auto inf = std::numeric_limits<float>::infinity();
+  EXPECT_FALSE(CompareEqualFloatBuffers({inf}, {std::nanf("")}));
+  EXPECT_TRUE(CompareEqualFloatBuffers({inf}, {inf}));
+  EXPECT_TRUE(CompareEqualFloatBuffers({inf}, {65504}));
+  EXPECT_TRUE(CompareEqualFloatBuffers({-inf}, {-65504}));
+  EXPECT_FALSE(CompareEqualFloatBuffers({inf}, {-65504}));
+  EXPECT_FALSE(CompareEqualFloatBuffers({-inf}, {65504}));
+
+  EXPECT_FALSE(CompareEqualFloatBuffers({inf}, {20}));
+  EXPECT_FALSE(CompareEqualFloatBuffers({inf}, {-20}));
+  EXPECT_FALSE(CompareEqualFloatBuffers({-inf}, {20}));
+  EXPECT_FALSE(CompareEqualFloatBuffers({-inf}, {-20}));
+}
+
+TEST_F(BufferComparatorTest, TestNumbers) {
+  EXPECT_TRUE(CompareEqualFloatBuffers({20}, {20.1}));
+  EXPECT_FALSE(CompareEqualFloatBuffers({0}, {1}));
+  EXPECT_TRUE(CompareEqualFloatBuffers({0.9}, {1}));
+  EXPECT_TRUE(CompareEqualFloatBuffers({9}, {10}));
+  EXPECT_TRUE(CompareEqualFloatBuffers({10}, {9}));
+}
+
+TEST_F(BufferComparatorTest, TestMultiple) {
+  EXPECT_TRUE(CompareEqualFloatBuffers({20, 30, 40, 50, 60},
+                                       {20.1, 30.1, 40.1, 50.1, 60.1}));
+  std::vector<float> lhs(200);
+  std::vector<float> rhs(200);
+  for (int i = 0; i < 200; i++) {
+    EXPECT_TRUE(CompareEqualFloatBuffers(lhs, rhs))
+        << "should be the same at index " << i;
+    lhs[i] = 3;
+    rhs[i] = 5;
+    EXPECT_FALSE(CompareEqualFloatBuffers(lhs, rhs))
+        << "should be the different at index " << i;
+    lhs[i] = 0;
+    rhs[i] = 0;
+  }
+}
+
+}  // namespace
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
index 7348307..caeb89d 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
@@ -16,6 +16,7 @@
 #include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h"
+#include "tensorflow/compiler/xla/service/gpu/buffer_comparator.h"
 #include "tensorflow/compiler/xla/service/gpu/convolution_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
 #include "tensorflow/core/lib/gtl/optional.h"
@@ -30,7 +31,6 @@
 using se::DeviceMemoryBase;
 using se::dnn::AlgorithmConfig;
 using se::dnn::AlgorithmDesc;
-using tensorflow::gtl::nullopt;
 using tensorflow::gtl::optional;
 
 class ScratchAllocator : public se::ScratchAllocator {
@@ -173,11 +173,17 @@
 // cache misses and doing extra work.  Overall, caching doesn't seem worth the
 // trouble, but we may want to revisit this if we ever find a model where
 // caching would speed up compilation a lot.
-optional<std::tuple<int64, bool, int64>>
+StatusOr<std::tuple<int64, bool, int64>>
 CudnnConvolutionAlgorithmPicker::PickBestAlgorithm(
     CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape,
     const Shape& output_shape, const Window& window,
     const ConvolutionDimensionNumbers& dnums, HloInstruction* instr) {
+  CHECK_EQ(input_shape.element_type(), filter_shape.element_type());
+  CHECK_EQ(input_shape.element_type(), output_shape.element_type());
+  // TODO(timshen): for now only check fp16. It can be expanded to other types,
+  // with some work on the HLO routines.
+  const bool cross_check_enabled = input_shape.element_type() == xla::F16;
+
   // Don't run this function concurrently on the same GPU.
   //
   // This is a bit of a hack and doesn't protect us against arbitrary concurrent
@@ -206,51 +212,75 @@
   // Allocate space for the input, filter, and output of the convolution.  We
   // use a ScratchAllocator for this instead of calling allocator_ directly so
   // that our allocations don't leak.
-  //
-  // We don't put any data in these buffers, because (in theory, anyway) the
-  // speed of a conv isn't affected by the data being convolved.
   ScratchAllocator input_output_allocator(device_ordinal, allocator);
-  StatusOr<DeviceMemoryBase> maybe_input_buf =
-      input_output_allocator.AllocateBytes(&stream,
-                                           ShapeUtil::ByteSizeOf(input_shape));
-  StatusOr<DeviceMemoryBase> maybe_filter_buf =
-      input_output_allocator.AllocateBytes(&stream,
-                                           ShapeUtil::ByteSizeOf(filter_shape));
-  StatusOr<DeviceMemoryBase> maybe_output_buf =
-      input_output_allocator.AllocateBytes(&stream,
-                                           ShapeUtil::ByteSizeOf(output_shape));
-  if (!maybe_input_buf.ok() || !maybe_filter_buf.ok() ||
-      !maybe_output_buf.ok()) {
-    LOG(WARNING)
-        << "Couldn't allocate space for input/filter/output of convolution "
-        << instr->ToString() << ".  Falling back to default algorithm.";
-    return nullopt;
-  }
+  TF_ASSIGN_OR_RETURN(DeviceMemoryBase input_buf,
+                      input_output_allocator.AllocateBytes(
+                          &stream, ShapeUtil::ByteSizeOf(input_shape)));
+  TF_ASSIGN_OR_RETURN(DeviceMemoryBase filter_buf,
+                      input_output_allocator.AllocateBytes(
+                          &stream, ShapeUtil::ByteSizeOf(filter_shape)));
+  TF_ASSIGN_OR_RETURN(DeviceMemoryBase output_buf,
+                      input_output_allocator.AllocateBytes(
+                          &stream, ShapeUtil::ByteSizeOf(output_shape)));
 
-  DeviceMemoryBase input_buf = maybe_input_buf.ValueOrDie();
-  DeviceMemoryBase filter_buf = maybe_filter_buf.ValueOrDie();
-  DeviceMemoryBase output_buf = maybe_output_buf.ValueOrDie();
+  if (cross_check_enabled) {
+    // Broadcast a constant to the buffer, instead of zeroing the buffer. A
+    // non-zero constant is useful for cross checking, because all-zero inputs
+    // may not always reveal bugs.
+    const auto initialize_f16 = [&stream](DeviceMemoryBase buffer) {
+      CHECK_EQ(0, (uintptr_t)buffer.opaque() % 4);
+      size_t left_over_bytes = buffer.size() % 4;
+      CHECK_EQ(0, left_over_bytes % 2);
 
-  // Although we don't have evidence this matters, zero out the buffers before
-  // autotuning.  It's conceivable that using uninitialized memory as the inputs
-  // might affect performance if e.g. the inputs contain denormals, and this is
-  // easy enough.
-  if (!stream.ThenMemZero(&input_buf, input_buf.size())
-           .ThenMemZero(&filter_buf, filter_buf.size())
-           .ThenMemZero(&output_buf, output_buf.size())
-           .BlockHostUntilDone()
-           .ok()) {
-    LOG(WARNING)
-        << "Couldn't zero out input/filter/output buffer for convolution "
-        << instr->ToString() << ".  Falling back to default algorithm.";
-    return nullopt;
+      constexpr float kBroadcastedConstant = 0.1f;
+      Eigen::half halfs[2] = {Eigen::half(kBroadcastedConstant),
+                              Eigen::half(kBroadcastedConstant)};
+      uint32 bits;
+      static_assert(sizeof(bits) == sizeof(halfs), "");
+      memcpy(&bits, halfs, sizeof(bits));
+
+      size_t aligned_size = buffer.size() / 4 * 4;
+      stream.ThenMemset32(&buffer, bits, aligned_size);
+
+      DeviceMemoryBase left_over(
+          static_cast<char*>(buffer.opaque()) + aligned_size, left_over_bytes);
+      stream.ThenMemcpy(&left_over, halfs, left_over_bytes);
+    };
+    initialize_f16(input_buf);
+    initialize_f16(filter_buf);
+    initialize_f16(output_buf);
+  } else {
+    // Although we don't have evidence this matters, zero out the buffers before
+    // autotuning.  It's conceivable that using uninitialized memory as the
+    // inputs might affect performance if e.g. the inputs contain denormals, and
+    // this is easy enough.
+    stream.ThenMemZero(&input_buf, input_buf.size())
+        .ThenMemZero(&filter_buf, filter_buf.size())
+        .ThenMemZero(&output_buf, output_buf.size());
   }
+  TF_RETURN_IF_ERROR(stream.BlockHostUntilDone());
+
+  DeviceMemoryBase* result_buf = [&] {
+    switch (kind) {
+      case CudnnConvKind::kBackwardFilter:
+        return &filter_buf;
+      case CudnnConvKind::kBackwardInput:
+        return &input_buf;
+      case CudnnConvKind::kForward:
+        return &output_buf;
+    }
+  }();
 
   const bool use_winograd_nonfused = ShouldIncludeWinogradNonfusedAlgo(
       input_shape, output_shape, dnums, stream_exec_);
   se::dnn::ProfileResult best_result;
   int64 best_result_bytes_used = 0;
 
+  optional<F16BufferComparator> comparator;
+  // Use the first supported algorithm as the reference. There isn't a
+  // particular reason to prefer it, as any algorithm suffices. Being used as
+  // the reference does not imply that algorithm is considered correct.
+  optional<AlgorithmDesc> first_algorithm;
   for (const AlgorithmDesc& alg :
        GetAlgorithms(kind, use_winograd_nonfused, stream_exec_)) {
     ScratchAllocator scratch_allocator(device_ordinal, allocator);
@@ -266,6 +296,42 @@
             .ok();
 
     if (launch_ok && profile_result.is_valid()) {
+      const bool crash_on_checking_failure =
+          instr->GetModule()
+              ->config()
+              .debug_options()
+              .xla_gpu_crash_on_verification_failures();
+      if (comparator.has_value()) {
+        StatusOr<bool> result = comparator->CompareEqual(
+            se::DeviceMemory<Eigen::half>(*result_buf));
+        if (!result.ok()) {
+          LOG(ERROR) << "Unable to compare "
+                     << AlgorithmToString(*first_algorithm) << " against "
+                     << AlgorithmToString(alg) << " for " << instr->ToString()
+                     << ": " << result.status();
+          CHECK(!crash_on_checking_failure);
+        } else if (!result.ValueOrDie()) {
+          LOG(ERROR) << "Results mismatch between different convolution "
+                        "algorithms. This is likely a bug in convolution, or "
+                        "an excessive loss of precision in convolution. "
+                     << instr->ToString() << " for "
+                     << AlgorithmToString(*first_algorithm) << " vs "
+                     << AlgorithmToString(alg);
+          CHECK(!crash_on_checking_failure);
+        }
+      } else if (cross_check_enabled) {
+        auto comp = F16BufferComparator::Create(
+            se::DeviceMemory<Eigen::half>(*result_buf), compiler_, allocator,
+            &stream);
+        if (comp.ok()) {
+          comparator.emplace(comp.ConsumeValueOrDie());
+          first_algorithm.emplace(alg);
+        } else {
+          LOG(ERROR) << "Fail to initialize buffer comparator: "
+                     << comp.status() << ", instruction: " << instr->ToString();
+          CHECK(!crash_on_checking_failure);
+        }
+      }
       int64 scratch_bytes_used = scratch_allocator.TotalAllocatedBytes();
       VLOG(3) << "Run of algorithm " << AlgorithmToString(alg)
               << " succeeded, taking " << profile_result.elapsed_time_in_ms()
@@ -292,9 +358,10 @@
                            best_result_bytes_used);
   }
 
-  LOG(WARNING) << "All algorithms tried for convolution " << instr->ToString()
-               << " failed.  Falling back to default algorithm.";
-  return nullopt;
+  return InternalError(
+      "All algorithms tried for convolution %s failed.  Falling back to "
+      "default algorithm.",
+      instr->ToString().c_str());
 }
 
 StatusOr<bool> CudnnConvolutionAlgorithmPicker::RunOnInstruction(
@@ -305,12 +372,13 @@
   const auto& lhs_shape = instr->operand(0)->shape();
   const auto& rhs_shape = instr->operand(1)->shape();
   const auto& conv_result_shape = instr->shape().tuple_shapes(0);
-  optional<std::tuple<int64, bool, int64>> alg_scratch_and_tc;
+  StatusOr<std::tuple<int64, bool, int64>> alg_scratch_and_tc;
   if (call_target == kCudnnConvForwardCallTarget) {
-    alg_scratch_and_tc = PickBestAlgorithm(
-        CudnnConvKind::kForward, /*input_shape=*/lhs_shape,
-        /*filter_shape=*/rhs_shape, /*output_shape=*/conv_result_shape,
-        instr->window(), instr->convolution_dimension_numbers(), instr);
+    alg_scratch_and_tc =
+        PickBestAlgorithm(CudnnConvKind::kForward, /*input_shape=*/lhs_shape,
+                          /*filter_shape=*/rhs_shape,
+                          /*output_shape=*/conv_result_shape, instr->window(),
+                          instr->convolution_dimension_numbers(), instr);
   } else if (call_target == kCudnnConvBackwardInputCallTarget) {
     alg_scratch_and_tc = PickBestAlgorithm(
         CudnnConvKind::kBackwardInput, /*input_shape=*/conv_result_shape,
@@ -326,7 +394,8 @@
                << instr->ToString();
   }
 
-  if (!alg_scratch_and_tc.has_value()) {
+  if (!alg_scratch_and_tc.ok()) {
+    LOG(ERROR) << alg_scratch_and_tc.status();
     return false;
   }
 
@@ -334,7 +403,8 @@
   bool tensor_ops_enabled;
   int64 scratch_bytes;
 
-  std::tie(algorithm, tensor_ops_enabled, scratch_bytes) = *alg_scratch_and_tc;
+  std::tie(algorithm, tensor_ops_enabled, scratch_bytes) =
+      alg_scratch_and_tc.ConsumeValueOrDie();
 
   VLOG(1) << "Setting cudnn conv to use algorithm " << algorithm << " and "
           << NumBytesToString(scratch_bytes)
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h
index bc5d1ce..8b77496 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h
@@ -16,6 +16,7 @@
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_ALGORITHM_PICKER_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_ALGORITHM_PICKER_H_
 
+#include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
@@ -34,8 +35,9 @@
   // memory while timing the various convolution algorithms.  If it's null,
   // we'll use the default allocator on the StreamExecutor.
   CudnnConvolutionAlgorithmPicker(se::StreamExecutor* stream_exec,
-                                  DeviceMemoryAllocator* allocator)
-      : stream_exec_(stream_exec), allocator_(allocator) {}
+                                  DeviceMemoryAllocator* allocator,
+                                  Compiler* compiler)
+      : stream_exec_(stream_exec), allocator_(allocator), compiler_(compiler) {}
 
   tensorflow::StringPiece name() const override {
     return "cudnn-convolution-algorithm-picker";
@@ -46,13 +48,14 @@
  private:
   StatusOr<bool> RunOnComputation(HloComputation* computation);
   StatusOr<bool> RunOnInstruction(HloInstruction* instr);
-  tensorflow::gtl::optional<std::tuple<int64, bool, int64>> PickBestAlgorithm(
+  StatusOr<std::tuple<int64, bool, int64>> PickBestAlgorithm(
       CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape,
       const Shape& output_shape, const Window& window,
       const ConvolutionDimensionNumbers& dnums, HloInstruction* instr);
 
   se::StreamExecutor* stream_exec_;                   // never null
   DeviceMemoryAllocator* allocator_;                  // may be null
+  Compiler* compiler_;
 };
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.cc
index 0645fbb..7b0d9e5 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.cc
@@ -96,15 +96,9 @@
   // tensorflow/python/ops/nn_ops.py).
   const int effective_num_dimensions = std::max(2, num_dimensions);
 
-  if (std::is_same<T, float>::value) {
-    CHECK_EQ(F32, output_shape.element_type())
-        << ShapeUtil::HumanString(output_shape);
-  } else if (std::is_same<T, Eigen::half>::value) {
-    CHECK_EQ(F16, output_shape.element_type())
-        << ShapeUtil::HumanString(output_shape);
-  } else {
-    LOG(FATAL) << ShapeUtil::HumanString(output_shape);
-  }
+  CHECK_EQ(primitive_util::NativeToPrimitiveType<T>(),
+           output_shape.element_type())
+      << ShapeUtil::HumanString(output_shape);
 
   CHECK_EQ(num_dimensions, dnums.input_spatial_dimensions_size());
   CHECK_EQ(num_dimensions, dnums.kernel_spatial_dimensions_size());
@@ -246,21 +240,31 @@
     se::dnn::AlgorithmConfig algorithm, se::Stream* stream,
     se::dnn::ProfileResult* profile_result) {
   PrimitiveType output_primitive_type = output_shape.element_type();
-  CHECK(output_primitive_type == F32 || output_primitive_type == F16)
-      << ShapeUtil::HumanString(output_shape);
-  if (output_primitive_type == F32) {
-    return RunCudnnConvolution(
-        kind, input_shape, filter_shape, output_shape,
-        se::DeviceMemory<float>(input_buf), se::DeviceMemory<float>(filter_buf),
-        se::DeviceMemory<float>(output_buf), scratch_allocator, window, dnums,
-        algorithm, stream, profile_result);
+  switch (output_primitive_type) {
+    case F16:
+      return RunCudnnConvolution(kind, input_shape, filter_shape, output_shape,
+                                 se::DeviceMemory<Eigen::half>(input_buf),
+                                 se::DeviceMemory<Eigen::half>(filter_buf),
+                                 se::DeviceMemory<Eigen::half>(output_buf),
+                                 scratch_allocator, window, dnums, algorithm,
+                                 stream, profile_result);
+    case F32:
+      return RunCudnnConvolution(kind, input_shape, filter_shape, output_shape,
+                                 se::DeviceMemory<float>(input_buf),
+                                 se::DeviceMemory<float>(filter_buf),
+                                 se::DeviceMemory<float>(output_buf),
+                                 scratch_allocator, window, dnums, algorithm,
+                                 stream, profile_result);
+    case F64:
+      return RunCudnnConvolution(kind, input_shape, filter_shape, output_shape,
+                                 se::DeviceMemory<double>(input_buf),
+                                 se::DeviceMemory<double>(filter_buf),
+                                 se::DeviceMemory<double>(output_buf),
+                                 scratch_allocator, window, dnums, algorithm,
+                                 stream, profile_result);
+    default:
+      LOG(FATAL) << ShapeUtil::HumanString(output_shape);
   }
-  return RunCudnnConvolution(kind, input_shape, filter_shape, output_shape,
-                             se::DeviceMemory<Eigen::half>(input_buf),
-                             se::DeviceMemory<Eigen::half>(filter_buf),
-                             se::DeviceMemory<Eigen::half>(output_buf),
-                             scratch_allocator, window, dnums, algorithm,
-                             stream, profile_result);
 }
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
index cc38db2..9b6de11 100644
--- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
@@ -210,11 +210,13 @@
     return make_sqrt();
   }
 
-  if (hlo_module_config_.debug_options().xla_enable_fast_math() &&
-      IsFPLiteralWithValue(rhs, -.5)) {
+  if (IsFPLiteralWithValue(rhs, -.5)) {
     VLOG(10) << "emitting pow(A, -.5) as 1/sqrt(A): " << op->ToString();
     // LLVM's NVPTX backend knows how to transform 1/sqrt(A) into the NVPTX
     // rsqrt.approx instruction.
+    //
+    // TODO(jlebar): Does this happen with fastmath disabled?  If not, should
+    // we force-enable it?
     TF_ASSIGN_OR_RETURN(auto* sqrt, make_sqrt());
     return b_->CreateFDiv(llvm::ConstantFP::get(llvm_ty, 1), sqrt);
   }
@@ -272,27 +274,20 @@
                                prim_type);
 }
 
-StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitFloatUnaryOp(
-    const HloInstruction* op, llvm::Value* operand_value) const {
-  PrimitiveType input_type = op->operand(0)->shape().element_type();
-  PrimitiveType output_type = op->shape().element_type();
-  switch (op->opcode()) {
-    case HloOpcode::kTanh:
-      // If we don't care much about precision, emit a fast approximation of
-      // tanh.
-      if (hlo_module_config_.debug_options().xla_enable_fast_math()) {
-        // Upcast F16 to F32 if necessary.
-        llvm::Type* type =
-            input_type == F16 ? b_->getFloatTy() : operand_value->getType();
-        llvm::Value* input = b_->CreateFPCast(operand_value, type);
-        llvm::Value* fast_tanh = llvm_ir::EmitFastTanh(b_, input);
-        return b_->CreateFPCast(fast_tanh, operand_value->getType());
-      }
-      return EmitLibdeviceMathCall("__nv_tanh", {operand_value}, {input_type},
-                                   output_type);
-    default:
-      return ElementalIrEmitter::EmitFloatUnaryOp(op, operand_value);
-  }
+StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitTanh(
+    PrimitiveType prim_type, llvm::Value* value) const {
+  // Emit a fast approximation of tanh instead of calling __nv_tanh.
+  // __nv_tanh is particularly bad because it contains branches, thus
+  // preventing LLVM's load-store vectorizer from working its magic across a
+  // function which contains tanh calls.
+  //
+  // This routine isn't numerically precise, but it's good enough for ML.
+
+  // Upcast F16 to F32 if necessary.
+  llvm::Type* type = prim_type == F16 ? b_->getFloatTy() : value->getType();
+  llvm::Value* input = b_->CreateFPCast(value, type);
+  llvm::Value* fast_tanh = llvm_ir::EmitFastTanh(b_, input);
+  return b_->CreateFPCast(fast_tanh, value->getType());
 }
 
 llvm::Value* GpuElementalIrEmitter::EmitDeviceFunctionCall(
@@ -445,6 +440,8 @@
         return b_->CreateLoad(accum_ptr);
       };
     case HloOpcode::kReduce:
+      // TODO(b/112040122): This should be supported.
+      CHECK_EQ(hlo->operand_count(), 2) << "Did not expect variadic reduce";
       return [=, &operand_to_generator](
                  const IrArray::Index& output_index) -> StatusOr<llvm::Value*> {
         const HloInstruction* operand = hlo->operand(0);
diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h
index e3eacef..84454d3 100644
--- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h
+++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h
@@ -51,9 +51,6 @@
       const HloToElementGeneratorMap& operand_to_generator) const override;
 
  protected:
-  StatusOr<llvm::Value*> EmitFloatUnaryOp(
-      const HloInstruction* op, llvm::Value* operand_value) const override;
-
   StatusOr<llvm::Value*> EmitFloatBinaryOp(
       const HloInstruction* op, llvm::Value* lhs_value,
       llvm::Value* rhs_value) const override;
@@ -85,6 +82,9 @@
   StatusOr<llvm::Value*> EmitAtan2(PrimitiveType prim_type, llvm::Value* lhs,
                                    llvm::Value* rhs) const override;
 
+  StatusOr<llvm::Value*> EmitTanh(PrimitiveType prim_type,
+                                  llvm::Value* value) const override;
+
   llvm::Value* EmitThreadId() const override;
 
  private:
diff --git a/tensorflow/compiler/xla/service/gpu/gemm_thunk.h b/tensorflow/compiler/xla/service/gpu/gemm_thunk.h
index 939c7f8..12c81f9 100644
--- a/tensorflow/compiler/xla/service/gpu/gemm_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/gemm_thunk.h
@@ -52,12 +52,12 @@
                          se::Stream* stream,
                          HloExecutionProfiler* profiler) override;
 
-  // Returns true if we'll perform autotuning if run on the given stream.  If
-  // so, we want the GPU to be quiescent during autotuning, so as not to
-  // introduce noise in our results.
-  bool ShouldHaltAllActivityBeforeRunning(se::Stream* stream) override {
-    return autotune_results_.count(
-               stream->parent()->GetDeviceDescription().name()) != 0;
+  bool WillAutotuneKernel(se::Stream* stream) override {
+    // We will autotune this kernel if we don't already have an autotune result
+    // for the stream device.
+    return autotune_results_.find(
+               stream->parent()->GetDeviceDescription().name()) ==
+           autotune_results_.end();
   }
 
  private:
@@ -75,6 +75,8 @@
   // results.  The map's value is the best algorithm we've found for this thunk
   // on this device, or an error if none of the algorithms worked and we should
   // use the regular gemm without an algorithm.
+  //
+  // TODO(b/112415150):  Make this thread safe.
   std::unordered_map<string, StatusOr<se::blas::AlgorithmType>>
       autotune_results_;
 };
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
index bb7736e..7060837 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -131,9 +131,10 @@
       stream->ThenWaitFor(FindOrDie(thunk_to_finish_event, dependency).get());
     }
 
-    // If this thunk requests it, wait for all currently-executing thunks to
-    // finish.  This is useful e.g. if the thunk is about to perform autotuning.
-    if (thunk->ShouldHaltAllActivityBeforeRunning(stream)) {
+    // If this thunk is about to autotune then wait for all currently executing
+    // thunks to finish.  This reduces noise and thus the probability of
+    // choosing a suboptimal algorithm.
+    if (thunk->WillAutotuneKernel(stream)) {
       TF_RETURN_IF_ERROR(main_stream->BlockHostUntilDone());
     }
 
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc
index 79b3f1e..a2f53f8 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc
@@ -117,38 +117,37 @@
   return std::move(buffer);
 }
 
-static std::unique_ptr<Literal> ShapeTreeToLiteral(
+static void ShapeTreeToLiteral(
     ShapeTree<std::unique_ptr<gpu::OutfeedBuffer>>* shape_tree) {
   // This is a struct instead of a lambda for std::function-free recursion.
   struct Helper {
-    static std::unique_ptr<Literal> helper(
+    static void helper(
         ShapeTree<std::unique_ptr<gpu::OutfeedBuffer>>* shape_tree,
         ShapeIndex* index) {
       const Shape& shape = ShapeUtil::GetSubshape(shape_tree->shape(), *index);
       if (ShapeUtil::IsArray(shape)) {
-        return (*shape_tree->mutable_element(*index))->WaitUntilAvailable();
+        (*shape_tree->mutable_element(*index))->WaitUntilAvailable();
+        return;
       }
 
       CHECK(ShapeUtil::IsTuple(shape))
           << ShapeUtil::HumanStringWithLayout(shape);
       const int64 tuple_element_count = ShapeUtil::TupleElementCount(shape);
       index->push_back(0);
-      std::vector<std::unique_ptr<Literal>> tuple_operands;
       for (int64 i = 0; i < tuple_element_count; ++i) {
         index->back() = i;
-        tuple_operands.push_back(helper(shape_tree, index));
+        helper(shape_tree, index);
       }
       index->pop_back();
-      return LiteralUtil::MakeTupleOwned(std::move(tuple_operands));
     }
   };
   ShapeIndex index;
-  return Helper::helper(shape_tree, &index);
+  Helper::helper(shape_tree, &index);
 }
 
 Status GpuTransferManager::TransferLiteralFromOutfeed(
     se::StreamExecutor* /*executor*/, const Shape& literal_shape,
-    Literal* literal) {
+    MutableBorrowingLiteral literal) {
   ShapeTree<std::unique_ptr<gpu::OutfeedBuffer>> outfeed_buffers(
       &literal_shape);
 
@@ -162,6 +161,8 @@
           return;
         }
         *buffer = MakeUnique<gpu::OutfeedBuffer>(GetByteSizeRequirement(shape));
+        (*buffer)->set_destination(
+            MakeUnique<MutableBorrowingLiteral>(literal, index));
       });
 
   // Give the tree of buffers to the outfeed mananger. The device will fill it
@@ -169,8 +170,8 @@
   gpu::OutfeedManager* outfeed_manager = gpu::GetOrCreateOutfeedManager();
   outfeed_manager->EnqueueDestination(&outfeed_buffers);
 
-  // Now turn the tree of buffers back into a literal.
-  *literal = std::move(*ShapeTreeToLiteral(&outfeed_buffers));
+  // Now wait for the tree of buffers to be written.
+  ShapeTreeToLiteral(&outfeed_buffers);
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.h b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.h
index dceeb9e..7929042 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.h
@@ -42,7 +42,7 @@
                                  const LiteralSlice& literal) override;
   Status TransferLiteralFromOutfeed(se::StreamExecutor* executor,
                                     const Shape& literal_shape,
-                                    Literal* literal) override;
+                                    MutableBorrowingLiteral literal) override;
 
  private:
   // Initiates the infeed data transfers. InfeedBuffer->Done() must be
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
index 541cacf..6675dbd 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
@@ -64,7 +64,7 @@
       hlo_module_config_(hlo_module_config) {
   b_.setFastMathFlags(llvm_ir::GetFastMathFlags(
       /*fast_math_enabled=*/hlo_module_config.debug_options()
-          .xla_enable_fast_math()));
+          .xla_gpu_enable_fast_math()));
 }
 
 Status IrEmitter::DefaultAction(HloInstruction* hlo) {
@@ -632,6 +632,10 @@
 }
 
 Status IrEmitter::HandleReduce(HloInstruction* reduce) {
+  // TODO(b/112040122): Support variadic reduce.
+  if (!ShapeUtil::IsArray(reduce->shape())) {
+    return Unimplemented("Variadic reduce is not supported on GPU");
+  }
   auto arg = reduce->operand(0);
   auto init_value = reduce->operand(1);
   tensorflow::gtl::ArraySlice<int64> dimensions(reduce->dimensions());
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index d5ecae8..1e81cbd 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -56,7 +56,6 @@
 #include "tensorflow/compiler/xla/service/gpu/thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/tuple_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/while_thunk.h"
-#include "tensorflow/compiler/xla/service/gpu/while_transformer.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
@@ -68,6 +67,7 @@
 #include "tensorflow/compiler/xla/service/llvm_ir/sort_util.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h"
 #include "tensorflow/compiler/xla/service/name_uniquer.h"
+#include "tensorflow/compiler/xla/service/while_loop_analysis.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -545,6 +545,11 @@
     switch (root->opcode()) {
       case HloOpcode::kTuple:
       case HloOpcode::kReduce: {
+        if (root->opcode() == HloOpcode::kReduce &&
+            ShapeUtil::IsTuple(root->shape())) {
+          // TODO(b/112040122): Support variadic reduce.
+          return Unimplemented("Variadic reduce is not supported on GPU");
+        }
         VLOG(3) << "Emitting fused reduction to vector: " << fusion->ToString();
         std::vector<std::unique_ptr<Thunk>> thunks;
         ArraySlice<HloInstruction*> output_instructions =
@@ -1694,6 +1699,10 @@
 }
 
 Status IrEmitterUnnested::HandleReduce(HloInstruction* reduce) {
+  // TODO(b/112040122): Support multi-output reduce.
+  if (!ShapeUtil::IsArray(reduce->shape())) {
+    return Unimplemented("Multi-output reduce is not supported on GPU");
+  }
   auto input = reduce->operand(0);
   auto init_value = reduce->operand(1);
   tensorflow::gtl::ArraySlice<int64> dimensions_to_reduce(reduce->dimensions());
@@ -1963,19 +1972,13 @@
                condition->root_instruction()->shape().element_type() == PRED)
       << "While condition computation must return bool";
   // Build ForThunk for conformant while loops, otherwise build WhileThunk.
-  auto result = CanTransformWhileToFor(xla_while);
-  if (result.ok()) {
-    auto tuple = result.ConsumeValueOrDie();
-    // loop_trip_count = (limit - start + increment - 1) / increment
-    const int64 loop_trip_count =
-        (std::get<1>(tuple) - std::get<0>(tuple) + std::get<2>(tuple) - 1) /
-        std::get<2>(tuple);
-    thunk_sequence_->emplace_back(BuildForThunk(xla_while, loop_trip_count));
+  // TODO(b/112163966): Move trip count computation earlier in the pipeline.
+  if (auto loop_trip_count = ComputeWhileLoopTripCount(xla_while)) {
+    thunk_sequence_->emplace_back(BuildForThunk(xla_while, *loop_trip_count));
     VLOG(3) << "Built ForThunk for while: " << xla_while->name();
   } else {
     thunk_sequence_->emplace_back(BuildWhileThunk(xla_while));
-    VLOG(3) << "Built WhileThunk for while: " << xla_while->name()
-            << " while-to-for transform status: " << result.status();
+    VLOG(3) << "Built WhileThunk for while: " << xla_while->name();
   }
   return Status::OK();
 }
diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc
index cf44458..ff4ae1f 100644
--- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc
+++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc
@@ -180,7 +180,7 @@
   TargetOptions target_options = InitTargetOptionsFromCodeGenFlags();
   llvm_ir::SetTargetOptions(
       /*fast_math_enabled=*/hlo_module_config.debug_options()
-          .xla_enable_fast_math(),
+          .xla_gpu_enable_fast_math(),
       &target_options);
 
   // Enable FMA synthesis.
diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
index 76c9b6a..6c1eab4 100644
--- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
@@ -34,6 +34,7 @@
 #include "tensorflow/compiler/xla/service/buffer_liveness.h"
 #include "tensorflow/compiler/xla/service/call_inliner.h"
 #include "tensorflow/compiler/xla/service/conditional_simplifier.h"
+#include "tensorflow/compiler/xla/service/convolution_feature_group_converter.h"
 #include "tensorflow/compiler/xla/service/flatten_call_graph.h"
 #include "tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_rewriter.h"
 #include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h"
@@ -72,6 +73,7 @@
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
 #include "tensorflow/compiler/xla/service/reduce_precision_insertion.h"
 #include "tensorflow/compiler/xla/service/reshape_mover.h"
+#include "tensorflow/compiler/xla/service/scatter_expander.h"
 #include "tensorflow/compiler/xla/service/transpose_folding.h"
 #include "tensorflow/compiler/xla/service/tuple_simplifier.h"
 #include "tensorflow/compiler/xla/service/while_loop_constant_sinking.h"
@@ -130,8 +132,12 @@
 }
 
 // Runs optimization passes on the given HLO module.
+//
+// It takes a compiler pointer, as passes may compile and execute HLOs on the
+// fly for cuDNN verification or other purposes.
 Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
-                         DeviceMemoryAllocator* device_allocator) {
+                         DeviceMemoryAllocator* device_allocator,
+                         Compiler* compiler) {
   {
     HloPassPipeline pipeline("optimization");
     pipeline.AddInvariantChecker<HloVerifier>();
@@ -167,6 +173,8 @@
       // elimination has to come after that pass.
       pipeline.AddPass<ZeroSizedHloElimination>();
 
+      pipeline.AddPass<ScatterExpander>();
+
       pass.AddPass<AlgebraicSimplifier>(
           /*is_layout_sensitive=*/false,
           [](const Shape&, const Shape&) { return false; });
@@ -196,6 +204,8 @@
     // (PadInsertion).
     HloPassPipeline pipeline("conv_canonicalization");
     pipeline.AddInvariantChecker<HloVerifier>();
+    // TODO(b/31709653): Directly use the grouped convolution support of Cudnn.
+    pipeline.AddPass<ConvolutionFeatureGroupConverter>();
     pipeline.AddPass<CudnnConvolutionRewriter>();
     pipeline.AddPass<PadInsertion>();
     if (IsVoltaOrLater(*stream_exec)) {
@@ -245,8 +255,8 @@
     // the gte(customcall, 0) would probably already be into a fusion node.  We
     // can't simplify across HloComputation boundaries, so in this case we
     // wouldn't be able to simplify away the new_tuple bits.
-    pipeline.AddPass<CudnnConvolutionAlgorithmPicker>(stream_exec,
-                                                      device_allocator);
+    pipeline.AddPass<CudnnConvolutionAlgorithmPicker>(
+        stream_exec, device_allocator, compiler);
     // Clean up new_tuple described above.
     pipeline.AddPass<TupleSimplifier>();
 
@@ -492,11 +502,15 @@
 StatusOr<std::unique_ptr<HloModule>> NVPTXCompiler::RunHloPasses(
     std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
     DeviceMemoryAllocator* device_allocator) {
+  // We dump the post-optimization HLO in RunBackend so no need to dump it here.
+  VLOG(2) << "*** HLO Before Optimization";
+  XLA_VLOG_LINES(2, module->ToString());
+
   XLA_SCOPED_LOGGING_TIMER("NVPTXCompiler::RunHloPasses");
   tracing::ScopedActivity activity("HLO Transforms", module->name(),
                                    /*is_expensive=*/true);
   TF_RETURN_IF_ERROR(
-      OptimizeHloModule(module.get(), stream_exec, device_allocator));
+      OptimizeHloModule(module.get(), stream_exec, device_allocator, this));
   return std::move(module);
 }
 
@@ -548,6 +562,7 @@
   // include headers, so no need for us to print them ourselves.
   XLA_VLOG_LINES(1, buffer_assignment->GetStats().ToString());
   XLA_VLOG_LINES(2, buffer_assignment->ToString());
+  VLOG(2) << "*** HLO After Optimization";
   XLA_VLOG_LINES(2, module->ToString());
   const string xla_dump_optimized_hlo_proto_to =
       module->config().debug_options().xla_dump_optimized_hlo_proto_to();
diff --git a/tensorflow/compiler/xla/service/gpu/outfeed_manager.h b/tensorflow/compiler/xla/service/gpu/outfeed_manager.h
index a752eb7..160ba4b 100644
--- a/tensorflow/compiler/xla/service/gpu/outfeed_manager.h
+++ b/tensorflow/compiler/xla/service/gpu/outfeed_manager.h
@@ -36,22 +36,19 @@
   OutfeedBuffer(int64 length) : length_(length) {}
 
   // Waits for the device transfer to be finished.
-  std::unique_ptr<Literal> WaitUntilAvailable() {
-    done_.WaitForNotification();
-    return std::move(destination_);
-  }
+  void WaitUntilAvailable() { done_.WaitForNotification(); }
 
   int64 length() const { return length_; }
-  void set_destination(std::unique_ptr<Literal> destination) {
+  void set_destination(std::unique_ptr<MutableBorrowingLiteral> destination) {
     destination_ = std::move(destination);
   }
-  Literal* destination() { return destination_.get(); }
+  MutableBorrowingLiteral* destination() { return destination_.get(); }
 
   // Callback to signal that this buffer is consumed.
   void Done() { done_.Notify(); }
 
  private:
-  std::unique_ptr<Literal> destination_;
+  std::unique_ptr<MutableBorrowingLiteral> destination_;
   const int64 length_;
   tensorflow::Notification done_;
 };
diff --git a/tensorflow/compiler/xla/service/gpu/outfeed_thunk.cc b/tensorflow/compiler/xla/service/gpu/outfeed_thunk.cc
index 7986e63..b99d998 100644
--- a/tensorflow/compiler/xla/service/gpu/outfeed_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/outfeed_thunk.cc
@@ -50,10 +50,6 @@
         if (!*buffer) {  // Tuple pointers.
           return Status::OK();
         }
-        // Allocate storage for the literal data.
-        const Shape& shape =
-            ShapeUtil::GetSubshape(outfeed_buffers->shape(), index);
-        (*buffer)->set_destination(Literal::CreateFromShape(shape));
 
         BufferAllocation::Slice slice = outfeed_slices_.element(index);
         se::DeviceMemoryBase data_address;
diff --git a/tensorflow/compiler/xla/service/gpu/thunk.h b/tensorflow/compiler/xla/service/gpu/thunk.h
index 4df0bb0..e68bee0 100644
--- a/tensorflow/compiler/xla/service/gpu/thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/thunk.h
@@ -82,17 +82,9 @@
     return Status::OK();
   }
 
-  // Users of Thunk should call ShouldHaltAllActivityBeforeRunning(stream)
-  // before calling ExecuteOnStream(stream).  If it returns true, it's the
-  // user's responsibility to wait for all activity on the GPU to finish before
-  // calling ExecuteOnStream.
-  //
-  // This value is not required to be constant for a given Thunk.  For example,
-  // a Thunk that performs autotuning may return true for its first run and
-  // false thereafter.
-  virtual bool ShouldHaltAllActivityBeforeRunning(se::Stream* /*stream*/) {
-    return false;
-  }
+  // Returns true if this kernel will autotune for the stream device the next
+  // time it is run.
+  virtual bool WillAutotuneKernel(se::Stream* /*stream*/) { return false; }
 
   // Execute the kernel for the thunk on the given stream. This method must be
   // called after Initialize and can be called multiple times over Thunk's
diff --git a/tensorflow/compiler/xla/service/gpu/while_transformer.cc b/tensorflow/compiler/xla/service/gpu/while_transformer.cc
deleted file mode 100644
index c5321df..0000000
--- a/tensorflow/compiler/xla/service/gpu/while_transformer.cc
+++ /dev/null
@@ -1,521 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/service/gpu/while_transformer.h"
-
-#include <unordered_map>
-#include <vector>
-
-#include "tensorflow/compiler/xla/literal.h"
-#include "tensorflow/compiler/xla/service/hlo_computation.h"
-#include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/status_macros.h"
-#include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/core/lib/core/errors.h"
-
-namespace xla {
-namespace gpu {
-
-namespace {
-
-// TODO(b/33483676) Use an expression tree to specify computations to pattern
-// match for while transformations.
-
-// ExprTree is a simple recursive data structure used to express computation
-// patterns to match.
-//
-// Each ExprTree node is comprised of an HloOpcode, and a set of operands (each
-// of type ExprTree). Operands can be added by specifying the index and
-// HloOpcode of the operand.
-//
-// For example, the following computation:
-//
-//            Parameter
-//               |
-//   Const  GetTupleElement
-//      \   /
-//       Add (root)
-//
-// Can be matched with the following expression tree:
-//
-//   ExprTree add(HloOpcode::kAdd,
-//                ExprTree(HloOpcode::kConstant),
-//                ExprTree(HloOpcode::kGetTupleElement,
-//                         tuple_index, ExprTree(HloOpcode::kParameter)));
-//
-// Match the ExprTree root against an Hlo graph:
-//
-//   ExprTree::TaggedInstructionMap tagged_instructions;
-//   TF_RETURN_IF_ERROR(add.Match(computation_->root_instruction(),
-//                                &tagged_instructions));
-//
-// Instructions that are "tagged" with a context-specific string will
-// be returned in 'tagged_instructions' for further processing (i.e. parsing
-// constants or recording the tuple_index).
-//
-class ExprTree {
- public:
-  explicit ExprTree(HloOpcode opcode) : opcode_(opcode) {}
-  ExprTree(HloOpcode opcode, const string& tag) : opcode_(opcode), tag_(tag) {}
-  ExprTree(HloOpcode opcode, const ExprTree& operand0) : opcode_(opcode) {
-    SetOperand(0, operand0);
-  }
-  ExprTree(HloOpcode opcode, int64 index0, const ExprTree& operand0)
-      : opcode_(opcode) {
-    SetOperand(index0, operand0);
-  }
-  ExprTree(HloOpcode opcode, int64 index0, const ExprTree& operand0,
-           int64 index1, const ExprTree& operand1)
-      : opcode_(opcode) {
-    SetOperand(index0, operand0);
-    SetOperand(index1, operand1);
-  }
-  ExprTree(HloOpcode opcode, const string& tag, const ExprTree& operand0)
-      : opcode_(opcode), tag_(tag) {
-    SetOperand(0, operand0);
-  }
-  ExprTree(HloOpcode opcode, const ExprTree& operand0, const ExprTree& operand1)
-      : opcode_(opcode) {
-    SetOperand(0, operand0);
-    SetOperand(1, operand1);
-  }
-
-  ExprTree(const ExprTree& to_copy) {
-    opcode_ = to_copy.opcode_;
-    tag_ = to_copy.tag_;
-    if (to_copy.fused_root_tree_ != nullptr) {
-      fused_root_tree_.reset(new ExprTree(*to_copy.fused_root_tree_));
-    }
-    for (auto& pair : to_copy.operands_) {
-      CHECK(operands_.find(pair.first) == operands_.end());
-      operands_.insert(std::make_pair(
-          pair.first, std::unique_ptr<ExprTree>(new ExprTree(*pair.second))));
-    }
-  }
-
-  void SetFusedRoot(const ExprTree& fused_root) {
-    fused_root_tree_.reset(new ExprTree(fused_root));
-  }
-
-  typedef std::unordered_map<string, const HloInstruction*>
-      TaggedInstructionMap;
-
-  // Matches 'instruction' HloOpcode against 'opcode_'.
-  // Recursively matches each operand in 'operands_'.
-  // Recursively matches fused instructions starting at 'fused_root_tree_'
-  // if 'opcode_ == kFusion'.
-  // Returns OK status, and instructions in 'tagged_instructions' for each
-  // matched ExprTree node with a non-empty 'tag_'.
-  // Returns error message on failure.
-  Status Match(const HloInstruction* instruction,
-               TaggedInstructionMap* tagged_instructions) const {
-    if (opcode_ != instruction->opcode()) {
-      return InvalidArgument("got opcode %s, want %s",
-                             HloOpcodeString(instruction->opcode()).c_str(),
-                             HloOpcodeString(opcode_).c_str());
-    }
-
-    VLOG(2) << "Matched " << HloOpcodeString(opcode_) << ": " << tag_;
-    if (!tag_.empty()) {
-      tagged_instructions->insert({tag_, instruction});
-    }
-
-    if (instruction->opcode() == HloOpcode::kFusion) {
-      CHECK(fused_root_tree_ != nullptr);
-      // Match fused instructions for this node starting a 'fused_root_tree'.
-      TF_RETURN_IF_ERROR(fused_root_tree_->Match(
-          instruction->fused_expression_root(), tagged_instructions));
-    }
-
-    // Match each operand in 'operands_'.
-    for (auto& pair : operands_) {
-      TF_RETURN_IF_ERROR(pair.second->Match(instruction->operand(pair.first),
-                                            tagged_instructions));
-    }
-    return Status::OK();
-  }
-
- private:
-  void SetOperand(int64 index, const ExprTree& operand) {
-    CHECK_EQ(0, operands_.count(index));
-    operands_.insert(std::make_pair(index, MakeUnique<ExprTree>(operand)));
-  }
-
-  HloOpcode opcode_;
-  std::unordered_map<int64, std::unique_ptr<ExprTree>> operands_;
-  std::unique_ptr<ExprTree> fused_root_tree_;
-  string tag_;
-};
-
-// MatcherBase is a base class that provides common functionality for
-// sub-classes which match specific target sub-computations (i.e. loop
-// induction variable initialization, comparison and update).
-class MatcherBase {
- public:
-  MatcherBase() {}
-  virtual ~MatcherBase() {}
-
-  // Attempts to match each ExprTree in 'expr_trees_'.
-  // Returns OK on the first successful match, error status otherwise.
-  virtual Status Run() {
-    Status status;
-    for (const ExprTree& expr_tree : expr_trees_) {
-      status = MatchExprTree(expr_tree);
-      if (status.ok()) {
-        return status;
-      }
-    }
-    return status;
-  }
-
-  virtual Status MatchExprTree(const ExprTree& expr_tree) = 0;
-
-  // Returns the constant value parsed form kConstant 'instruction'.
-  // Returns error status otherwise.
-  Status ParseConstInteger(const HloInstruction* instruction,
-                           int64* const_value) const {
-    CHECK_EQ(HloOpcode::kConstant, instruction->opcode());
-    PrimitiveType element_type = instruction->shape().element_type();
-    if (element_type != S32 && element_type != S64) {
-      return InvalidArgument("Expected constant of integral type.");
-    }
-    const Literal& literal = instruction->literal();
-    PrimitiveType type = literal.shape().element_type();
-    if (type != S32 && type != S64) {
-      return InvalidArgument("Must use S32 or S64 integral types.");
-    }
-    if (type == S32) {
-      *const_value = static_cast<int64>(literal.GetFirstElement<int32>());
-    } else if (type == S64) {
-      *const_value = literal.GetFirstElement<int64>();
-    }
-    return Status::OK();
-  }
-
-  StatusOr<const HloInstruction*> GetTaggedInstruction(
-      const string& tag,
-      const ExprTree::TaggedInstructionMap& tagged_instructions) {
-    auto it = tagged_instructions.find(tag);
-    if (it == tagged_instructions.end()) {
-      return InvalidArgument("Cound not find instruction for tag: %s",
-                             tag.c_str());
-    }
-    return it->second;
-  }
-
- protected:
-  std::vector<ExprTree> expr_trees_;
-
- private:
-  TF_DISALLOW_COPY_AND_ASSIGN(MatcherBase);
-};
-
-// WhileConditionComputationMatcher attempts to match a target computation
-// pattern in the while condition sub-computation.
-// If the target pattern is matched, two pieces of information are extracted
-// from 'tagged' instructions returned by the matcher:
-//
-// *) 'tuple_index':
-//    *) The loop induction variable tuple_index from the GetTupleElement
-//       instruction of the matched computation.
-//    *) Used in subsequent matching passes of while init operand and body
-//       computations to select loop induction variable tuple element.
-//
-// *) 'loop_limit':
-//    *) The integral value from Constant root operand in matched computation.
-//    *) Used as the constant for the loop limit.
-//
-class WhileConditionComputationMatcher : public MatcherBase {
- public:
-  explicit WhileConditionComputationMatcher(const HloComputation* computation)
-      : computation_(computation) {
-    expr_trees_.emplace_back(BuildCondExprTree());
-  }
-
-  int64 loop_limit() const { return loop_limit_; }
-  int64 tuple_index() const { return tuple_index_; }
-
- private:
-  // Builds expression tree for the following condition computation:
-  //
-  //     Const  Parameter
-  //        \     /
-  //         Fusion ------------> FusionParam FusionParam
-  //                                  \          /
-  //                                  GTE       /
-  //                                    \      /
-  //                                    LessThan (fused root)
-  //
-  ExprTree BuildCondExprTree() {
-    // Build ExprTree for fused instructions.
-    ExprTree fused_root(
-        HloOpcode::kLt,
-        ExprTree(HloOpcode::kGetTupleElement, "gte",
-                 ExprTree(HloOpcode::kParameter, "gte.fusion_param.param0")),
-        ExprTree(HloOpcode::kParameter));
-
-    // Build top-level computation.
-    ExprTree root(HloOpcode::kFusion,
-                  ExprTree(HloOpcode::kConstant, "loop_limit"),
-                  ExprTree(HloOpcode::kParameter, "param0"));
-
-    root.SetFusedRoot(fused_root);
-    return root;
-  }
-
-  Status MatchExprTree(const ExprTree& expr_tree) override {
-    VLOG(2) << "MATCHING while condition";
-    ExprTree::TaggedInstructionMap tagged_instructions;
-    TF_RETURN_IF_ERROR(expr_tree.Match(computation_->root_instruction(),
-                                       &tagged_instructions));
-
-    // Get tagged GTE instruction and set 'tuple_index_'.
-    TF_ASSIGN_OR_RETURN(const HloInstruction* gte,
-                        GetTaggedInstruction("gte", tagged_instructions));
-    tuple_index_ = gte->tuple_index();
-
-    // Get tagged Constant instruction and parse 'loop_limit_'.
-    TF_ASSIGN_OR_RETURN(
-        const HloInstruction* const_hlo,
-        GetTaggedInstruction("loop_limit", tagged_instructions));
-    TF_RETURN_IF_ERROR(ParseConstInteger(const_hlo, &loop_limit_));
-
-    // Get tagged "param0" instruction, and check that it matches
-    // 'computation_' parameter 0.
-    TF_ASSIGN_OR_RETURN(const HloInstruction* param0,
-                        GetTaggedInstruction("param0", tagged_instructions));
-    if (param0 != computation_->parameter_instruction(0)) {
-      return InvalidArgument("Unexpected Parameter0 instruction : %s",
-                             param0->name().c_str());
-    }
-
-    // Get tagged 'gte.fusion_param.param0', find its associated fusion operand,
-    // and compare it to 'computation_' parameter0.
-    TF_ASSIGN_OR_RETURN(
-        const HloInstruction* gte_fusion_param0,
-        GetTaggedInstruction("gte.fusion_param.param0", tagged_instructions));
-    CHECK_EQ(HloOpcode::kParameter, gte_fusion_param0->opcode());
-    CHECK(gte_fusion_param0->IsFused());
-    if (gte_fusion_param0->parent()->FusionInstruction()->operand(
-            gte_fusion_param0->parameter_number()) !=
-        computation_->parameter_instruction(0)) {
-      return InvalidArgument("Could not match fusion param: %s",
-                             gte_fusion_param0->name().c_str());
-    }
-
-    return Status::OK();
-  }
-
-  const HloComputation* computation_;
-
-  int64 loop_limit_ = -1;
-  int64 tuple_index_ = -1;
-
-  TF_DISALLOW_COPY_AND_ASSIGN(WhileConditionComputationMatcher);
-};
-
-// WhileInitOperandMatcher matches a target computation pattern of the
-// while instructions 'init' operand, indexing the tuple at 'tuple_index'.
-// On success, parses constant 'loop_start' which represents the loop induction
-// variable start values, then returns OK.
-// Returns error status otherwise.
-class WhileInitOperandMatcher : public MatcherBase {
- public:
-  WhileInitOperandMatcher(const HloInstruction* while_hlo,
-                          const int64 tuple_index)
-      : while_hlo_(while_hlo), tuple_index_(tuple_index) {
-    expr_trees_.emplace_back(BuildInitExprTree());
-  }
-
-  int64 loop_start() const { return loop_start_; }
-
- private:
-  // Builds expression tree for the following while init operand subcomputation:
-  //
-  //             Const
-  //               |
-  //             Copy
-  //               |
-  //             Tuple0
-  //               |
-  //             While
-  //
-  ExprTree BuildInitExprTree() {
-    return ExprTree(
-        HloOpcode::kWhile, "while",
-        ExprTree(HloOpcode::kTuple, tuple_index_,
-                 ExprTree(HloOpcode::kCopy,
-                          ExprTree(HloOpcode::kConstant, "loop_start"))));
-  }
-
-  Status MatchExprTree(const ExprTree& expr_tree) override {
-    VLOG(2) << "MATCHING while init";
-    ExprTree::TaggedInstructionMap tagged_instructions;
-    TF_RETURN_IF_ERROR(expr_tree.Match(while_hlo_, &tagged_instructions));
-
-    // Get tagged while instruction check against 'while_hlo_'.
-    TF_ASSIGN_OR_RETURN(const HloInstruction* while_hlo,
-                        GetTaggedInstruction("while", tagged_instructions));
-    if (while_hlo != while_hlo_) {
-      return InvalidArgument("Expected While for instruction : %s",
-                             while_hlo->name().c_str());
-    }
-
-    // Get tagged Constant instruction and parse 'loop_start_'.
-    TF_ASSIGN_OR_RETURN(
-        const HloInstruction* const_hlo,
-        GetTaggedInstruction("loop_start", tagged_instructions));
-    TF_RETURN_IF_ERROR(ParseConstInteger(const_hlo, &loop_start_));
-
-    return Status::OK();
-  }
-
-  const HloInstruction* while_hlo_;
-  const int64 tuple_index_;
-
-  int64 loop_start_ = -1;
-
-  TF_DISALLOW_COPY_AND_ASSIGN(WhileInitOperandMatcher);
-};
-
-// WhileBodyComputationMatcher matches a target computation pattern for
-// the loop induction variable update. Matching proceeds from the while body
-// computation root[tuple_index] to param[tuple_index], where 'tuple_index'
-// If the target pattern is matched, parses a constant which represents the
-// loop induction variable increment value, then returns status OK.
-// Returns error status otherwise.
-class WhileBodyComputationMatcher : public MatcherBase {
- public:
-  WhileBodyComputationMatcher(const HloComputation* computation,
-                              const int64 tuple_index)
-      : computation_(computation), tuple_index_(tuple_index) {
-    expr_trees_.emplace_back(BuildBodyExprTree(0, 1));
-    expr_trees_.emplace_back(BuildBodyExprTree(1, 0));
-  }
-
-  int64 loop_increment() const { return loop_increment_; }
-
- private:
-  // Builds expression tree for the following while body computation:
-  //
-  //
-  //                               FusionParam FusionParam
-  //                                     \      /
-  //                  Const Param         \   GTE1
-  //                     \  /              \  /
-  //                    Fusion -----------> Add
-  //                      |
-  //                     Copy
-  //                      |
-  //                     Tuple0
-  //
-  ExprTree BuildBodyExprTree(const int64 const_index, const int64 gte_index) {
-    // Build ExprTree for fused instructions.
-    ExprTree gte1 =
-        ExprTree(HloOpcode::kGetTupleElement, "gte",
-                 ExprTree(HloOpcode::kParameter, "gte.fusion_param.param0"));
-    ExprTree fused_root(HloOpcode::kAdd, const_index,
-                        ExprTree(HloOpcode::kParameter), gte_index, gte1);
-
-    // Build fusion instruction (and set fused root).
-    ExprTree fusion(HloOpcode::kFusion, 0,
-                    ExprTree(HloOpcode::kConstant, "loop_increment"), 1,
-                    ExprTree(HloOpcode::kParameter, "param0"));
-    fusion.SetFusedRoot(fused_root);
-
-    // Build top-level computation.
-    ExprTree tuple0(HloOpcode::kTuple, tuple_index_,
-                    ExprTree(HloOpcode::kCopy, fusion));
-    return tuple0;
-  }
-
-  Status MatchExprTree(const ExprTree& expr_tree) override {
-    VLOG(2) << "MATCHING while body";
-    ExprTree::TaggedInstructionMap tagged_instructions;
-    TF_RETURN_IF_ERROR(expr_tree.Match(computation_->root_instruction(),
-                                       &tagged_instructions));
-
-    for (const auto& pair : tagged_instructions) {
-      const auto& tag = pair.first;
-      const auto& inst = pair.second;
-
-      if (tag == "gte" && inst->tuple_index() != tuple_index_) {
-        // Check that the matched GTE instruction is at the 'tuple_index' we
-        // matched in the while condition computation.
-        return InvalidArgument("Unexpected tuple index instruction : %s",
-                               inst->name().c_str());
-      } else if (tag == "loop_increment") {
-        // ParseHloString the constant which represents the loop induction
-        // variable increment value.
-        TF_RETURN_IF_ERROR(ParseConstInteger(inst, &loop_increment_));
-      } else if (tag == "param0" &&
-                 inst != computation_->parameter_instruction(0)) {
-        // Check that the matched parameter == parameter 0 from 'computation_'.
-        return InvalidArgument("Unexpected Parameter0 instruction : %s",
-                               inst->name().c_str());
-      } else if (tag == "gte.fusion_param.param0") {
-        // Fusion parameter: lookup and compare with associated fusion operand.
-        CHECK_EQ(HloOpcode::kParameter, inst->opcode());
-        CHECK(inst->IsFused());
-        if (inst->parent()->FusionInstruction()->operand(
-                inst->parameter_number()) !=
-            computation_->parameter_instruction(0)) {
-          return InvalidArgument("Could not match fusion param: %s",
-                                 inst->name().c_str());
-        }
-      }
-    }
-    return Status::OK();
-  }
-
-  const HloComputation* computation_;
-  const int64 tuple_index_;
-
-  int64 loop_increment_ = -1;
-
-  TF_DISALLOW_COPY_AND_ASSIGN(WhileBodyComputationMatcher);
-};
-
-}  // namespace
-
-StatusOr<std::tuple<int64, int64, int64>> CanTransformWhileToFor(
-    const HloInstruction* while_hlo) {
-  if (while_hlo->opcode() != HloOpcode::kWhile) {
-    return InvalidArgument("Expected While instruction.");
-  }
-
-  WhileConditionComputationMatcher cond_matcher(while_hlo->while_condition());
-  TF_RETURN_IF_ERROR(cond_matcher.Run());
-
-  WhileInitOperandMatcher init_matcher(while_hlo, cond_matcher.tuple_index());
-  TF_RETURN_IF_ERROR(init_matcher.Run());
-
-  WhileBodyComputationMatcher body_matcher(while_hlo->while_body(),
-                                           cond_matcher.tuple_index());
-  TF_RETURN_IF_ERROR(body_matcher.Run());
-
-  // Check for valid For loop parameters.
-  if (init_matcher.loop_start() >= cond_matcher.loop_limit()) {
-    return InvalidArgument("Loop start must be less than loop limit.");
-  }
-  if (body_matcher.loop_increment() <= 0) {
-    return InvalidArgument("Loop increment must greater than zero.");
-  }
-  return std::make_tuple(init_matcher.loop_start(), cond_matcher.loop_limit(),
-                         body_matcher.loop_increment());
-}
-
-}  // namespace gpu
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/while_transformer.h b/tensorflow/compiler/xla/service/gpu/while_transformer.h
deleted file mode 100644
index fe3a954..0000000
--- a/tensorflow/compiler/xla/service/gpu/while_transformer.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_WHILE_TRANSFORMER_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_WHILE_TRANSFORMER_H_
-
-#include "tensorflow/compiler/xla/service/hlo_instruction.h"
-#include "tensorflow/compiler/xla/statusor.h"
-
-namespace xla {
-namespace gpu {
-
-// Runs an analysis of the while loop instruction 'while_hlo' (and its
-// associated sub-computations) to determine if it can be transformed into an
-// equivalent "for" loop with the following "for" loop parameters:
-//
-// *) 'loop_start': loop induction variable starting value.
-// *) 'loop_limit': loop induction variable limit value.
-// *) 'loop_increment': loop induction variable per-iteration increment value.
-//
-// Returns an std::tuple = (loop_start, loop_limit, loop_increment) on success.
-// The values in the returned tuple are values extracted from the 'while_hlo'
-// operand (and its sub-computations) during analysis.
-// Returns an error status on failure.
-StatusOr<std::tuple<int64, int64, int64>> CanTransformWhileToFor(
-    const HloInstruction* while_hlo);
-
-}  // namespace gpu
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_WHILE_TRANSFORMER_H_
diff --git a/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc b/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc
index dbc8442..c5f3906 100644
--- a/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc
@@ -13,11 +13,10 @@
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/service/gpu/while_transformer.h"
-
 #include "tensorflow/compiler/xla/service/copy_insertion.h"
 #include "tensorflow/compiler/xla/service/gpu/instruction_fusion.h"
 #include "tensorflow/compiler/xla/service/hlo_verifier.h"
+#include "tensorflow/compiler/xla/service/while_loop_analysis.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
@@ -110,12 +109,12 @@
 
   void RunFusionPasses() {
     // Run standard fusion passes.
-    EXPECT_TRUE(gpu::GpuInstructionFusion(/*may_duplicate=*/false)
-                    .Run(module_.get())
-                    .ValueOrDie());
-    EXPECT_TRUE(gpu::GpuInstructionFusion(/*may_duplicate=*/true)
-                    .Run(module_.get())
-                    .ValueOrDie());
+    TF_ASSERT_OK(gpu::GpuInstructionFusion(/*may_duplicate=*/false)
+                     .Run(module_.get())
+                     .status());
+    TF_ASSERT_OK(gpu::GpuInstructionFusion(/*may_duplicate=*/true)
+                     .Run(module_.get())
+                     .status());
   }
 
   void RunCopyInsertionPass() {
@@ -141,10 +140,7 @@
   Shape condition_result_shape_;
 };
 
-// TODO(b/68830972): The while transformer is far too fragile. It patterns
-// matches the exact expressions of opcodes. Re-enable when transformation is
-// more general
-TEST_F(WhileTransformerTest, DISABLED_InductionVariableAtTupleElement0) {
+TEST_F(WhileTransformerTest, InductionVariableAtTupleElement0) {
   // Build computation with induction variable at tuple element 0.
   auto condition =
       module_->AddEmbeddedComputation(BuildConditionComputation(0, 10));
@@ -153,18 +149,13 @@
   // Run HLO Optimization passes.
   RunFusionPasses();
   RunCopyInsertionPass();
-  // Run WhileTransformer.
-  auto result = gpu::CanTransformWhileToFor(while_hlo);
-  TF_ASSERT_OK(result.status());
-  // Check results.
-  EXPECT_THAT(result.ConsumeValueOrDie(),
-              Eq(std::tuple<int64, int64, int64>(0, 10, 1)));
+
+  auto result = ComputeWhileLoopTripCount(while_hlo);
+  ASSERT_TRUE(result);
+  EXPECT_EQ(10, *result);
 }
 
-// TODO(b/68830972): The while transformer is far too fragile. It patterns
-// matches the exact expressions of opcodes. Re-enable when transformation is
-// more general
-TEST_F(WhileTransformerTest, DISABLED_InductionVariableAtTupleElement1) {
+TEST_F(WhileTransformerTest, InductionVariableAtTupleElement1) {
   // Build computation with induction variable at tuple element 1.
   auto condition =
       module_->AddEmbeddedComputation(BuildConditionComputation(1, 10));
@@ -173,19 +164,14 @@
   // Run HLO Optimization passes.
   RunFusionPasses();
   RunCopyInsertionPass();
-  // Run WhileTransformer.
-  auto result = gpu::CanTransformWhileToFor(while_hlo);
-  TF_ASSERT_OK(result.status());
-  // Check results.
-  EXPECT_THAT(result.ConsumeValueOrDie(),
-              Eq(std::tuple<int64, int64, int64>(0, 10, 1)));
+
+  auto result = ComputeWhileLoopTripCount(while_hlo);
+  ASSERT_TRUE(result);
+  EXPECT_EQ(10, *result);
 }
 
-// TODO(b/68830972): The while transformer is far too fragile. It patterns
-// matches the exact expressions of opcodes. Re-enable when transformation is
-// more general
-TEST_F(WhileTransformerTest, DISABLED_InvalidLoopLimit) {
-  // Build computation with invalid loop limit.
+TEST_F(WhileTransformerTest, ImpossibleLoopLimit) {
+  // Build computation with an impossible loop limit.
   auto condition =
       module_->AddEmbeddedComputation(BuildConditionComputation(0, 5));
   auto body = module_->AddEmbeddedComputation(BuildBodyComputation(0, 1, 1));
@@ -193,17 +179,13 @@
   // Run HLO Optimization passes.
   RunFusionPasses();
   RunCopyInsertionPass();
-  // Run WhileTransformer.
-  auto result = gpu::CanTransformWhileToFor(while_hlo);
-  ASSERT_FALSE(result.ok());
-  EXPECT_THAT(result.status().error_message(),
-              HasSubstr("Loop start must be less than loop limit."));
+
+  auto result = ComputeWhileLoopTripCount(while_hlo);
+  ASSERT_TRUE(result);
+  EXPECT_EQ(0, *result);
 }
 
-// TODO(b/68830972): The while transformer is far too fragile. It patterns
-// matches the exact expressions of opcodes. Re-enable when transformation is
-// more general
-TEST_F(WhileTransformerTest, DISABLED_InvalidLoopIncrement) {
+TEST_F(WhileTransformerTest, InvalidLoopIncrement) {
   // Build computation with invalid loop increment.
   auto condition =
       module_->AddEmbeddedComputation(BuildConditionComputation(0, 10));
@@ -212,11 +194,9 @@
   // Run HLO Optimization passes.
   RunFusionPasses();
   RunCopyInsertionPass();
-  // Run WhileTransformer.
-  auto result = gpu::CanTransformWhileToFor(while_hlo);
-  ASSERT_FALSE(result.ok());
-  EXPECT_THAT(result.status().error_message(),
-              HasSubstr("Loop increment must greater than zero."));
+
+  auto result = ComputeWhileLoopTripCount(while_hlo);
+  ASSERT_FALSE(result);
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/service/hlo.proto b/tensorflow/compiler/xla/service/hlo.proto
index 0b93d97..fa21865 100644
--- a/tensorflow/compiler/xla/service/hlo.proto
+++ b/tensorflow/compiler/xla/service/hlo.proto
@@ -34,6 +34,7 @@
 option cc_enable_arenas = true;
 
 // Serialization of HloInstruction.
+// Next ID: 51
 message HloInstructionProto {
   reserved 10;
   reserved "parameter_name";
@@ -74,6 +75,11 @@
   // Describes the dimension numbers used for a convolution.
   xla.ConvolutionDimensionNumbers convolution_dimension_numbers = 16;
 
+  // The number of feature groups. Used for a convolution. Must be a divisor of
+  // the input feature dimension and output feature dimension. If not specified,
+  // it will use a default value of 1.
+  int64 feature_group_count = 50;
+
   // Describes the [begin, end) index range and stride for slices.
   message SliceDimensions {
     int64 start = 1;
@@ -133,7 +139,7 @@
 
   // Gather dimension numbers.
   xla.GatherDimensionNumbers gather_dimension_numbers = 33;
-  repeated int64 gather_window_bounds = 34;
+  repeated int64 gather_slice_sizes = 34;
 
   // Compute Host.
   string channel_name = 41;
@@ -151,8 +157,11 @@
   // Backend configuration for the instruction. Has backend-specific meaning.
   string backend_config = 43;
 
-  // Cross Replica Sum fields.
+  // Cross replica op fields.
+  // TODO(b/112107579): remove replica_group_ids field and always use
+  // replica_groups.
   repeated int64 replica_group_ids = 44;
+  repeated ReplicaGroup replica_groups = 49;
   int64 all_reduce_id = 45;
   string cross_replica_sum_barrier = 46;
 
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
index a2cefd2..1bbb0ff 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
@@ -543,6 +543,19 @@
   return Status::OK();
 }
 
+Status HloCostAnalysis::HandleAllToAll(const HloInstruction* hlo) {
+  // TODO(b/110096724): Compute correct cost here.
+  double flops = 0.0;
+  ShapeUtil::ForEachSubshape(hlo->shape(),
+                             [&](const Shape& subshape, const ShapeIndex&) {
+                               if (ShapeUtil::IsArray(subshape)) {
+                                 flops += ShapeUtil::ElementsIn(subshape);
+                               }
+                             });
+  current_properties_[kFlopsKey] = flops;
+  return Status::OK();
+}
+
 Status HloCostAnalysis::HandleRng(const HloInstruction* random) {
   // TODO(b/26346211): Implement better estimates for the RNG cost, since the
   // cost changes with the implementation and the distribution. For now, assume
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.h b/tensorflow/compiler/xla/service/hlo_cost_analysis.h
index 0a79c92..193a04b 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.h
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.h
@@ -71,6 +71,7 @@
   Status HandleConvolution(const HloInstruction* convolution) override;
   Status HandleFft(const HloInstruction* fft) override;
   Status HandleCrossReplicaSum(const HloInstruction* crs) override;
+  Status HandleAllToAll(const HloInstruction* hlo) override;
   Status HandleInfeed(const HloInstruction* infeed) override;
   Status HandleOutfeed(const HloInstruction* outfeed) override;
   Status HandleHostCompute(const HloInstruction* host_compute) override;
diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils.cc b/tensorflow/compiler/xla/service/hlo_creation_utils.cc
index 90d2be1..858992a 100644
--- a/tensorflow/compiler/xla/service/hlo_creation_utils.cc
+++ b/tensorflow/compiler/xla/service/hlo_creation_utils.cc
@@ -174,6 +174,29 @@
       HloInstruction::CreateDot(dot_shape, lhs, rhs, dim_numbers));
 }
 
+StatusOr<HloInstruction*> MakeMapHlo(
+    tensorflow::gtl::ArraySlice<HloInstruction*> operands,
+    HloComputation* map_computation) {
+  CHECK(!operands.empty()) << "Map Hlo requires at least one operand.";
+  HloComputation* computation = operands.front()->parent();
+  std::vector<const Shape*> operand_shapes;
+  int64 max_operand_rank = 0;
+  for (const HloInstruction* operand : operands) {
+    CHECK_EQ(computation, operand->parent());
+    operand_shapes.push_back(&operand->shape());
+    max_operand_rank =
+        std::max(max_operand_rank, ShapeUtil::Rank(operand->shape()));
+  }
+  std::vector<int64> map_dims(max_operand_rank);
+  std::iota(map_dims.begin(), map_dims.end(), 0);
+  TF_ASSIGN_OR_RETURN(
+      Shape map_shape,
+      ShapeInference::InferMapShape(
+          operand_shapes, map_computation->ComputeProgramShape(), map_dims));
+  return computation->AddInstruction(
+      HloInstruction::CreateMap(map_shape, operands, map_computation));
+}
+
 StatusOr<HloInstruction*> CollapseFirstNDims(HloInstruction* operand, int64 n) {
   CHECK_GT(n, 0);
 
@@ -251,6 +274,38 @@
   return MakeReshapeHlo(output_shape, operand);
 }
 
+StatusOr<HloInstruction*> InsertDegenerateDims(
+    HloInstruction* operand, ArraySlice<int64> dims_to_insert) {
+  CHECK(c_is_sorted(dims_to_insert));
+
+  const Shape& operand_shape = operand->shape();
+  int64 output_shape_rank =
+      operand_shape.dimensions_size() + dims_to_insert.size();
+  for (auto dim_to_insert : dims_to_insert) {
+    CHECK_LT(dim_to_insert, output_shape_rank);
+  }
+
+  std::vector<int64> output_shape_dim_bounds;
+  output_shape_dim_bounds.reserve(output_shape_rank);
+  int64 operand_dims_idx = 0;
+  int64 dims_to_insert_idx = 0;
+  for (int64 i = 0; i < output_shape_rank; ++i) {
+    if (dims_to_insert_idx < dims_to_insert.size() &&
+        i == dims_to_insert[dims_to_insert_idx]) {
+      output_shape_dim_bounds.push_back(1);
+      ++dims_to_insert_idx;
+    } else {
+      output_shape_dim_bounds.push_back(
+          operand_shape.dimensions(operand_dims_idx));
+      ++operand_dims_idx;
+    }
+  }
+
+  Shape output_shape = ShapeUtil::MakeShape(operand_shape.element_type(),
+                                            output_shape_dim_bounds);
+  return MakeReshapeHlo(output_shape, operand);
+}
+
 StatusOr<HloInstruction*> PadVectorWithZeros(HloInstruction* operand,
                                              int64 zeros_to_prepend,
                                              int64 zeros_to_append) {
diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils.h b/tensorflow/compiler/xla/service/hlo_creation_utils.h
index 49b1402..5ff8946 100644
--- a/tensorflow/compiler/xla/service/hlo_creation_utils.h
+++ b/tensorflow/compiler/xla/service/hlo_creation_utils.h
@@ -102,6 +102,12 @@
 StatusOr<HloInstruction*> MakeDotHlo(HloInstruction* lhs, HloInstruction* rhs,
                                      const DotDimensionNumbers& dim_numbers);
 
+// Creates a Map HLO instruction and adds it to the computation containing the
+// operands. All operands must be in the same computation.
+StatusOr<HloInstruction*> MakeMapHlo(
+    tensorflow::gtl::ArraySlice<HloInstruction*> operands,
+    HloComputation* map_computation);
+
 // -----------------------------------------------------------------------------
 // Some other miscellaneous helpers to generate common HLO patterns.  All of
 // these add all the instructions they generate into the computation containing
@@ -144,6 +150,16 @@
 StatusOr<HloInstruction*> ElideDegenerateDims(
     HloInstruction* operand, tensorflow::gtl::ArraySlice<int64> dims_to_elide);
 
+// Inserts (via reshape) a set of degenerate dimensions (dimensions containing
+// exactly one element), `dims_to_insert` into `operand`. The dimensions in
+// `dims_to_insert` refer to the dimensions in the result, and hence should be
+// less than the rank of the result. Also, `dims_to_insert` must be sorted.
+//
+// For example, if `operand` is of shape f32[12,21,8,34] and dims_to_insert is
+// {0, 2}, then the result is `operand` reshaped to [1,12,1,21,8,34].
+StatusOr<HloInstruction*> InsertDegenerateDims(
+    HloInstruction* operand, tensorflow::gtl::ArraySlice<int64> dims_to_insert);
+
 // Pads `operand` (which must have rank 1) with `zeros_to_prepend` zeros in the
 // front and `zeros_to_append` zeros in the back.
 StatusOr<HloInstruction*> PadVectorWithZeros(HloInstruction* operand,
diff --git a/tensorflow/compiler/xla/service/hlo_domain_test.cc b/tensorflow/compiler/xla/service/hlo_domain_test.cc
index ffc18a0..70271be 100644
--- a/tensorflow/compiler/xla/service/hlo_domain_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_domain_test.cc
@@ -490,5 +490,38 @@
   ASSERT_TRUE(ParseModule(hlo_string).status().ok());
 }
 
+TEST_F(HloDomainTest, DomainTuple) {
+  const char* const hlo_string = R"(
+HloModule Module
+
+ENTRY entry {
+  p0 = f32[4] parameter(0), sharding={maximal device=0}
+  cst = u32[] constant(0), sharding={maximal device=1}
+  tpl = (u32[], f32[4]) tuple(cst, p0), sharding={{maximal device=1}, {maximal device=0}}
+  ROOT gte = f32[4] get-tuple-element(tpl), index=1, sharding={maximal device=0}
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string));
+
+  HloDomainIsolator isolator(CreateShardingDomain);
+  TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module));
+  EXPECT_TRUE(isolator_changed);
+
+  // Clear sharding of tpl instruction, in order to test domain sharding
+  // application.
+  auto tpl = FindInstruction(module, "tpl");
+  tpl->clear_sharding();
+
+  HloDomainRemover remover(ShardingMetadata::KindName(),
+                           ShardingMetadata::NormalizeShardingDomain);
+  TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module));
+  EXPECT_TRUE(remover_changed);
+
+  EXPECT_EQ(HloSharding::Tuple(tpl->shape(), {HloSharding::AssignDevice(1),
+                                              HloSharding::AssignDevice(0)}),
+            tpl->sharding());
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_element_type_converter.cc b/tensorflow/compiler/xla/service/hlo_element_type_converter.cc
index c804f43..b9244b8 100644
--- a/tensorflow/compiler/xla/service/hlo_element_type_converter.cc
+++ b/tensorflow/compiler/xla/service/hlo_element_type_converter.cc
@@ -144,6 +144,7 @@
           opcode == HloOpcode::kCrossReplicaSum ||
           opcode == HloOpcode::kFusion || opcode == HloOpcode::kMap ||
           opcode == HloOpcode::kReduce || opcode == HloOpcode::kReduceWindow ||
+          opcode == HloOpcode::kScatter ||
           opcode == HloOpcode::kSelectAndScatter ||
           opcode == HloOpcode::kConditional) {
         continue;
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc
index 51353ee..36d6a2e 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc
@@ -555,43 +555,39 @@
   return Status::OK();
 }
 
-// Returns an ShapeUtil::IndexIterationSpace that iterates over the output
-// gather dimensions while keeping the rest of the output dimensions clamped to
-// 0.
-ShapeUtil::IndexIterationSpace IterationSpaceForOutputGatherIndices(
+// Returns a ShapeUtil::IndexIterationSpace that iterates over the output batch
+// dimensions while keeping the rest of the output dimensions clamped to 0.
+ShapeUtil::IndexIterationSpace IterationSpaceForOutputBatchIndices(
     const Shape& output_shape, const GatherDimensionNumbers& dim_numbers) {
   int64 output_rank = output_shape.dimensions_size();
   std::vector<int64> index_base(output_rank, 0);
   std::vector<int64> index_count;
   index_count.reserve(output_rank);
   for (int64 i = 0; i < output_rank; i++) {
-    bool is_output_gather_dim =
-        !c_binary_search(dim_numbers.output_window_dims(), i);
-    index_count.push_back(is_output_gather_dim ? output_shape.dimensions(i)
-                                               : 1);
+    bool is_output_batch_dim = !c_binary_search(dim_numbers.offset_dims(), i);
+    index_count.push_back(is_output_batch_dim ? output_shape.dimensions(i) : 1);
   }
 
   return {std::move(index_base), std::move(index_count),
           std::vector<int64>(output_rank, 1)};
 }
 
-// Return an ShapeUtil::IndexIterationSpace that iterates over the output window
+// Returns a ShapeUtil::IndexIterationSpace that iterates over the output slice
 // dimensions while keeping the rest of the output dimensions clamped to 0.
-ShapeUtil::IndexIterationSpace IterationSpaceForOutputWindowIndices(
-    int64 output_rank, ArraySlice<int64> window_bounds,
+ShapeUtil::IndexIterationSpace IterationSpaceForOutputOffsetIndices(
+    int64 output_rank, ArraySlice<int64> slice_sizes,
     const GatherDimensionNumbers& dim_numbers) {
   std::vector<int64> index_base(output_rank, 0);
   std::vector<int64> index_count(output_rank, 1);
-  int64 window_bounds_idx = 0;
+  int64 slice_sizes_idx = 0;
   for (int64 i = 0; i < output_rank; i++) {
-    bool is_output_window_dim =
-        c_binary_search(dim_numbers.output_window_dims(), i);
+    bool is_output_window_dim = c_binary_search(dim_numbers.offset_dims(), i);
     if (is_output_window_dim) {
-      while (c_binary_search(dim_numbers.elided_window_dims(),
-                             window_bounds_idx)) {
-        window_bounds_idx++;
+      while (c_binary_search(dim_numbers.collapsed_slice_dims(),
+                             slice_sizes_idx)) {
+        slice_sizes_idx++;
       }
-      index_count[i] = window_bounds[window_bounds_idx++];
+      index_count[i] = slice_sizes[slice_sizes_idx++];
     }
   }
 
@@ -599,30 +595,30 @@
           std::vector<int64>(output_rank, 1)};
 }
 
-// This functor computes the contribution of gather_indices to an input index
+// This functor computes the contribution of start_indices to an input index
 // corresponding to an output index.  That is, given an output index I, it picks
-// out the gather output indices in I and uses them to look up a gather index,
-// G, from the gather indices tensor, and expands G into the input space
-// according to gather_dims_to_operand_dims.
-class OutputGatherIndexToInputIndex {
+// out the batch indices in I and uses them to look up a starting index, G, from
+// the start indices tensor, and expands G into the input space according to
+// start_index_map.
+class OutputBatchIndexToInputIndex {
  public:
   // The constructor does some setup work that is amortized across all
   // iterations.
-  explicit OutputGatherIndexToInputIndex(
+  explicit OutputBatchIndexToInputIndex(
       const GatherDimensionNumbers* dim_numbers, const Shape& input_shape,
-      const Shape& output_shape, const Literal* gather_indices)
-      : dim_numbers_(*dim_numbers), gather_indices_(*gather_indices) {
+      const Shape& output_shape, const Literal* start_indices)
+      : dim_numbers_(*dim_numbers), start_indices_(*start_indices) {
     for (int64 i = 0; i < output_shape.dimensions_size(); i++) {
-      output_dim_is_gather_dims_.push_back(
-          !c_binary_search(dim_numbers_.output_window_dims(), i));
+      output_dim_is_batch_dims_.push_back(
+          !c_binary_search(dim_numbers_.offset_dims(), i));
     }
 
     for (int64 i = 0; i < input_shape.dimensions_size(); i++) {
       int64 index_of_input_dim_in_index_vector =
-          std::distance(dim_numbers_.gather_dims_to_operand_dims().begin(),
-                        c_find(dim_numbers_.gather_dims_to_operand_dims(), i));
+          std::distance(dim_numbers_.start_index_map().begin(),
+                        c_find(dim_numbers_.start_index_map(), i));
       if (index_of_input_dim_in_index_vector ==
-          dim_numbers_.gather_dims_to_operand_dims_size()) {
+          dim_numbers_.start_index_map_size()) {
         input_dim_value_to_index_vector_.push_back(-1);
       } else {
         input_dim_value_to_index_vector_.push_back(
@@ -630,14 +626,14 @@
       }
     }
 
-    index_vector_index_.resize(gather_indices_.shape().dimensions_size());
+    index_vector_index_.resize(start_indices_.shape().dimensions_size());
     input_index_.resize(input_shape.dimensions_size());
     int64 index_vector_size =
-        gather_indices_.shape().dimensions(dim_numbers_.index_vector_dim());
+        start_indices_.shape().dimensions(dim_numbers_.index_vector_dim());
     index_vector_.resize(index_vector_size);
   }
 
-  // Returns the contribution of gather_indices to the input index corresponding
+  // Returns the contribution of start_indices to the input index corresponding
   // to output_index.  See gather_inner_loop_body.
   //
   // This is conceptually  a stateless transformation from output_index to the
@@ -659,7 +655,7 @@
   }
 
  private:
-  // Propagates the gather index dimensions from the output index into
+  // Propagates the batch dimensions from the output index into
   // index_vector_index_ by mutating index_vector_index_ in place.  Does not
   // update the dim_numbers.index_vector_dim() dimension -- that's the dimension
   // we iterate over in FetchIndexVector.
@@ -667,7 +663,7 @@
       ArraySlice<int64> output_index) {
     int64 index_vector_index_i = 0;
     for (int64 i = 0, e = output_index.size(); i < e; i++) {
-      if (!output_dim_is_gather_dims_[i]) {
+      if (!output_dim_is_batch_dims_[i]) {
         continue;
       }
 
@@ -679,14 +675,14 @@
     }
   }
 
-  // Populates index_vector_ by iterating over gather_indices_ according to
+  // Populates index_vector_ by iterating over start_indices_ according to
   // index_vector_index_.
   Status FetchIndexVector() {
     int64 index_vector_dim = dim_numbers_.index_vector_dim();
     for (int64 i = 0, e = index_vector_.size(); i < e; i++) {
       index_vector_index_[index_vector_dim] = i;
-      TF_ASSIGN_OR_RETURN(index_vector_[i], gather_indices_.GetIntegralAsS64(
-                                                index_vector_index_));
+      TF_ASSIGN_OR_RETURN(index_vector_[i],
+                          start_indices_.GetIntegralAsS64(index_vector_index_));
     }
     return Status::OK();
   }
@@ -708,15 +704,15 @@
   // PropagateIndexVectorToInputIndex.
   std::vector<int64> input_dim_value_to_index_vector_;
 
-  // output_dim_is_gather_dims_[i] is true iff the output index i is a gather
+  // output_dim_is_batch_dims_[i] is true iff the output index i is a batch
   // dimension.
-  std::vector<bool> output_dim_is_gather_dims_;
+  std::vector<bool> output_dim_is_batch_dims_;
 
-  // The buffer into which we construct an index into gather_indices_ to fetch
+  // The buffer into which we construct an index into start_indices_ to fetch
   // the index vector.
   std::vector<int64> index_vector_index_;
 
-  // The index vector fetched from gather_indices_.
+  // The index vector fetched from start_indices_.
   std::vector<int64> index_vector_;
 
   // The result computed by this functor.  operator() returns an ArraySlice into
@@ -724,24 +720,23 @@
   std::vector<int64> input_index_;
 
   const GatherDimensionNumbers& dim_numbers_;
-  const Literal& gather_indices_;
+  const Literal& start_indices_;
 };
 
-// This functor computes the contribution of the window indices in an output
+// This functor computes the contribution of the offset indices in an output
 // index to an input index.  That is, given an output index I it picks out the
-// output window indices in I and expands it into a window index into the input
-// shape.
-class OutputWindowIndexToInputIndex {
+// output offset indices in I and expands it into an index into the input shape.
+class OutputOffsetIndexToInputIndex {
  public:
   // The constructor does some setup work that is amortized across all
   // iterations.
-  explicit OutputWindowIndexToInputIndex(
+  explicit OutputOffsetIndexToInputIndex(
       const GatherDimensionNumbers& dim_numbers, const Shape& input_shape,
       const Shape& output_shape) {
     std::vector<int64> window_index_to_output_index;
     int64 output_index_count = 0;
     for (int64 i = 0; i < output_shape.dimensions_size(); i++) {
-      if (c_binary_search(dim_numbers.output_window_dims(), i)) {
+      if (c_binary_search(dim_numbers.offset_dims(), i)) {
         window_index_to_output_index.push_back(output_index_count++);
       } else {
         output_index_count++;
@@ -750,7 +745,7 @@
 
     int64 window_dim_count = 0;
     for (int64 i = 0; i < input_shape.dimensions_size(); i++) {
-      if (c_binary_search(dim_numbers.elided_window_dims(), i)) {
+      if (c_binary_search(dim_numbers.collapsed_slice_dims(), i)) {
         input_dim_value_to_output_index_.push_back(-1);
       } else {
         input_dim_value_to_output_index_.push_back(
@@ -808,20 +803,20 @@
 
 // Rehapes the gather indices input to have a trailing degenerate `1` dimension
 // if necessary.  Hands over the ownership of the newly created literal (if
-// there is one) to `reshaped_gather_indices`.
+// there is one) to `reshaped_start_indices`.
 static StatusOr<std::reference_wrapper<const Literal>> ReshapedGatherIndices(
-    int64 index_vector_dim, const Literal& gather_indices,
-    std::unique_ptr<Literal>* reshaped_gather_indices) {
-  if (gather_indices.shape().dimensions_size() != index_vector_dim) {
-    return std::cref(gather_indices);
+    int64 index_vector_dim, const Literal& start_indices,
+    std::unique_ptr<Literal>* reshaped_start_indices) {
+  if (start_indices.shape().dimensions_size() != index_vector_dim) {
+    return std::cref(start_indices);
   }
 
-  std::vector<int64> new_shape(gather_indices.shape().dimensions().begin(),
-                               gather_indices.shape().dimensions().end());
+  std::vector<int64> new_shape(start_indices.shape().dimensions().begin(),
+                               start_indices.shape().dimensions().end());
   new_shape.push_back(1);
-  TF_ASSIGN_OR_RETURN(*reshaped_gather_indices,
-                      gather_indices.Reshape(new_shape));
-  return std::cref(**reshaped_gather_indices);
+  TF_ASSIGN_OR_RETURN(*reshaped_start_indices,
+                      start_indices.Reshape(new_shape));
+  return std::cref(**reshaped_start_indices);
 }
 
 Status HloEvaluator::HandleGather(HloInstruction* gather) {
@@ -830,34 +825,33 @@
   const GatherDimensionNumbers& dim_numbers =
       gather->gather_dimension_numbers();
   const Literal& operand = GetEvaluatedLiteralFor(gather->operand(0));
-  std::unique_ptr<Literal> reshaped_gather_indices;
+  std::unique_ptr<Literal> reshaped_start_indices;
   TF_ASSIGN_OR_RETURN(
-      const Literal& gather_indices,
+      const Literal& start_indices,
       ReshapedGatherIndices(dim_numbers.index_vector_dim(),
                             GetEvaluatedLiteralFor(gather->operand(1)),
-                            &reshaped_gather_indices));
+                            &reshaped_start_indices));
 
   // We iterate over the gather dimensions in the output shape in an outer loop
   // nest, and iterate over the window dimensions in the output shape in an
   // inner loop nest.
 
-  ShapeUtil::IndexIterationSpace gather_indices_iteration_space =
-      IterationSpaceForOutputGatherIndices(shape, dim_numbers);
-  ShapeUtil::IndexIterationSpace window_indices_iteration_space =
-      IterationSpaceForOutputWindowIndices(
-          shape.dimensions_size(), gather->gather_window_bounds(), dim_numbers);
+  ShapeUtil::IndexIterationSpace start_indices_iteration_space =
+      IterationSpaceForOutputBatchIndices(shape, dim_numbers);
+  ShapeUtil::IndexIterationSpace offset_indices_iteration_space =
+      IterationSpaceForOutputOffsetIndices(
+          shape.dimensions_size(), gather->gather_slice_sizes(), dim_numbers);
 
   // Scratch buffers that hold an index in the output shape and the
   // corresponding index in the input shape.
   std::vector<int64> input_index(operand.shape().dimensions_size());
   std::vector<int64> output_index(gather->shape().dimensions_size());
-  std::vector<int64> input_gather_index_clamped(
-      operand.shape().dimensions_size());
+  std::vector<int64> input_index_clamped(operand.shape().dimensions_size());
 
-  OutputGatherIndexToInputIndex output_gather_index_to_input_index(
+  OutputBatchIndexToInputIndex output_batch_index_to_input_index(
       &gather->gather_dimension_numbers(), /*input_shape=*/operand.shape(),
-      /*output_shape=*/shape, &gather_indices);
-  OutputWindowIndexToInputIndex output_window_index_to_input_index(
+      /*output_shape=*/shape, &start_indices);
+  OutputOffsetIndexToInputIndex output_offset_index_to_input_index(
       gather->gather_dimension_numbers(), /*input_shape=*/operand.shape(),
       /*output_shape=*/shape);
 
@@ -869,29 +863,29 @@
           ArraySlice<int64> output_gather_index) -> StatusOr<bool> {
     TF_ASSIGN_OR_RETURN(
         ArraySlice<int64> input_window_index,
-        output_window_index_to_input_index(output_window_index));
+        output_offset_index_to_input_index(output_window_index));
     for (int i = 0, e = output_index.size(); i < e; i++) {
       output_index[i] = output_gather_index[i] + output_window_index[i];
       DCHECK_LT(output_index[i], shape.dimensions(i));
     }
     for (int i = 0, e = input_gather_index.size(); i < e; i++) {
       int64 output_dim =
-          output_window_index_to_input_index.input_dim_value_to_output_index(i);
+          output_offset_index_to_input_index.input_dim_value_to_output_index(i);
       // If 'output_dim' is -1, it means 'i' is an elided window dim. This means
       // we set the iteration index to 0, so for the purpose of the following
       // calculations we can consider the output dimension size to be 1.
       int64 output_dim_size =
           output_dim == -1 ? 1 : shape.dimensions(output_dim);
       // Clamp the gather index so that the gather region fits in the operand.
-      // input_gather_index_clamped[i] = clamp(input_gather_index[i], 0,
+      // input_index_clamped[i] = clamp(input_gather_index[i], 0,
       //                                       operand_shape.dimensions(i) -
       //                                       output_dim_size);
-      input_gather_index_clamped[i] =
+      input_index_clamped[i] =
           std::min(operand_shape.dimensions(i) - output_dim_size,
                    std::max(0LL, input_gather_index[i]));
     }
     for (int i = 0, e = input_index.size(); i < e; i++) {
-      input_index[i] = input_gather_index_clamped[i] + input_window_index[i];
+      input_index[i] = input_index_clamped[i] + input_window_index[i];
       DCHECK_GE(input_index[i], 0);
       DCHECK_LT(input_index[i], operand_shape.dimensions(i));
     }
@@ -902,18 +896,17 @@
 
   auto gather_outer_loop_body =
       [&](ArraySlice<int64> output_gather_index) -> StatusOr<bool> {
-    TF_ASSIGN_OR_RETURN(
-        ArraySlice<int64> input_gather_index,
-        output_gather_index_to_input_index(output_gather_index));
+    TF_ASSIGN_OR_RETURN(ArraySlice<int64> input_gather_index,
+                        output_batch_index_to_input_index(output_gather_index));
     TF_RETURN_IF_ERROR(ShapeUtil::ForEachIndexWithStatus(
-        shape, window_indices_iteration_space,
+        shape, offset_indices_iteration_space,
         std::bind(gather_inner_loop_body, std::placeholders::_1,
                   input_gather_index, output_gather_index)));
     return true;
   };
 
   TF_RETURN_IF_ERROR(ShapeUtil::ForEachIndexWithStatus(
-      shape, gather_indices_iteration_space, gather_outer_loop_body));
+      shape, start_indices_iteration_space, gather_outer_loop_body));
   evaluated_[gather] = std::move(result);
   return Status::OK();
 }
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
index cba7246..1394be6 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
@@ -1826,21 +1826,20 @@
   operand = s32[3,3] parameter(0)
   indices = s32[2] parameter(1)
   ROOT gather = s32[2,3] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=1,
-      window_bounds={1, 3}
+      slice_sizes={1, 3}
 }
 )";
   ParseAndVerifyModule(hlo_text);
   std::unique_ptr<Literal> operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
-  std::unique_ptr<Literal> gather_indices =
-      LiteralUtil::CreateR1<int32>({0, 2});
+  std::unique_ptr<Literal> start_indices = LiteralUtil::CreateR1<int32>({0, 2});
   EXPECT_TRUE(LiteralTestUtil::Equal(
       *LiteralUtil::CreateR2<int32>({{1, 2, 3}, {7, 8, 9}}),
-      *Evaluate({operand.get(), gather_indices.get()})));
+      *Evaluate({operand.get(), start_indices.get()})));
 }
 
 TEST_P(HloEvaluatorTest, EvaluateGather_TensorFlowGatherV2) {
@@ -1851,21 +1850,20 @@
   operand = s32[3,3] parameter(0)
   indices = s32[2] parameter(1)
   ROOT gather = s32[3,2] gather(operand, indices),
-      output_window_dims={0},
-      elided_window_dims={1},
-      gather_dims_to_operand_dims={1},
+      offset_dims={0},
+      collapsed_slice_dims={1},
+      start_index_map={1},
       index_vector_dim=1,
-      window_bounds={3, 1}
+      slice_sizes={3, 1}
 }
 )";
   ParseAndVerifyModule(hlo_text);
   std::unique_ptr<Literal> operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
-  std::unique_ptr<Literal> gather_indices =
-      LiteralUtil::CreateR1<int32>({0, 2});
+  std::unique_ptr<Literal> start_indices = LiteralUtil::CreateR1<int32>({0, 2});
   EXPECT_TRUE(LiteralTestUtil::Equal(
       *LiteralUtil::CreateR2<int32>({{1, 3}, {4, 6}, {7, 9}}),
-      *Evaluate({operand.get(), gather_indices.get()})));
+      *Evaluate({operand.get(), start_indices.get()})));
 }
 
 TEST_P(HloEvaluatorTest, EvaluateGather_TensorFlowGatherMultipleBatchDims) {
@@ -1876,22 +1874,22 @@
   operand = s32[3,3] parameter(0)
   indices = s32[2,2] parameter(1)
   ROOT gather = s32[2,3,2] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={1},
-      gather_dims_to_operand_dims={1},
+      offset_dims={1},
+      collapsed_slice_dims={1},
+      start_index_map={1},
       index_vector_dim=2,
-      window_bounds={3, 1}
+      slice_sizes={3, 1}
 }
 )";
   ParseAndVerifyModule(hlo_text);
   std::unique_ptr<Literal> operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
-  std::unique_ptr<Literal> gather_indices =
+  std::unique_ptr<Literal> start_indices =
       LiteralUtil::CreateR2<int32>({{0, 2}, {2, 1}});
   EXPECT_TRUE(LiteralTestUtil::Equal(
       *LiteralUtil::CreateR3<int32>(
           {{{1, 3}, {4, 6}, {7, 9}}, {{3, 2}, {6, 5}, {9, 8}}}),
-      *Evaluate({operand.get(), gather_indices.get()})));
+      *Evaluate({operand.get(), start_indices.get()})));
 }
 
 TEST_P(HloEvaluatorTest, EvaluateGather_TensorFlowGatherNd) {
@@ -1902,11 +1900,11 @@
   operand = s32[3,3,2] parameter(0)
   indices = s32[2,2] parameter(1)
   ROOT gather = s32[2,2] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0,1},
-      gather_dims_to_operand_dims={0,1},
+      offset_dims={1},
+      collapsed_slice_dims={0,1},
+      start_index_map={0,1},
       index_vector_dim=1,
-      window_bounds={1,1,2}
+      slice_sizes={1,1,2}
 }
 )";
   ParseAndVerifyModule(hlo_text);
@@ -1914,11 +1912,11 @@
       LiteralUtil::CreateR3<int32>({{{-1, 1}, {-2, 2}, {-3, 3}},  //
                                     {{-4, 4}, {-5, 5}, {-6, 6}},  //
                                     {{-7, 7}, {-8, 8}, {-9, 9}}});
-  std::unique_ptr<Literal> gather_indices =
+  std::unique_ptr<Literal> start_indices =
       LiteralUtil::CreateR2<int32>({{0, 0}, {1, 0}});
   EXPECT_TRUE(
       LiteralTestUtil::Equal(*LiteralUtil::CreateR2<int32>({{-1, 1}, {-4, 4}}),
-                             *Evaluate({operand.get(), gather_indices.get()})));
+                             *Evaluate({operand.get(), start_indices.get()})));
 }
 
 TEST_P(HloEvaluatorTest,
@@ -1930,11 +1928,11 @@
   operand = s32[3,3,2] parameter(0)
   indices = s32[2,2] parameter(1)
   ROOT gather = s32[2,2] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0,1},
-      gather_dims_to_operand_dims={0,1},
+      offset_dims={1},
+      collapsed_slice_dims={0,1},
+      start_index_map={0,1},
       index_vector_dim=0,
-      window_bounds={1,1,2}
+      slice_sizes={1,1,2}
 }
 )";
   ParseAndVerifyModule(hlo_text);
@@ -1942,11 +1940,11 @@
       LiteralUtil::CreateR3<int32>({{{-1, 1}, {-2, 2}, {-3, 3}},  //
                                     {{-4, 4}, {-5, 5}, {-6, 6}},  //
                                     {{-7, 7}, {-8, 8}, {-9, 9}}});
-  std::unique_ptr<Literal> gather_indices =
+  std::unique_ptr<Literal> start_indices =
       LiteralUtil::CreateR2<int32>({{0, 0}, {1, 0}});
   EXPECT_TRUE(
       LiteralTestUtil::Equal(*LiteralUtil::CreateR2<int32>({{-2, 2}, {-1, 1}}),
-                             *Evaluate({operand.get(), gather_indices.get()})));
+                             *Evaluate({operand.get(), start_indices.get()})));
 }
 
 TEST_P(HloEvaluatorTest, EvaluateGather_DynamicSlice) {
@@ -1957,21 +1955,20 @@
   operand = s32[3,3] parameter(0)
   indices = s32[2] parameter(1)
   ROOT gather = s32[1,1] gather(operand, indices),
-      output_window_dims={0,1},
-      elided_window_dims={},
-      gather_dims_to_operand_dims={0,1},
+      offset_dims={0,1},
+      collapsed_slice_dims={},
+      start_index_map={0,1},
       index_vector_dim=0,
-      window_bounds={1,1}
+      slice_sizes={1,1}
 }
 )";
   ParseAndVerifyModule(hlo_text);
   std::unique_ptr<Literal> operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
-  std::unique_ptr<Literal> gather_indices =
-      LiteralUtil::CreateR1<int32>({1, 1});
+  std::unique_ptr<Literal> start_indices = LiteralUtil::CreateR1<int32>({1, 1});
   EXPECT_TRUE(
       LiteralTestUtil::Equal(*LiteralUtil::CreateR2<int32>({{5}}),
-                             *Evaluate({operand.get(), gather_indices.get()})));
+                             *Evaluate({operand.get(), start_indices.get()})));
 }
 
 TEST_P(HloEvaluatorTest, EvaluateGather_BatchDynamicSlice) {
@@ -1982,21 +1979,21 @@
   operand = s32[3,3] parameter(0)
   indices = s32[2,2] parameter(1)
   ROOT gather = s32[2,1,1] gather(operand, indices),
-      output_window_dims={1,2},
-      elided_window_dims={},
-      gather_dims_to_operand_dims={0,1},
+      offset_dims={1,2},
+      collapsed_slice_dims={},
+      start_index_map={0,1},
       index_vector_dim=0,
-      window_bounds={1,1}
+      slice_sizes={1,1}
 }
 )";
   ParseAndVerifyModule(hlo_text);
   std::unique_ptr<Literal> operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
-  std::unique_ptr<Literal> gather_indices =
+  std::unique_ptr<Literal> start_indices =
       LiteralUtil::CreateR2<int32>({{2, 1}, {1, 1}});
   EXPECT_TRUE(
       LiteralTestUtil::Equal(*LiteralUtil::CreateR3<int32>({{{8}}, {{5}}}),
-                             *Evaluate({operand.get(), gather_indices.get()})));
+                             *Evaluate({operand.get(), start_indices.get()})));
 }
 
 TEST_P(HloEvaluatorTest, EvaluateGather_ZeroDimBounds) {
@@ -2007,20 +2004,19 @@
   operand = s32[3,0] parameter(0)
   indices = s32[2] parameter(1)
   ROOT gather = s32[2,0] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=1,
-      window_bounds={1, 0}
+      slice_sizes={1, 0}
 }
 )";
   ParseAndVerifyModule(hlo_text);
   std::unique_ptr<Literal> operand = LiteralUtil::CreateR2<int32>({{}, {}, {}});
-  std::unique_ptr<Literal> gather_indices =
-      LiteralUtil::CreateR1<int32>({0, 2});
+  std::unique_ptr<Literal> start_indices = LiteralUtil::CreateR1<int32>({0, 2});
   EXPECT_TRUE(
       LiteralTestUtil::Equal(*LiteralUtil::CreateR2<int32>({{}, {}}),
-                             *Evaluate({operand.get(), gather_indices.get()})));
+                             *Evaluate({operand.get(), start_indices.get()})));
 }
 
 TEST_P(HloEvaluatorTest, EvaluateGather_NoOutputWindowDims) {
@@ -2031,21 +2027,474 @@
   operand = s32[3] parameter(0)
   indices = s32[2,2,1] parameter(1)
   ROOT gather = s32[2,2] gather(operand, indices),
-      output_window_dims={},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=2,
-      window_bounds={1}
+      slice_sizes={1}
 }
 )";
   ParseAndVerifyModule(hlo_text);
 
   std::unique_ptr<Literal> operand = LiteralUtil::CreateR1<int32>({0, 1, 2});
-  std::unique_ptr<Literal> gather_indices =
+  std::unique_ptr<Literal> start_indices =
       LiteralUtil::CreateR3<int32>({{{0}, {1}}, {{2}, {1}}});
   EXPECT_TRUE(
       LiteralTestUtil::Equal(*LiteralUtil::CreateR2<int32>({{0, 1}, {2, 1}}),
-                             *Evaluate({operand.get(), gather_indices.get()})));
+                             *Evaluate({operand.get(), start_indices.get()})));
+}
+
+TEST_P(HloEvaluatorTest, EvaluateScatter_TensorFlowScatterV1_Update) {
+  const char* hlo_text = R"(
+HloModule TensorFlowScatterV1
+
+update_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  ROOT rhs = s32[] parameter(1)
+}
+
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[2] parameter(1)
+  updates = s32[2,3] parameter(2)
+  ROOT scatter = s32[3,3] scatter(operand, indices, updates),
+      to_apply=update_s32,
+      update_window_dims={1},
+      inserted_window_dims={0},
+      scatter_dims_to_operand_dims={0},
+      index_vector_dim=1
+}
+)";
+  ParseAndVerifyModule(hlo_text);
+  std::unique_ptr<Literal> operand =
+      LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> scatter_indices =
+      LiteralUtil::CreateR1<int32>({0, 2});
+  std::unique_ptr<Literal> updates =
+      LiteralUtil::CreateR2<int32>({{10, 20, 30}, {70, 80, 90}});
+  EXPECT_TRUE(LiteralTestUtil::Equal(
+      *LiteralUtil::CreateR2<int32>({{10, 20, 30}, {4, 5, 6}, {70, 80, 90}}),
+      *Evaluate({operand.get(), scatter_indices.get(), updates.get()})));
+}
+
+TEST_P(HloEvaluatorTest, EvaluateScatter_TensorFlowScatterV2_Update) {
+  const char* hlo_text = R"(
+HloModule TensorFlowScatterV2
+
+update_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  ROOT rhs = s32[] parameter(1)
+}
+
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[2] parameter(1)
+  updates = s32[3,2] parameter(2)
+  ROOT scatter = s32[3,3] scatter(operand, indices, updates),
+      to_apply=update_s32,
+      update_window_dims={0},
+      inserted_window_dims={1},
+      scatter_dims_to_operand_dims={1},
+      index_vector_dim=1
+}
+)";
+  ParseAndVerifyModule(hlo_text);
+  std::unique_ptr<Literal> operand =
+      LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> scatter_indices =
+      LiteralUtil::CreateR1<int32>({0, 2});
+  std::unique_ptr<Literal> updates =
+      LiteralUtil::CreateR2<int32>({{10, 30}, {40, 60}, {70, 90}});
+  EXPECT_TRUE(LiteralTestUtil::Equal(
+      *LiteralUtil::CreateR2<int32>({{10, 2, 30}, {40, 5, 60}, {70, 8, 90}}),
+      *Evaluate({operand.get(), scatter_indices.get(), updates.get()})));
+}
+
+TEST_P(HloEvaluatorTest, EvaluateScatter_TensorFlowScatter_Add) {
+  const char* hlo_text = R"(
+HloModule TensorFlowScatter
+
+add_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  rhs = s32[] parameter(1)
+  ROOT add = s32[] add(s32[] lhs, s32[] rhs)
+}
+
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[2] parameter(1)
+  updates = s32[2,3] parameter(2)
+  ROOT scatter = s32[3,3] scatter(operand, indices, updates),
+      to_apply=add_s32,
+      update_window_dims={1},
+      inserted_window_dims={0},
+      scatter_dims_to_operand_dims={0},
+      index_vector_dim=1
+}
+)";
+  ParseAndVerifyModule(hlo_text);
+  std::unique_ptr<Literal> operand =
+      LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> scatter_indices =
+      LiteralUtil::CreateR1<int32>({0, 2});
+  std::unique_ptr<Literal> updates =
+      LiteralUtil::CreateR2<int32>({{10, 20, 30}, {70, 80, 90}});
+  EXPECT_TRUE(LiteralTestUtil::Equal(
+      *LiteralUtil::CreateR2<int32>({{11, 22, 33}, {4, 5, 6}, {77, 88, 99}}),
+      *Evaluate({operand.get(), scatter_indices.get(), updates.get()})));
+}
+
+TEST_P(HloEvaluatorTest, EvaluateScatter_TensorFlowScatter_Mul) {
+  const char* hlo_text = R"(
+HloModule TensorFlowScatter
+
+mul_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  rhs = s32[] parameter(1)
+  ROOT mul = s32[] multiply(s32[] lhs, s32[] rhs)
+}
+
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[2] parameter(1)
+  updates = s32[2,3] parameter(2)
+  ROOT scatter = s32[3,3] scatter(operand, indices, updates),
+      to_apply=mul_s32,
+      update_window_dims={1},
+      inserted_window_dims={0},
+      scatter_dims_to_operand_dims={0},
+      index_vector_dim=1
+}
+)";
+  ParseAndVerifyModule(hlo_text);
+  std::unique_ptr<Literal> operand =
+      LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> scatter_indices =
+      LiteralUtil::CreateR1<int32>({0, 2});
+  std::unique_ptr<Literal> updates =
+      LiteralUtil::CreateR2<int32>({{10, 20, 30}, {70, 80, 90}});
+  EXPECT_TRUE(LiteralTestUtil::Equal(
+      *LiteralUtil::CreateR2<int32>({{10, 40, 90}, {4, 5, 6}, {490, 640, 810}}),
+      *Evaluate({operand.get(), scatter_indices.get(), updates.get()})));
+}
+
+TEST_P(HloEvaluatorTest, EvaluateScatter_TensorFlowScatter_F32) {
+  const char* hlo_text = R"(
+HloModule TensorFlowScatter
+
+add_f32 (lhs: f32[], rhs: f32[]) -> f32[] {
+  lhs = f32[] parameter(0)
+  rhs = f32[] parameter(1)
+  ROOT add = f32[] add(f32[] lhs, f32[] rhs)
+}
+
+ENTRY main {
+  operand = f32[3,3] parameter(0)
+  indices = s32[2] parameter(1)
+  updates = f32[2,3] parameter(2)
+  ROOT scatter = f32[3,3] scatter(operand, indices, updates),
+      to_apply=add_f32,
+      update_window_dims={1},
+      inserted_window_dims={0},
+      scatter_dims_to_operand_dims={0},
+      index_vector_dim=1
+}
+)";
+  ParseAndVerifyModule(hlo_text);
+  std::unique_ptr<Literal> operand = LiteralUtil::CreateR2<float>(
+      {{1.1, 2.2, 3.3}, {4.4, 5.5, 6.6}, {7.7, 8.8, 9.9}});
+  std::unique_ptr<Literal> scatter_indices =
+      LiteralUtil::CreateR1<int32>({2, 1});
+  std::unique_ptr<Literal> updates =
+      LiteralUtil::CreateR2<float>({{0.4, 1.1, 0.7}, {2.3, 3.1, 1.6}});
+  EXPECT_TRUE(LiteralTestUtil::Near(
+      *LiteralUtil::CreateR2<float>(
+          {{1.1, 2.2, 3.3}, {6.7, 8.6, 8.2}, {8.1, 9.9, 10.6}}),
+      *Evaluate({operand.get(), scatter_indices.get(), updates.get()}),
+      ErrorSpec{0.1, 0.01}));
+}
+
+TEST_P(HloEvaluatorTest, EvaluateScatter_TensorFlowScatter_RepeatedIndices) {
+  const char* hlo_text = R"(
+HloModule TensorFlowScatter
+
+add_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  rhs = s32[] parameter(1)
+  ROOT add = s32[] add(s32[] lhs, s32[] rhs)
+}
+
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[2] parameter(1)
+  updates = s32[2,3] parameter(2)
+  ROOT scatter = s32[3,3] scatter(operand, indices, updates),
+      to_apply=add_s32,
+      update_window_dims={1},
+      inserted_window_dims={0},
+      scatter_dims_to_operand_dims={0},
+      index_vector_dim=1
+}
+)";
+  ParseAndVerifyModule(hlo_text);
+  std::unique_ptr<Literal> operand =
+      LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> scatter_indices =
+      LiteralUtil::CreateR1<int32>({1, 1});
+  std::unique_ptr<Literal> updates =
+      LiteralUtil::CreateR2<int32>({{10, 20, 30}, {70, 80, 90}});
+  EXPECT_TRUE(LiteralTestUtil::Equal(
+      *LiteralUtil::CreateR2<int32>({{1, 2, 3}, {84, 105, 126}, {7, 8, 9}}),
+      *Evaluate({operand.get(), scatter_indices.get(), updates.get()})));
+}
+
+TEST_P(HloEvaluatorTest, EvaluateScatter_TensorFlowScatter_MultipleBatchDims) {
+  const char* hlo_text = R"(
+HloModule TensorFlowScatterMultipleBatchDims
+
+add_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  rhs = s32[] parameter(1)
+  ROOT add = s32[] add(s32[] lhs, s32[] rhs)
+}
+
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[2,2] parameter(1)
+  updates = s32[2,3,2] parameter(2)
+  ROOT scatter = s32[3,3] scatter(operand, indices, updates),
+      to_apply=add_s32,
+      update_window_dims={1},
+      inserted_window_dims={1},
+      scatter_dims_to_operand_dims={1},
+      index_vector_dim=2
+}
+)";
+  ParseAndVerifyModule(hlo_text);
+  std::unique_ptr<Literal> operand =
+      LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> scatter_indices =
+      LiteralUtil::CreateR2<int32>({{0, 2}, {2, 1}});
+  std::unique_ptr<Literal> updates = LiteralUtil::CreateR3<int32>(
+      {{{10, 30}, {40, 60}, {70, 90}}, {{5, 5}, {5, 5}, {5, 5}}});
+  EXPECT_TRUE(LiteralTestUtil::Equal(
+      *LiteralUtil::CreateR2<int32>({{11, 7, 38}, {44, 10, 71}, {77, 13, 104}}),
+      *Evaluate({operand.get(), scatter_indices.get(), updates.get()})));
+}
+
+TEST_P(HloEvaluatorTest, EvaluateScatter_TensorFlowScatterNd) {
+  const char* hlo_text = R"(
+HloModule TensorFlowScatterNd
+
+update_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  ROOT rhs = s32[] parameter(1)
+}
+
+ENTRY main {
+  operand = s32[3,3,2] parameter(0)
+  indices = s32[2,2] parameter(1)
+  updates = s32[2,2] parameter(2)
+  ROOT scatter = s32[3,3,2] scatter(operand, indices, updates),
+      to_apply=update_s32,
+      update_window_dims={1},
+      inserted_window_dims={0,1},
+      scatter_dims_to_operand_dims={0,1},
+      index_vector_dim=1
+}
+)";
+  ParseAndVerifyModule(hlo_text);
+  std::unique_ptr<Literal> operand =
+      LiteralUtil::CreateR3<int32>({{{-1, 1}, {-2, 2}, {-3, 3}},  //
+                                    {{-4, 4}, {-5, 5}, {-6, 6}},  //
+                                    {{-7, 7}, {-8, 8}, {-9, 9}}});
+  std::unique_ptr<Literal> scatter_indices =
+      LiteralUtil::CreateR2<int32>({{0, 0}, {1, 0}});
+  std::unique_ptr<Literal> updates =
+      LiteralUtil::CreateR2<int32>({{-10, 10}, {-40, 40}});
+  std::unique_ptr<Literal> expected =
+      LiteralUtil::CreateR3<int32>({{{-10, 10}, {-2, 2}, {-3, 3}},  //
+                                    {{-40, 40}, {-5, 5}, {-6, 6}},  //
+                                    {{-7, 7}, {-8, 8}, {-9, 9}}});
+  EXPECT_TRUE(LiteralTestUtil::Equal(
+      *expected,
+      *Evaluate({operand.get(), scatter_indices.get(), updates.get()})));
+}
+
+TEST_P(HloEvaluatorTest,
+       EvaluateScatter_TensorFlowScatterNd_NonDefaultIndexVectorDim) {
+  const char* hlo_text = R"(
+HloModule TensorFlowScatterNdNonDefaultIndexVectorDim
+
+update_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  ROOT rhs = s32[] parameter(1)
+}
+
+ENTRY main {
+  operand = s32[3,3,2] parameter(0)
+  indices = s32[2,2] parameter(1)
+  updates = s32[2,2] parameter(2)
+  ROOT scatter = s32[3,3,2] scatter(operand, indices, updates),
+      to_apply=update_s32,
+      update_window_dims={1},
+      inserted_window_dims={0,1},
+      scatter_dims_to_operand_dims={0,1},
+      index_vector_dim=0
+}
+)";
+  ParseAndVerifyModule(hlo_text);
+  std::unique_ptr<Literal> operand =
+      LiteralUtil::CreateR3<int32>({{{-1, 1}, {-2, 2}, {-3, 3}},  //
+                                    {{-4, 4}, {-5, 5}, {-6, 6}},  //
+                                    {{-7, 7}, {-8, 8}, {-9, 9}}});
+  std::unique_ptr<Literal> scatter_indices =
+      LiteralUtil::CreateR2<int32>({{0, 0}, {1, 0}});
+  std::unique_ptr<Literal> updates =
+      LiteralUtil::CreateR2<int32>({{-10, 10}, {-20, 20}});
+  std::unique_ptr<Literal> expected =
+      LiteralUtil::CreateR3<int32>({{{-20, 20}, {-10, 10}, {-3, 3}},  //
+                                    {{-4, 4}, {-5, 5}, {-6, 6}},      //
+                                    {{-7, 7}, {-8, 8}, {-9, 9}}});
+  EXPECT_TRUE(LiteralTestUtil::Equal(
+      *expected,
+      *Evaluate({operand.get(), scatter_indices.get(), updates.get()})));
+}
+
+TEST_P(HloEvaluatorTest, EvaluateScatter_DynamicUpdateSlice) {
+  const char* hlo_text = R"(
+HloModule DynamicUpdateSlice
+
+update_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  ROOT rhs = s32[] parameter(1)
+}
+
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[2] parameter(1)
+  updates = s32[1,1] parameter(2)
+  ROOT scatter = s32[3,3] scatter(operand, indices, updates),
+      to_apply=update_s32,
+      update_window_dims={0,1},
+      inserted_window_dims={},
+      scatter_dims_to_operand_dims={0,1},
+      index_vector_dim=0
+}
+)";
+  ParseAndVerifyModule(hlo_text);
+  std::unique_ptr<Literal> operand =
+      LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> scatter_indices =
+      LiteralUtil::CreateR1<int32>({1, 1});
+  std::unique_ptr<Literal> updates = LiteralUtil::CreateR2<int32>({{10}});
+  std::unique_ptr<Literal> expected =
+      LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 10, 6}, {7, 8, 9}});
+  EXPECT_TRUE(LiteralTestUtil::Equal(
+      *expected,
+      *Evaluate({operand.get(), scatter_indices.get(), updates.get()})));
+}
+
+TEST_P(HloEvaluatorTest, EvaluateScatter_BatchDynamicUpdateSlice) {
+  const char* hlo_text = R"(
+HloModule BatchDynamicUpdateSlice
+
+update_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  ROOT rhs = s32[] parameter(1)
+}
+
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[2,2] parameter(1)
+  updates = s32[2,1,1] parameter(2)
+  ROOT scatter = s32[3,3] scatter(operand, indices, updates),
+      to_apply=update_s32,
+      update_window_dims={1,2},
+      inserted_window_dims={},
+      scatter_dims_to_operand_dims={0,1},
+      index_vector_dim=0
+}
+)";
+  ParseAndVerifyModule(hlo_text);
+  std::unique_ptr<Literal> operand =
+      LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> scatter_indices =
+      LiteralUtil::CreateR2<int32>({{2, 1}, {1, 1}});
+  std::unique_ptr<Literal> updates =
+      LiteralUtil::CreateR3<int32>({{{10}}, {{20}}});
+  std::unique_ptr<Literal> expected =
+      LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 20, 6}, {7, 10, 9}});
+  EXPECT_TRUE(LiteralTestUtil::Equal(
+      *expected,
+      *Evaluate({operand.get(), scatter_indices.get(), updates.get()})));
+}
+
+TEST_P(HloEvaluatorTest, EvaluateScatter_ZeroDimBounds) {
+  const char* hlo_text = R"(
+HloModule TensorFlowScatter_ZeroDimBounds
+
+update_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  ROOT rhs = s32[] parameter(1)
+}
+
+ENTRY main {
+  operand = s32[3,0] parameter(0)
+  indices = s32[2] parameter(1)
+  updates = s32[2,0] parameter(2)
+  ROOT scatter = s32[3,0] scatter(operand, indices, updates),
+      to_apply=update_s32,
+      update_window_dims={1},
+      inserted_window_dims={0},
+      scatter_dims_to_operand_dims={0},
+      index_vector_dim=1
+}
+)";
+  ParseAndVerifyModule(hlo_text);
+  std::unique_ptr<Literal> operand = LiteralUtil::CreateR2<int32>({{}, {}, {}});
+  std::unique_ptr<Literal> scatter_indices =
+      LiteralUtil::CreateR1<int32>({0, 2});
+  std::unique_ptr<Literal> updates = LiteralUtil::CreateR2<int32>({{}, {}});
+  EXPECT_TRUE(LiteralTestUtil::Equal(
+      *operand,
+      *Evaluate({operand.get(), scatter_indices.get(), updates.get()})));
+}
+
+TEST_P(HloEvaluatorTest, EvaluateScatter_NoUpdateWindowDims) {
+  const string hlo_text = R"(
+HloModule Scatter_NoUpdateWindowDims
+
+add_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  rhs = s32[] parameter(1)
+  ROOT add = s32[] add(s32[] lhs, s32[] rhs)
+}
+
+ENTRY main {
+  operand = s32[3] parameter(0)
+  indices = s32[2,2,1] parameter(1)
+  updates = s32[2,2] parameter(2)
+  ROOT scatter = s32[3] scatter(operand, indices, updates),
+      to_apply=add_s32,
+      update_window_dims={},
+      inserted_window_dims={0},
+      scatter_dims_to_operand_dims={0},
+      index_vector_dim=2
+}
+)";
+  ParseAndVerifyModule(hlo_text);
+
+  std::unique_ptr<Literal> operand = LiteralUtil::CreateR1<int32>({0, 1, 2});
+  std::unique_ptr<Literal> scatter_indices =
+      LiteralUtil::CreateR3<int32>({{{0}, {1}}, {{2}, {1}}});
+  std::unique_ptr<Literal> updates =
+      LiteralUtil::CreateR2<int32>({{10, 20}, {30, 40}});
+  std::unique_ptr<Literal> expected =
+      LiteralUtil::CreateR1<int32>({10, 61, 32});
+  EXPECT_TRUE(LiteralTestUtil::Equal(
+      *expected,
+      *Evaluate({operand.get(), scatter_indices.get(), updates.get()})));
 }
 
 // Verifies that HloEvaluator evaluates a HLO instruction that performs
@@ -2064,6 +2513,31 @@
                std::move(rhs));
 }
 
+TEST_P(HloEvaluatorTest, Bf16Reduction) {
+  const string hlo_text = R"(
+HloModule Bf16Reduction
+
+add_bf16 (lhs: bf16[], rhs: bf16[]) -> bf16[] {
+  lhs = bf16[] parameter(0)
+  rhs = bf16[] parameter(1)
+  ROOT add = bf16[] add(bf16[] lhs, bf16[] rhs)
+}
+
+ENTRY main {
+  arg0 = bf16[4]{0} parameter(0)
+  init = bf16[] constant(0)
+  ROOT %reduce = bf16[] reduce(arg0, init), dimensions={0}, to_apply=add_bf16
+}
+)";
+  ParseAndVerifyModule(hlo_text);
+
+  std::unique_ptr<Literal> arg = LiteralUtil::CreateR1<bfloat16>(
+      {bfloat16(1.0f), bfloat16(3.0f), bfloat16(-2.0f), bfloat16(42.0f)});
+  std::unique_ptr<Literal> expected =
+      LiteralUtil::CreateR0<bfloat16>(bfloat16(44.0f));
+  EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *Evaluate({arg.get()})));
+}
+
 INSTANTIATE_TEST_CASE_P(HloEvaluatorTest_Instantiation, HloEvaluatorTest,
                         ::testing::ValuesIn(use_bf16_params));
 
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
index d1ee4a1..7fdf452 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
@@ -86,6 +86,28 @@
 // of this class.
 template <typename ReturnT, typename ElementwiseT = ReturnT>
 class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
+ private:
+  // Get the value in the given literal static_cast as a double.
+  template <
+      typename NativeT,
+      typename std::enable_if<!is_complex_t<NativeT>::value>::type* = nullptr>
+  double GetAsDouble(const Literal& literal,
+                     tensorflow::gtl::ArraySlice<int64> input_index) {
+    return static_cast<double>(literal.Get<NativeT>(input_index));
+  }
+
+  // Specialization for complex types. In this case it is not possible to
+  // static_cast value to a double so just CHECK fail. This method is not used
+  // at run-time, but must be available at compile-time to keep the compiler
+  // happy.
+  template <
+      typename NativeT,
+      typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
+  double GetAsDouble(const Literal& literal,
+                     tensorflow::gtl::ArraySlice<int64> input_index) {
+    CHECK(false);
+  }
+
  public:
   explicit HloEvaluatorTypedVisitor(HloEvaluator* p) : parent_(p) {}
 
@@ -1473,6 +1495,10 @@
   }
 
   Status HandleReduce(HloInstruction* reduce) override {
+    // TODO(b/112040122): Support variadic reduce.
+    if (!ShapeUtil::IsArray(reduce->shape())) {
+      return Unimplemented("Variadic reduce is not supported in the Evaluator");
+    }
     auto arg = reduce->operand(0);
     auto init_value = reduce->operand(1);
     tensorflow::gtl::ArraySlice<int64> dimensions(reduce->dimensions());
@@ -1532,7 +1558,7 @@
               IsScalarAdd(function)) {
             double computed_result = 0;
             auto func = [&](tensorflow::gtl::ArraySlice<int64> input_index) {
-              computed_result += arg_literal.Get<float>(input_index);
+              computed_result += GetAsDouble<ReturnT>(arg_literal, input_index);
               return true;
             };
             ShapeUtil::ForEachIndex(arg_literal.shape(), base, arg_dim_counts,
@@ -1771,6 +1797,388 @@
     return Status::OK();
   }
 
+  // Reshapes the scatter indices input to have a trailing degenerate `1`
+  // dimension if necessary.  Hands over the ownership of the newly created
+  // literal (if there is one) to `reshaped_indices`.
+  StatusOr<std::reference_wrapper<const Literal>> ReshapedScatterIndices(
+      int64 index_vector_dim, const Literal& indices,
+      std::unique_ptr<Literal>* reshaped_indices) {
+    if (indices.shape().dimensions_size() != index_vector_dim) {
+      return std::cref(indices);
+    }
+
+    std::vector<int64> new_shape(indices.shape().dimensions().begin(),
+                                 indices.shape().dimensions().end());
+    new_shape.push_back(1);
+    TF_ASSIGN_OR_RETURN(*reshaped_indices, indices.Reshape(new_shape));
+    return std::cref(**reshaped_indices);
+  }
+
+  // Returns a ShapeUtil::IndexIterationSpace that iterates over the update
+  // scatter dimensions while keeping the rest of the update dimensions clamped
+  // to 0.
+  ShapeUtil::IndexIterationSpace IterationSpaceForUpdateScatterIndices(
+      const Shape& updates_shape, const ScatterDimensionNumbers& dim_numbers) {
+    int64 updates_rank = updates_shape.dimensions_size();
+    std::vector<int64> index_base(updates_rank, 0);
+    std::vector<int64> index_count(updates_rank, 1);
+    for (int64 i = 0; i < updates_rank; i++) {
+      bool is_update_scatter_dim =
+          !c_binary_search(dim_numbers.update_window_dims(), i);
+      if (is_update_scatter_dim) {
+        index_count[i] = updates_shape.dimensions(i);
+      }
+    }
+    return {std::move(index_base), std::move(index_count),
+            std::vector<int64>(updates_rank, 1)};
+  }
+
+  // Returns a ShapeUtil::IndexIterationSpace that iterates over the update
+  // window dimensions while keeping the rest of the update dimensions clamped
+  // to 0.
+  ShapeUtil::IndexIterationSpace IterationSpaceForUpdateWindowIndices(
+      const Shape& updates_shape, const ScatterDimensionNumbers& dim_numbers) {
+    int64 updates_rank = updates_shape.dimensions_size();
+    std::vector<int64> index_base(updates_rank, 0);
+    std::vector<int64> index_count(updates_rank, 1);
+    for (int64 i = 0; i < updates_rank; i++) {
+      bool is_update_window_dim =
+          c_binary_search(dim_numbers.update_window_dims(), i);
+      if (is_update_window_dim) {
+        index_count[i] = updates_shape.dimensions(i);
+      }
+    }
+    return {std::move(index_base), std::move(index_count),
+            std::vector<int64>(updates_rank, 1)};
+  }
+
+  // This functor computes the contribution of scatter_indices to an input index
+  // corresponding to an update index.  That is, given an update index I, it
+  // picks out the scatter indices in I and uses them to look up a scatter
+  // index, S, from the scatter indices tensor, and expands S into the input
+  // space according to scatter_dims_to_operand_dims.
+  //
+  // This is similar to the class HloEvaluator::OutputGatherIndexToInputIndex
+  // that does the corresponding function for Gather.
+  class UpdateScatterIndexToInputIndex {
+   public:
+    // The constructor does some setup work that is amortized across all
+    // iterations.
+    explicit UpdateScatterIndexToInputIndex(
+        const ScatterDimensionNumbers* dim_numbers, const Shape& input_shape,
+        const Shape& updates_shape, const Literal* scatter_indices)
+        : dim_numbers_(*dim_numbers), scatter_indices_(*scatter_indices) {
+      for (int64 i = 0; i < updates_shape.dimensions_size(); i++) {
+        update_dim_is_scatter_dims_.push_back(
+            !c_binary_search(dim_numbers_.update_window_dims(), i));
+      }
+
+      for (int64 i = 0; i < input_shape.dimensions_size(); i++) {
+        int64 index_of_input_dim_in_index_vector =
+            FindIndex(dim_numbers_.scatter_dims_to_operand_dims(), i);
+        if (index_of_input_dim_in_index_vector ==
+            dim_numbers_.scatter_dims_to_operand_dims_size()) {
+          input_dim_value_to_index_vector_.push_back(-1);
+        } else {
+          input_dim_value_to_index_vector_.push_back(
+              index_of_input_dim_in_index_vector);
+        }
+      }
+
+      index_vector_index_.resize(scatter_indices_.shape().dimensions_size());
+      input_index_.resize(input_shape.dimensions_size());
+      int64 index_vector_size =
+          scatter_indices_.shape().dimensions(dim_numbers_.index_vector_dim());
+      index_vector_.resize(index_vector_size);
+    }
+
+    // Returns the contribution of scatter_indices to the input index
+    // corresponding to update_index.  See scatter_inner_loop_body.
+    //
+    // This is conceptually a stateless transformation from update_index to the
+    // scatter input index, but:
+    //
+    //  - Instead of allocating memory to represent the scatter input index on
+    //    every invocation we reuse the same storage for the result
+    //    (input_index_), mutating it in place.
+    //  - Instead of allocating buffers for temporary values like
+    //    index_vector_index_ and index_vector_ on every invocation, we reuse the
+    //    same storage for all invocations.
+    //
+    // This returns an ArraySlice into memory owned by the class.
+    StatusOr<tensorflow::gtl::ArraySlice<int64>> operator()(
+        tensorflow::gtl::ArraySlice<int64> update_index) {
+      PropagateUpdateIndexScatterDimsToIndexVectorIndex(update_index);
+      TF_RETURN_IF_ERROR(FetchIndexVector());
+      PropagateIndexVectorToInputIndex();
+      return tensorflow::gtl::ArraySlice<int64>(input_index_);
+    }
+
+   private:
+    // Propagates the scatter index dimensions from the update index into
+    // index_vector_index_ by mutating index_vector_index_ in place.  Does not
+    // update the dim_numbers.index_vector_dim() dimension -- that's the
+    // dimension we iterate over in FetchIndexVector.
+    void PropagateUpdateIndexScatterDimsToIndexVectorIndex(
+        tensorflow::gtl::ArraySlice<int64> update_index) {
+      int64 index_vector_index_i = 0;
+      for (int64 i = 0, e = update_index.size(); i < e; i++) {
+        if (!update_dim_is_scatter_dims_[i]) {
+          continue;
+        }
+
+        if (index_vector_index_i == dim_numbers_.index_vector_dim()) {
+          index_vector_index_i++;
+        }
+
+        index_vector_index_[index_vector_index_i++] = update_index[i];
+      }
+    }
+
+    // Populates index_vector_ by iterating over scatter_indices_ according to
+    // index_vector_index_.
+    Status FetchIndexVector() {
+      int64 index_vector_dim = dim_numbers_.index_vector_dim();
+      for (int64 i = 0, e = index_vector_.size(); i < e; i++) {
+        index_vector_index_[index_vector_dim] = i;
+        TF_ASSIGN_OR_RETURN(index_vector_[i], scatter_indices_.GetIntegralAsS64(
+                                                  index_vector_index_));
+      }
+      return Status::OK();
+    }
+
+    // Populates input_index_.
+    void PropagateIndexVectorToInputIndex() {
+      for (int64 i = 0, e = input_index_.size(); i < e; i++) {
+        if (input_dim_value_to_index_vector_[i] != -1) {
+          input_index_[i] = index_vector_[input_dim_value_to_index_vector_[i]];
+        }
+
+        // If input_dim_value_to_index_vector_[i] == -1 then input_index_[i]
+        // remains 0, as set by the constructor.
+      }
+    }
+
+    // input_dim_value_to_index_vector_[i] tells us how to compute dimension i
+    // of the input index from the index vector.  See
+    // PropagateIndexVectorToInputIndex.
+    std::vector<int64> input_dim_value_to_index_vector_;
+
+    // update_dim_is_scatter_dims_[i] is true iff the update index i is a
+    // scatter dimension.
+    std::vector<bool> update_dim_is_scatter_dims_;
+
+    // The buffer into which we construct an index into scatter_indices_ to
+    // fetch the index vector.
+    std::vector<int64> index_vector_index_;
+
+    // The index vector fetched from scatter_indices_.
+    std::vector<int64> index_vector_;
+
+    // The result computed by this functor.  operator() returns an ArraySlice
+    // into this vector.
+    std::vector<int64> input_index_;
+
+    const ScatterDimensionNumbers& dim_numbers_;
+    const Literal& scatter_indices_;
+  };
+
+  // This functor computes the contribution of the window indices in an update
+  // index to an input index.  That is, given an update index I it picks out the
+  // update window indices in I and expands it into a window index into the
+  // input shape.
+  //
+  // This is similar to the class HloEvaluator::OutputWindowIndexToInputIndex
+  // that does the corresponding function for Gather.
+  class UpdateWindowIndexToInputIndex {
+   public:
+    // The constructor does some setup work that is amortized across all
+    // iterations.
+    explicit UpdateWindowIndexToInputIndex(
+        const ScatterDimensionNumbers& dim_numbers, const Shape& input_shape,
+        const Shape& updates_shape) {
+      std::vector<int64> window_index_to_update_index;
+      int64 update_index_count = 0;
+      for (int64 i = 0; i < updates_shape.dimensions_size(); i++) {
+        if (c_binary_search(dim_numbers.update_window_dims(), i)) {
+          window_index_to_update_index.push_back(update_index_count++);
+        } else {
+          update_index_count++;
+        }
+      }
+
+      int64 window_dim_count = 0;
+      for (int64 i = 0; i < input_shape.dimensions_size(); i++) {
+        if (c_binary_search(dim_numbers.inserted_window_dims(), i)) {
+          input_dim_value_to_update_index_.push_back(-1);
+        } else {
+          input_dim_value_to_update_index_.push_back(
+              window_index_to_update_index[window_dim_count++]);
+        }
+      }
+
+      input_index_.resize(input_shape.dimensions_size());
+    }
+
+    // Returns the contribution of the window indices to the input index
+    // corresponding to update_index.  See scatter_inner_loop_body.
+    //
+    // This is conceptually a stateless transformation from update_index to the
+    // window input index, but instead of allocating memory to represent the
+    // scatter input index on every invocation we reuse the same storage for the
+    // result (input_index_), mutating it in place.
+    //
+    // This returns an ArraySlice into memory owned by the class.
+    StatusOr<tensorflow::gtl::ArraySlice<int64>> operator()(
+        tensorflow::gtl::ArraySlice<int64> update_index) {
+      PropagateUpdateIndexWindowDimsToInputIndex(update_index);
+      return tensorflow::gtl::ArraySlice<int64>(input_index_);
+    }
+
+    // Returns for a given 'input_dim' the corresponding update dimension index,
+    // or -1 if 'input_dim' is an elided window dimension.
+    int64 input_dim_value_to_update_index(int64 input_dim) {
+      return input_dim_value_to_update_index_[input_dim];
+    }
+
+   private:
+    // Propagates window dimensions from the update index to input_index_ by
+    // mutating input_index_ in place.
+    void PropagateUpdateIndexWindowDimsToInputIndex(
+        tensorflow::gtl::ArraySlice<int64> update_index) {
+      for (int64 i = 0, e = input_index_.size(); i < e; i++) {
+        if (input_dim_value_to_update_index_[i] != -1) {
+          input_index_[i] = update_index[input_dim_value_to_update_index_[i]];
+        }
+
+        // If input_dim_value_to_index_vector_[i] == -1 then input_index_[i]
+        // remains 0, as set by the constructor.
+      }
+    }
+
+    // input_dim_value_to_index_vector_[i] tells us how to compute dimension i
+    // of the input index from the update index. See
+    // PropagateUpdateIndexWindowDimsToInputIndex.
+    std::vector<int64> input_dim_value_to_update_index_;
+
+    // The result computed by this functor.  operator() returns an ArraySlice
+    // into this vector.
+    std::vector<int64> input_index_;
+  };
+
+  Status HandleScatter(HloInstruction* scatter) override {
+    const ScatterDimensionNumbers& dim_numbers =
+        scatter->scatter_dimension_numbers();
+    const Literal& operand =
+        parent_->GetEvaluatedLiteralFor(scatter->operand(0));
+    std::unique_ptr<Literal> reshaped_scatter_indices;
+    TF_ASSIGN_OR_RETURN(const Literal& scatter_indices,
+                        ReshapedScatterIndices(dim_numbers.index_vector_dim(),
+                                               parent_->GetEvaluatedLiteralFor(
+                                                   scatter->operand(1)),
+                                               &reshaped_scatter_indices));
+    const Literal& updates =
+        parent_->GetEvaluatedLiteralFor(scatter->operand(2));
+    const Shape& updates_shape = updates.shape();
+    const Shape& operand_shape = operand.shape();
+
+    ShapeUtil::IndexIterationSpace scatter_indices_iteration_space =
+        IterationSpaceForUpdateScatterIndices(updates_shape, dim_numbers);
+    ShapeUtil::IndexIterationSpace window_indices_iteration_space =
+        IterationSpaceForUpdateWindowIndices(updates_shape, dim_numbers);
+
+    std::vector<int64> input_index(operand_shape.dimensions_size());
+    std::vector<int64> update_index(updates_shape.dimensions_size());
+    std::vector<int64> input_scatter_index_clamped(
+        operand_shape.dimensions_size());
+
+    UpdateScatterIndexToInputIndex update_scatter_index_to_input_index(
+        &scatter->scatter_dimension_numbers(), /*input_shape=*/operand_shape,
+        updates_shape, &scatter_indices);
+    UpdateWindowIndexToInputIndex update_window_index_to_input_index(
+        scatter->scatter_dimension_numbers(), /*input_shape=*/operand_shape,
+        updates_shape);
+
+    // Initialize the result with the operand. This makes it easier to handle
+    // the updates even when the indices are repeated.
+    std::unique_ptr<Literal> result = operand.CloneToUnique();
+    HloEvaluator embedded_evaluator;
+    auto scatter_inner_loop_body =
+        [&](tensorflow::gtl::ArraySlice<int64> update_window_index,
+            tensorflow::gtl::ArraySlice<int64> input_scatter_index,
+            tensorflow::gtl::ArraySlice<int64> update_scatter_index)
+        -> StatusOr<bool> {
+      TF_ASSIGN_OR_RETURN(
+          tensorflow::gtl::ArraySlice<int64> input_window_index,
+          update_window_index_to_input_index(update_window_index));
+      for (int i = 0, e = update_index.size(); i < e; i++) {
+        update_index[i] = update_scatter_index[i] + update_window_index[i];
+        DCHECK_LT(update_index[i], updates_shape.dimensions(i));
+      }
+      for (int i = 0, e = input_scatter_index.size(); i < e; i++) {
+        int64 update_dim =
+            update_window_index_to_input_index.input_dim_value_to_update_index(
+                i);
+        // If 'update_dim' is -1, it means 'i' is an elided window dim. This
+        // means we set the iteration index to 0, so for the purpose of the
+        // following calculations we can consider the update dimension size to
+        // be 1.
+        int64 update_dim_size =
+            update_dim == -1 ? 1 : updates_shape.dimensions(update_dim);
+        // Clamp the scatter index so that the scatter region fits in the
+        // operand, i.e.:
+        //   input_scatter_index_clamped[i] =
+        //       clamp(input_scatter_index[i], 0,
+        //             operand_shape.dimensions(i) - update_dim_size);
+        input_scatter_index_clamped[i] =
+            std::min(operand_shape.dimensions(i) - update_dim_size,
+                     std::max(0LL, input_scatter_index[i]));
+      }
+      for (int i = 0, e = input_index.size(); i < e; i++) {
+        input_index[i] = input_scatter_index_clamped[i] + input_window_index[i];
+        DCHECK_GE(input_index[i], 0);
+        DCHECK_LT(input_index[i], operand_shape.dimensions(i));
+      }
+
+      auto result_value_literal =
+          LiteralUtil::CreateR0<ReturnT>(result->Get<ReturnT>(input_index));
+      auto update_value_literal =
+          LiteralUtil::CreateR0<ReturnT>(updates.Get<ReturnT>(update_index));
+      std::unique_ptr<Literal> updated_result =
+          embedded_evaluator
+              .Evaluate<const Literal*>(
+                  *scatter->to_apply(),
+                  {result_value_literal.get(), update_value_literal.get()})
+              .ConsumeValueOrDie();
+      // Clear visit states so that we can reuse the evaluator for the next
+      // update on the same computation.
+      embedded_evaluator.ResetVisitStates();
+      result->Set<ReturnT>(input_index, updated_result->Get<ReturnT>({}));
+      return true;
+    };
+
+    auto scatter_outer_loop_body =
+        [&](tensorflow::gtl::ArraySlice<int64> update_scatter_index)
+        -> StatusOr<bool> {
+      TF_ASSIGN_OR_RETURN(
+          tensorflow::gtl::ArraySlice<int64> input_scatter_index,
+          update_scatter_index_to_input_index(update_scatter_index));
+      TF_RETURN_IF_ERROR(ShapeUtil::ForEachIndexWithStatus(
+          updates_shape, window_indices_iteration_space,
+          [&](tensorflow::gtl::ArraySlice<int64> update_window_index) {
+            return scatter_inner_loop_body(
+                update_window_index, input_scatter_index, update_scatter_index);
+          }));
+      return true;
+    };
+
+    TF_RETURN_IF_ERROR(ShapeUtil::ForEachIndexWithStatus(
+        updates_shape, scatter_indices_iteration_space,
+        scatter_outer_loop_body));
+    parent_->evaluated_[scatter] = std::move(result);
+    return Status::OK();
+  }
+
   Status HandleSlice(HloInstruction* slice) override {
     auto operand = slice->operand(0);
     const Shape& shape = slice->shape();
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
index bfe83ca..1efa6eb 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
@@ -1048,6 +1048,7 @@
     case HloOpcode::kMap:
       return kGray;
     case HloOpcode::kCrossReplicaSum:
+    case HloOpcode::kAllToAll:
     case HloOpcode::kInfeed:
     case HloOpcode::kOutfeed:
     case HloOpcode::kRecv:
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index 7591b99..57e75cf 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -281,27 +281,14 @@
     case HloOpcode::kInfeed: {
       const Shape& data_shape =
           ShapeUtil::GetTupleElementShape(proto.shape(), 0);
-      if (proto.operand_ids_size() == 0) {
-        // TODO(b/80000000): Remove this when all uses of infeed are
-        // converted to take tokens.
-        instruction = CreateInfeed(data_shape, proto.infeed_config());
-      } else {
-        CHECK_EQ(proto.operand_ids_size(), 1);
-        instruction =
-            CreateInfeed(data_shape, operands(0), proto.infeed_config());
-      }
+      TF_RET_CHECK(proto.operand_ids_size() == 1);
+      instruction =
+          CreateInfeed(data_shape, operands(0), proto.infeed_config());
     } break;
     case HloOpcode::kOutfeed:
-      if (proto.operand_ids_size() == 1) {
-        // TODO(b/80000000): Remove this when all uses of outfeed are
-        // converted to take tokens.
-        instruction = CreateOutfeed(proto.outfeed_shape(), operands(0),
-                                    proto.outfeed_config());
-      } else {
-        CHECK_EQ(proto.operand_ids_size(), 2);
-        instruction = CreateOutfeed(proto.outfeed_shape(), operands(0),
-                                    operands(1), proto.outfeed_config());
-      }
+      TF_RET_CHECK(proto.operand_ids_size() == 2);
+      instruction = CreateOutfeed(proto.outfeed_shape(), operands(0),
+                                  operands(1), proto.outfeed_config());
       break;
     case HloOpcode::kCrossReplicaSum: {
       TF_RET_CHECK(proto.called_computation_ids_size() == 1)
@@ -320,15 +307,25 @@
           /*all_reduce_id=*/all_reduce_id);
       break;
     }
+    case HloOpcode::kAllToAll: {
+      instruction = CreateAllToAll(
+          proto.shape(), all_operands(),
+          /*replica_groups=*/
+          std::vector<ReplicaGroup>(proto.replica_groups().begin(),
+                                    proto.replica_groups().end()),
+          /*barrier=*/proto.cross_replica_sum_barrier());
+      break;
+    }
     case HloOpcode::kConvolution:
       TF_RET_CHECK(proto.operand_ids_size() == 2)
           << "Convolution instruction should have 2 operands but sees "
           << proto.operand_ids_size();
       TF_RET_CHECK(proto.has_window());
       TF_RET_CHECK(proto.has_convolution_dimension_numbers());
-      instruction =
-          CreateConvolve(proto.shape(), operands(0), operands(1),
-                         proto.window(), proto.convolution_dimension_numbers());
+      instruction = CreateConvolve(
+          proto.shape(), operands(0), operands(1), proto.window(),
+          proto.convolution_dimension_numbers(),
+          std::max(static_cast<int64>(proto.feature_group_count()), 1LL));
       break;
     case HloOpcode::kReduceWindow:
       TF_RET_CHECK(proto.operand_ids_size() == 2)
@@ -395,13 +392,12 @@
           << "Gather instruction should have GatherDimensionNumbers set.";
       std::unique_ptr<GatherDimensionNumbers> gather_dimension_numbers =
           MakeUnique<GatherDimensionNumbers>(proto.gather_dimension_numbers());
-      std::vector<int64> gather_window_bounds;
-      for (int64 bound : proto.gather_window_bounds()) {
-        gather_window_bounds.push_back(bound);
+      std::vector<int64> gather_slice_sizes;
+      for (int64 bound : proto.gather_slice_sizes()) {
+        gather_slice_sizes.push_back(bound);
       }
-      instruction =
-          CreateGather(proto.shape(), operands(0), operands(1),
-                       *gather_dimension_numbers, gather_window_bounds);
+      instruction = CreateGather(proto.shape(), operands(0), operands(1),
+                                 *gather_dimension_numbers, gather_slice_sizes);
       break;
     }
     case HloOpcode::kScatter: {
@@ -613,10 +609,10 @@
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateConvolve(
     const Shape& shape, HloInstruction* lhs, HloInstruction* rhs,
-    const Window& window,
-    const ConvolutionDimensionNumbers& dimension_numbers) {
-  return MakeUnique<HloConvolutionInstruction>(shape, lhs, rhs, window,
-                                               dimension_numbers);
+    const Window& window, const ConvolutionDimensionNumbers& dimension_numbers,
+    int64 feature_group_count) {
+  return MakeUnique<HloConvolutionInstruction>(
+      shape, lhs, rhs, window, dimension_numbers, feature_group_count);
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateFft(
@@ -671,17 +667,20 @@
       all_reduce_id);
 }
 
+/* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateAllToAll(
+    const Shape& shape, tensorflow::gtl::ArraySlice<HloInstruction*> operands,
+    const std::vector<ReplicaGroup>& replica_groups,
+    tensorflow::StringPiece barrier) {
+  return MakeUnique<HloAllToAllInstruction>(shape, operands, replica_groups,
+                                            barrier);
+}
+
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateInfeed(
     const Shape& infeed_shape, HloInstruction* token_operand,
     const string& config) {
   return MakeUnique<HloInfeedInstruction>(infeed_shape, token_operand, config);
 }
 
-/* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateInfeed(
-    const Shape& infeed_shape, const string& config) {
-  return MakeUnique<HloInfeedInstruction>(infeed_shape, config);
-}
-
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateOutfeed(
     const Shape& outfeed_shape, HloInstruction* operand,
     HloInstruction* token_operand, tensorflow::StringPiece outfeed_config) {
@@ -689,13 +688,6 @@
                                            token_operand, outfeed_config);
 }
 
-/* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateOutfeed(
-    const Shape& outfeed_shape, HloInstruction* operand,
-    tensorflow::StringPiece outfeed_config) {
-  return MakeUnique<HloOutfeedInstruction>(outfeed_shape, operand,
-                                           outfeed_config);
-}
-
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateSend(
     HloInstruction* operand, HloInstruction* token, int64 channel_id,
     bool is_host_transfer) {
@@ -1085,11 +1077,11 @@
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateGather(
-    const Shape& shape, HloInstruction* operand, HloInstruction* gather_indices,
+    const Shape& shape, HloInstruction* operand, HloInstruction* start_indices,
     const GatherDimensionNumbers& gather_dim_numbers,
-    tensorflow::gtl::ArraySlice<int64> window_bounds) {
-  return MakeUnique<HloGatherInstruction>(shape, operand, gather_indices,
-                                          gather_dim_numbers, window_bounds);
+    tensorflow::gtl::ArraySlice<int64> slice_sizes) {
+  return MakeUnique<HloGatherInstruction>(shape, operand, start_indices,
+                                          gather_dim_numbers, slice_sizes);
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateScatter(
@@ -1153,6 +1145,7 @@
     case HloOpcode::kGetTupleElement:
     case HloOpcode::kReducePrecision:
     case HloOpcode::kCrossReplicaSum:
+    case HloOpcode::kAllToAll:
     case HloOpcode::kInfeed:
     case HloOpcode::kOutfeed:
     case HloOpcode::kConvolution:
@@ -1620,6 +1613,7 @@
     case HloOpcode::kInfeed:
     case HloOpcode::kOutfeed:
     case HloOpcode::kCrossReplicaSum:
+    case HloOpcode::kAllToAll:
     case HloOpcode::kConvolution:
     case HloOpcode::kCustomCall:
     case HloOpcode::kReduceWindow:
@@ -2265,6 +2259,8 @@
       return visitor->HandleFft(this);
     case HloOpcode::kCrossReplicaSum:
       return visitor->HandleCrossReplicaSum(this);
+    case HloOpcode::kAllToAll:
+      return visitor->HandleAllToAll(this);
     case HloOpcode::kTuple:
       return visitor->HandleTuple(this);
     case HloOpcode::kMap:
@@ -3139,12 +3135,23 @@
   return Cast<HloAllReduceInstruction>(this)->replica_group_ids();
 }
 
+const std::vector<ReplicaGroup>& HloInstruction::replica_groups() const {
+  return Cast<HloAllToAllInstruction>(this)->replica_groups();
+}
+
 string HloInstruction::cross_replica_sum_barrier() const {
-  return Cast<HloAllReduceInstruction>(this)->cross_replica_sum_barrier();
+  if (opcode() == HloOpcode::kCrossReplicaSum) {
+    return Cast<HloAllReduceInstruction>(this)->cross_replica_sum_barrier();
+  }
+  return Cast<HloAllToAllInstruction>(this)->cross_replica_sum_barrier();
 }
 
 void HloInstruction::set_cross_replica_sum_barrier(const string& barrier) {
-  return Cast<HloAllReduceInstruction>(this)->set_cross_replica_sum_barrier(
+  if (opcode() == HloOpcode::kCrossReplicaSum) {
+    return Cast<HloAllReduceInstruction>(this)->set_cross_replica_sum_barrier(
+        barrier);
+  }
+  return Cast<HloAllToAllInstruction>(this)->set_cross_replica_sum_barrier(
       barrier);
 }
 
@@ -3174,6 +3181,10 @@
   }
 }
 
+int64 HloInstruction::feature_group_count() const {
+  return Cast<HloConvolutionInstruction>(this)->feature_group_count();
+}
+
 HloComputation* HloInstruction::select() const {
   return Cast<HloSelectAndScatterInstruction>(this)->select();
 }
@@ -3214,9 +3225,8 @@
   return Cast<HloGatherInstruction>(this)->gather_dimension_numbers();
 }
 
-tensorflow::gtl::ArraySlice<int64> HloInstruction::gather_window_bounds()
-    const {
-  return Cast<HloGatherInstruction>(this)->gather_window_bounds();
+tensorflow::gtl::ArraySlice<int64> HloInstruction::gather_slice_sizes() const {
+  return Cast<HloGatherInstruction>(this)->gather_slice_sizes();
 }
 
 const ScatterDimensionNumbers& HloInstruction::scatter_dimension_numbers()
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index e722086..8d8f149 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -402,7 +402,8 @@
   static std::unique_ptr<HloInstruction> CreateConvolve(
       const Shape& shape, HloInstruction* lhs, HloInstruction* rhs,
       const Window& window,
-      const ConvolutionDimensionNumbers& dimension_numbers);
+      const ConvolutionDimensionNumbers& dimension_numbers,
+      int64 feature_group_count = 1);
 
   // Creates an FFT op, of the type indicated by fft_type.
   static std::unique_ptr<HloInstruction> CreateFft(
@@ -449,6 +450,26 @@
       tensorflow::StringPiece barrier,
       const tensorflow::gtl::optional<int64>& all_reduce_id);
 
+  // This op handles the communication of an Alltoall operation. On each core,
+  // the operands are N ops in the same shape, where N is the number of cores
+  // participating in the Alltoall. Then the N operands are scattered to N cores,
+  // e.g., the ith operand is sent to the ith core. Then each core gathers the
+  // received data into a tuple.
+  //
+  // - `replica_groups`: each ReplicaGroup contains a list of replica ids. If
+  // empty, all replicas belong to one group in the order of 0 - (n-1). Alltoall
+  // will be applied within subgroups in the specified order. For example,
+  // replica groups = {{1,2,3},{4,5,0}} means, an Alltoall will be applied
+  // within replica 1, 2, 3, and in the gather phase, the received blocks will
+  // be concatenated in the order of 1, 2, 3; another Alltoall will be applied
+  // within replica 4, 5, 0, and the concatenation order is 4, 5, 0.
+  //
+  // TODO(b/110096724): This is NOT YET ready to use.
+  static std::unique_ptr<HloInstruction> CreateAllToAll(
+      const Shape& shape, tensorflow::gtl::ArraySlice<HloInstruction*> operands,
+      const std::vector<ReplicaGroup>& replica_groups,
+      tensorflow::StringPiece barrier);
+
   // Creates a conversion instruction, where operand is the data to convert and
   // shape is the target shape for the conversion.
   static std::unique_ptr<HloInstruction> CreateConvert(const Shape& shape,
@@ -466,11 +487,6 @@
   static std::unique_ptr<HloInstruction> CreateInfeed(
       const Shape& infeed_shape, HloInstruction* token_operand,
       const string& config);
-  // Overload which does not require a token.
-  // TODO(b/80000000): Remove this overload when all uses of infeed are
-  // converted to take tokens.
-  static std::unique_ptr<HloInstruction> CreateInfeed(const Shape& infeed_shape,
-                                                      const string& config);
 
   // Creates an outfeed instruction, which outputs data. outfeed_shape is the
   // shape of the data being outfed *not* the shape of the outfeed instruction
@@ -478,12 +494,6 @@
   static std::unique_ptr<HloInstruction> CreateOutfeed(
       const Shape& outfeed_shape, HloInstruction* operand,
       HloInstruction* token_operand, tensorflow::StringPiece outfeed_config);
-  // Overload which does not require a token.
-  // TODO(b/80000000): Remove this overload when all uses of outfeed are
-  // converted to take tokens.
-  static std::unique_ptr<HloInstruction> CreateOutfeed(
-      const Shape& outfeed_shape, HloInstruction* operand,
-      tensorflow::StringPiece outfeed_config);
 
   // Creates an asynchronous send instruction with the given channel id, which
   // initiates sending the operand data to a unique receive instruction in
@@ -657,9 +667,9 @@
 
   static std::unique_ptr<HloInstruction> CreateGather(
       const Shape& shape, HloInstruction* operand,
-      HloInstruction* gather_indices,
+      HloInstruction* start_indices,
       const GatherDimensionNumbers& gather_dim_numbers,
-      tensorflow::gtl::ArraySlice<int64> window_bounds);
+      tensorflow::gtl::ArraySlice<int64> slice_sizes);
 
   static std::unique_ptr<HloInstruction> CreateScatter(
       const Shape& shape, HloInstruction* operand,
@@ -1414,6 +1424,9 @@
   // Delegates to HloAllReduceInstruction::replica_group_ids.
   const std::vector<int64>& replica_group_ids() const;
 
+  // Delegates to HloAllToAllInstruction::replica_groups.
+  const std::vector<ReplicaGroup>& replica_groups() const;
+
   // Delegates to HloAllReduceInstruction::cross_replica_sum_barrier.
   string cross_replica_sum_barrier() const;
   void set_cross_replica_sum_barrier(const string& barrier);
@@ -1443,6 +1456,10 @@
   void set_convolution_dimension_numbers(
       const ConvolutionDimensionNumbers& dnums);
 
+  // The number of feature groups. Must be a divisor of the input feature
+  // dimension and output feature dimension.
+  int64 feature_group_count() const;
+
   // Delegates to HloSelectAndScatterInstruction::select.
   HloComputation* select() const;
 
@@ -1472,8 +1489,8 @@
 
   // Delegates to HloGatherInstruction::gather_dimension_numbers.
   const GatherDimensionNumbers& gather_dimension_numbers() const;
-  // Delegates to HloGatherInstruction::gather_window_bounds.
-  tensorflow::gtl::ArraySlice<int64> gather_window_bounds() const;
+  // Delegates to HloGatherInstruction::gather_slice_sizes.
+  tensorflow::gtl::ArraySlice<int64> gather_slice_sizes() const;
 
   // Delegates to HloScatterInstruction::scatter_dimension_numbers().
   const ScatterDimensionNumbers& scatter_dimension_numbers() const;
diff --git a/tensorflow/compiler/xla/service/hlo_instruction_test.cc b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
index 8a694dd..504b130 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
@@ -1355,7 +1355,7 @@
 
 TEST_F(HloInstructionTest, StringifyGather_0) {
   Shape input_tensor_shape = ShapeUtil::MakeShape(F32, {50, 49, 48, 47, 46});
-  Shape gather_indices_tensor_shape =
+  Shape start_indices_tensor_shape =
       ShapeUtil::MakeShape(S64, {10, 9, 8, 7, 5});
   Shape gather_result_shape =
       ShapeUtil::MakeShape(F32, {10, 9, 8, 7, 30, 29, 28, 27, 26});
@@ -1363,19 +1363,18 @@
   HloComputation::Builder builder("Gather");
   HloInstruction* input = builder.AddInstruction(
       HloInstruction::CreateParameter(0, input_tensor_shape, "input_tensor"));
-  HloInstruction* gather_indices =
+  HloInstruction* start_indices =
       builder.AddInstruction(HloInstruction::CreateParameter(
-          1, gather_indices_tensor_shape, "gather_indices"));
+          1, start_indices_tensor_shape, "start_indices"));
 
-  HloInstruction* gather_instruction =
-      builder.AddInstruction(HloInstruction::CreateGather(
-          gather_result_shape, input, gather_indices,
-          HloGatherInstruction::MakeGatherDimNumbers(
-              /*output_window_dims=*/{4, 5, 6, 7, 8},
-              /*elided_window_dims=*/{},
-              /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
-              /*index_vector_dim=*/4),
-          /*window_bounds=*/{30, 29, 28, 27, 26}));
+  HloInstruction* gather_instruction = builder.AddInstruction(
+      HloInstruction::CreateGather(gather_result_shape, input, start_indices,
+                                   HloGatherInstruction::MakeGatherDimNumbers(
+                                       /*offset_dims=*/{4, 5, 6, 7, 8},
+                                       /*collapsed_slice_dims=*/{},
+                                       /*start_index_map=*/{0, 1, 2, 3, 4},
+                                       /*index_vector_dim=*/4),
+                                   /*slice_sizes=*/{30, 29, 28, 27, 26}));
 
   auto module = CreateNewModule();
   module->AddEntryComputation(builder.Build());
@@ -1383,15 +1382,15 @@
   EXPECT_EQ(gather_instruction->ToString(),
             "%gather = f32[10,9,8,7,30,29,28,27,26]{8,7,6,5,4,3,2,1,0} "
             "gather(f32[50,49,48,47,46]{4,3,2,1,0} %input_tensor, "
-            "s64[10,9,8,7,5]{4,3,2,1,0} %gather_indices), "
-            "output_window_dims={4,5,6,7,8}, elided_window_dims={}, "
-            "gather_dims_to_operand_dims={0,1,2,3,4}, "
-            "index_vector_dim=4, window_bounds={30,29,28,27,26}");
+            "s64[10,9,8,7,5]{4,3,2,1,0} %start_indices), "
+            "offset_dims={4,5,6,7,8}, collapsed_slice_dims={}, "
+            "start_index_map={0,1,2,3,4}, "
+            "index_vector_dim=4, slice_sizes={30,29,28,27,26}");
 }
 
 TEST_F(HloInstructionTest, StringifyGather_1) {
   Shape input_tensor_shape = ShapeUtil::MakeShape(F32, {50, 49, 48, 47, 46});
-  Shape gather_indices_tensor_shape =
+  Shape start_indices_tensor_shape =
       ShapeUtil::MakeShape(S64, {10, 9, 5, 7, 6});
   Shape gather_result_shape =
       ShapeUtil::MakeShape(F32, {10, 9, 7, 6, 30, 29, 28, 27, 26});
@@ -1399,19 +1398,18 @@
   HloComputation::Builder builder("Gather");
   HloInstruction* input = builder.AddInstruction(
       HloInstruction::CreateParameter(0, input_tensor_shape, "input_tensor"));
-  HloInstruction* gather_indices =
+  HloInstruction* start_indices =
       builder.AddInstruction(HloInstruction::CreateParameter(
-          1, gather_indices_tensor_shape, "gather_indices"));
+          1, start_indices_tensor_shape, "start_indices"));
 
-  HloInstruction* gather_instruction =
-      builder.AddInstruction(HloInstruction::CreateGather(
-          gather_result_shape, input, gather_indices,
-          HloGatherInstruction::MakeGatherDimNumbers(
-              /*output_window_dims=*/{4, 5, 6, 7, 8},
-              /*elided_window_dims=*/{},
-              /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
-              /*index_vector_dim=*/2),
-          /*window_bounds=*/{30, 29, 28, 27, 26}));
+  HloInstruction* gather_instruction = builder.AddInstruction(
+      HloInstruction::CreateGather(gather_result_shape, input, start_indices,
+                                   HloGatherInstruction::MakeGatherDimNumbers(
+                                       /*offset_dims=*/{4, 5, 6, 7, 8},
+                                       /*collapsed_slice_dims=*/{},
+                                       /*start_index_map=*/{0, 1, 2, 3, 4},
+                                       /*index_vector_dim=*/2),
+                                   /*slice_sizes=*/{30, 29, 28, 27, 26}));
 
   auto module = CreateNewModule();
   module->AddEntryComputation(builder.Build());
@@ -1419,10 +1417,10 @@
   EXPECT_EQ(gather_instruction->ToString(),
             "%gather = f32[10,9,7,6,30,29,28,27,26]{8,7,6,5,4,3,2,1,0} "
             "gather(f32[50,49,48,47,46]{4,3,2,1,0} %input_tensor, "
-            "s64[10,9,5,7,6]{4,3,2,1,0} %gather_indices), "
-            "output_window_dims={4,5,6,7,8}, elided_window_dims={}, "
-            "gather_dims_to_operand_dims={0,1,2,3,4}, "
-            "index_vector_dim=2, window_bounds={30,29,28,27,26}");
+            "s64[10,9,5,7,6]{4,3,2,1,0} %start_indices), "
+            "offset_dims={4,5,6,7,8}, collapsed_slice_dims={}, "
+            "start_index_map={0,1,2,3,4}, "
+            "index_vector_dim=2, slice_sizes={30,29,28,27,26}");
 }
 
 TEST_F(HloInstructionTest, StringifyScatter) {
diff --git a/tensorflow/compiler/xla/service/hlo_instructions.cc b/tensorflow/compiler/xla/service/hlo_instructions.cc
index 1d71a74..4fdf436 100644
--- a/tensorflow/compiler/xla/service/hlo_instructions.cc
+++ b/tensorflow/compiler/xla/service/hlo_instructions.cc
@@ -359,6 +359,67 @@
       cross_replica_sum_barrier(), all_reduce_id());
 }
 
+HloAllToAllInstruction::HloAllToAllInstruction(
+    const Shape& shape, tensorflow::gtl::ArraySlice<HloInstruction*> operands,
+    const std::vector<ReplicaGroup>& replica_groups,
+    tensorflow::StringPiece barrier)
+    : HloInstruction(HloOpcode::kAllToAll, shape),
+      replica_groups_(replica_groups),
+      cross_replica_sum_barrier_(barrier.begin(), barrier.end()) {
+  for (auto operand : operands) {
+    AppendOperand(operand);
+  }
+}
+
+bool HloAllToAllInstruction::IdenticalSlowPath(
+    const HloInstruction& other,
+    const std::function<bool(const HloComputation*, const HloComputation*)>&
+        eq_computations) const {
+  const auto& casted_other = static_cast<const HloAllToAllInstruction&>(other);
+  return ContainersEqual(replica_groups(), casted_other.replica_groups(),
+                         [](const ReplicaGroup& a, const ReplicaGroup& b) {
+                           return ContainersEqual(a.replica_ids(),
+                                                  b.replica_ids());
+                         }) &&
+         cross_replica_sum_barrier() ==
+             casted_other.cross_replica_sum_barrier();
+}
+
+std::unique_ptr<HloInstruction>
+HloAllToAllInstruction::CloneWithNewOperandsImpl(
+    const Shape& shape,
+    tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
+    HloCloneContext* /*context*/) const {
+  return MakeUnique<HloAllToAllInstruction>(
+      shape, new_operands, replica_groups(), cross_replica_sum_barrier());
+}
+
+std::vector<string> HloAllToAllInstruction::ExtraAttributesToStringImpl(
+    const HloPrintOptions& options) const {
+  std::vector<string> result;
+  std::vector<string> replica_group_str;
+  for (const ReplicaGroup& group : replica_groups()) {
+    replica_group_str.push_back(
+        StrCat("{", Join(group.replica_ids(), ","), "}"));
+  }
+  result.push_back(
+      StrCat("replica_groups={", Join(replica_group_str, ","), "}"));
+
+  if (!cross_replica_sum_barrier().empty()) {
+    result.push_back(StrCat("barrier=\"", cross_replica_sum_barrier(), "\""));
+  }
+
+  return result;
+}
+
+HloInstructionProto HloAllToAllInstruction::ToProto() const {
+  HloInstructionProto proto = HloInstruction::ToProto();
+  *proto.mutable_replica_groups() = {replica_groups_.begin(),
+                                     replica_groups_.end()};
+  proto.set_cross_replica_sum_barrier(cross_replica_sum_barrier_);
+  return proto;
+}
+
 HloReverseInstruction::HloReverseInstruction(
     const Shape& shape, HloInstruction* operand,
     tensorflow::gtl::ArraySlice<int64> dimensions)
@@ -1467,13 +1528,6 @@
   AppendOperand(token_operand);
 }
 
-HloInfeedInstruction::HloInfeedInstruction(const Shape& infeed_shape,
-                                           const string& config)
-    : HloInstruction(HloOpcode::kInfeed,
-                     ShapeUtil::MakeTupleShape(
-                         {infeed_shape, ShapeUtil::MakeTokenShape()})),
-      infeed_config_(config) {}
-
 HloInstructionProto HloInfeedInstruction::ToProto() const {
   HloInstructionProto proto = HloInstruction::ToProto();
   proto.set_infeed_config(infeed_config_);
@@ -1500,13 +1554,9 @@
     const Shape& shape,
     tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
     HloCloneContext* context) const {
-  if (new_operands.empty()) {
-    return MakeUnique<HloInfeedInstruction>(infeed_shape(), infeed_config());
-  } else {
-    CHECK_EQ(new_operands.size(), 1);
-    return MakeUnique<HloInfeedInstruction>(infeed_shape(), new_operands[0],
-                                            infeed_config());
-  }
+  CHECK_EQ(new_operands.size(), 1);
+  return MakeUnique<HloInfeedInstruction>(infeed_shape(), new_operands[0],
+                                          infeed_config());
 }
 
 HloOutfeedInstruction::HloOutfeedInstruction(
@@ -1522,18 +1572,6 @@
   AppendOperand(token_operand);
 }
 
-HloOutfeedInstruction::HloOutfeedInstruction(
-    const Shape& outfeed_shape, HloInstruction* operand,
-    tensorflow::StringPiece outfeed_config)
-    : HloInstruction(HloOpcode::kOutfeed, ShapeUtil::MakeTokenShape()),
-      outfeed_shape_(outfeed_shape),
-      outfeed_config_(outfeed_config.begin(), outfeed_config.end()) {
-  CHECK(ShapeUtil::Compatible(operand->shape(), outfeed_shape))
-      << "Outfeed shape " << outfeed_shape
-      << " must be compatible with operand shape " << operand->shape();
-  AppendOperand(operand);
-}
-
 HloInstructionProto HloOutfeedInstruction::ToProto() const {
   HloInstructionProto proto = HloInstruction::ToProto();
   proto.set_outfeed_config(outfeed_config());
@@ -1561,22 +1599,19 @@
     const Shape& shape,
     tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
     HloCloneContext* context) const {
-  if (new_operands.size() == 1) {
-    return MakeUnique<HloOutfeedInstruction>(outfeed_shape(), new_operands[0],
-                                             outfeed_config());
-  } else {
-    CHECK_EQ(new_operands.size(), 2);
-    return MakeUnique<HloOutfeedInstruction>(outfeed_shape(), new_operands[0],
-                                             new_operands[1], outfeed_config());
-  }
+  CHECK_EQ(new_operands.size(), 2);
+  return MakeUnique<HloOutfeedInstruction>(outfeed_shape(), new_operands[0],
+                                           new_operands[1], outfeed_config());
 }
 
 HloConvolutionInstruction::HloConvolutionInstruction(
     const Shape& shape, HloInstruction* lhs, HloInstruction* rhs,
-    const Window& window, const ConvolutionDimensionNumbers& dimension_numbers)
+    const Window& window, const ConvolutionDimensionNumbers& dimension_numbers,
+    int64 feature_group_count)
     : HloInstruction(HloOpcode::kConvolution, shape),
       window_(window),
-      convolution_dimension_numbers_(dimension_numbers) {
+      convolution_dimension_numbers_(dimension_numbers),
+      feature_group_count_(feature_group_count) {
   if (window_util::HasBaseDilation(window)) {
     SetAndSanitizeName(StrCat(name(), "-base-dilated"));
   }
@@ -1614,6 +1649,7 @@
   }
   extra.push_back(StrCat("dim_labels=", ConvolutionDimensionNumbersToString(
                                             convolution_dimension_numbers_)));
+  extra.push_back(StrCat("feature_group_count=", feature_group_count_));
   return extra;
 }
 
@@ -1635,9 +1671,9 @@
     tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
     HloCloneContext* context) const {
   CHECK_EQ(new_operands.size(), 2);
-  return MakeUnique<HloConvolutionInstruction>(shape, new_operands[0],
-                                               new_operands[1], window(),
-                                               convolution_dimension_numbers_);
+  return MakeUnique<HloConvolutionInstruction>(
+      shape, new_operands[0], new_operands[1], window(),
+      convolution_dimension_numbers_, feature_group_count_);
 }
 
 HloReduceWindowInstruction::HloReduceWindowInstruction(
@@ -1929,51 +1965,50 @@
 }
 
 HloGatherInstruction::HloGatherInstruction(
-    const Shape& shape, HloInstruction* operand, HloInstruction* gather_indices,
+    const Shape& shape, HloInstruction* operand, HloInstruction* start_indices,
     const GatherDimensionNumbers& gather_dim_numbers,
-    tensorflow::gtl::ArraySlice<int64> window_bounds)
+    tensorflow::gtl::ArraySlice<int64> slice_sizes)
     : HloInstruction(HloOpcode::kGather, shape) {
   AppendOperand(operand);
-  AppendOperand(gather_indices);
+  AppendOperand(start_indices);
   gather_dimension_numbers_ =
       MakeUnique<GatherDimensionNumbers>(gather_dim_numbers);
-  c_copy(window_bounds, std::back_inserter(gather_window_bounds_));
+  c_copy(slice_sizes, std::back_inserter(gather_slice_sizes_));
 }
 
 string HloGatherInstruction::GatherDimensionNumbersToString() const {
   CHECK(gather_dimension_numbers_ != nullptr);
-  string output_window_dims =
-      StrCat("output_window_dims={",
-             Join(gather_dimension_numbers_->output_window_dims(), ","), "}");
-  string elided_window_dims =
-      StrCat("elided_window_dims={",
-             Join(gather_dimension_numbers_->elided_window_dims(), ","), "}");
-  string gather_dims_to_operand_dims = StrCat(
-      "gather_dims_to_operand_dims={",
-      Join(gather_dimension_numbers_->gather_dims_to_operand_dims(), ","), "}");
+  string offset_dims =
+      StrCat("offset_dims={",
+             Join(gather_dimension_numbers_->offset_dims(), ","), "}");
+  string collapsed_slice_dims =
+      StrCat("collapsed_slice_dims={",
+             Join(gather_dimension_numbers_->collapsed_slice_dims(), ","), "}");
+  string start_index_map =
+      StrCat("start_index_map={",
+             Join(gather_dimension_numbers_->start_index_map(), ","), "}");
   string index_vector_dim = StrCat(
       "index_vector_dim=", gather_dimension_numbers_->index_vector_dim());
 
   return Join<std::initializer_list<string>>(
-      {output_window_dims, elided_window_dims, gather_dims_to_operand_dims,
-       index_vector_dim},
+      {offset_dims, collapsed_slice_dims, start_index_map, index_vector_dim},
       ", ");
 }
 
 /* static */ GatherDimensionNumbers HloGatherInstruction::MakeGatherDimNumbers(
-    tensorflow::gtl::ArraySlice<int64> output_window_dims,
-    tensorflow::gtl::ArraySlice<int64> elided_window_dims,
-    tensorflow::gtl::ArraySlice<int64> gather_dims_to_operand_dims,
+    tensorflow::gtl::ArraySlice<int64> offset_dims,
+    tensorflow::gtl::ArraySlice<int64> collapsed_slice_dims,
+    tensorflow::gtl::ArraySlice<int64> start_index_map,
     int64 index_vector_dim) {
   GatherDimensionNumbers gather_dim_numbers;
-  for (int64 output_window_dim : output_window_dims) {
-    gather_dim_numbers.add_output_window_dims(output_window_dim);
+  for (int64 output_window_dim : offset_dims) {
+    gather_dim_numbers.add_offset_dims(output_window_dim);
   }
-  for (int64 elided_window_dim : elided_window_dims) {
-    gather_dim_numbers.add_elided_window_dims(elided_window_dim);
+  for (int64 elided_window_dim : collapsed_slice_dims) {
+    gather_dim_numbers.add_collapsed_slice_dims(elided_window_dim);
   }
-  for (int64 gather_dim_to_input_dim : gather_dims_to_operand_dims) {
-    gather_dim_numbers.add_gather_dims_to_operand_dims(gather_dim_to_input_dim);
+  for (int64 gather_dim_to_input_dim : start_index_map) {
+    gather_dim_numbers.add_start_index_map(gather_dim_to_input_dim);
   }
 
   gather_dim_numbers.set_index_vector_dim(index_vector_dim);
@@ -1983,8 +2018,8 @@
 HloInstructionProto HloGatherInstruction::ToProto() const {
   HloInstructionProto proto = HloInstruction::ToProto();
   *proto.mutable_gather_dimension_numbers() = gather_dimension_numbers();
-  for (int64 bound : gather_window_bounds()) {
-    proto.add_gather_window_bounds(bound);
+  for (int64 bound : gather_slice_sizes()) {
+    proto.add_gather_slice_sizes(bound);
   }
   return proto;
 }
@@ -1992,7 +2027,7 @@
 std::vector<string> HloGatherInstruction::ExtraAttributesToStringImpl(
     const HloPrintOptions& options) const {
   return {GatherDimensionNumbersToString(),
-          StrCat("window_bounds={", Join(gather_window_bounds(), ","), "}")};
+          StrCat("slice_sizes={", Join(gather_slice_sizes(), ","), "}")};
 }
 
 bool HloGatherInstruction::IdenticalSlowPath(
@@ -2003,7 +2038,7 @@
   return protobuf_util::ProtobufEquals(
              gather_dimension_numbers(),
              casted_other.gather_dimension_numbers()) &&
-         gather_window_bounds() == casted_other.gather_window_bounds();
+         gather_slice_sizes() == casted_other.gather_slice_sizes();
 }
 
 std::unique_ptr<HloInstruction> HloGatherInstruction::CloneWithNewOperandsImpl(
@@ -2013,7 +2048,7 @@
   CHECK_EQ(new_operands.size(), 2);
   return MakeUnique<HloGatherInstruction>(
       shape, new_operands[0], new_operands[1], gather_dimension_numbers(),
-      gather_window_bounds());
+      gather_slice_sizes());
 }
 
 HloScatterInstruction::HloScatterInstruction(
diff --git a/tensorflow/compiler/xla/service/hlo_instructions.h b/tensorflow/compiler/xla/service/hlo_instructions.h
index b038822..803dbea 100644
--- a/tensorflow/compiler/xla/service/hlo_instructions.h
+++ b/tensorflow/compiler/xla/service/hlo_instructions.h
@@ -273,6 +273,47 @@
   tensorflow::gtl::optional<int64> all_reduce_id_;
 };
 
+class HloAllToAllInstruction : public HloInstruction {
+ public:
+  explicit HloAllToAllInstruction(
+      const Shape& shape, tensorflow::gtl::ArraySlice<HloInstruction*> operand,
+      const std::vector<ReplicaGroup>& replica_groups,
+      tensorflow::StringPiece barrier);
+
+  const std::vector<ReplicaGroup>& replica_groups() const {
+    return replica_groups_;
+  }
+
+  // TODO(b/110096724): rename this.
+  void set_cross_replica_sum_barrier(string barrier) {
+    cross_replica_sum_barrier_ = barrier;
+  }
+  string cross_replica_sum_barrier() const {
+    return cross_replica_sum_barrier_;
+  }
+
+  HloInstructionProto ToProto() const override;
+
+ private:
+  std::vector<string> ExtraAttributesToStringImpl(
+      const HloPrintOptions& options) const override;
+  bool IdenticalSlowPath(
+      const HloInstruction& other,
+      const std::function<bool(const HloComputation*, const HloComputation*)>&
+          eq_computations) const override;
+
+  // Implementation for non-common logic of CloneWithNewOperands.
+  std::unique_ptr<HloInstruction> CloneWithNewOperandsImpl(
+      const Shape& shape,
+      tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
+      HloCloneContext* context) const override;
+
+  std::vector<ReplicaGroup> replica_groups_;
+
+  // The string representation of the barrier config.
+  string cross_replica_sum_barrier_;
+};
+
 class HloReverseInstruction : public HloInstruction {
  public:
   explicit HloReverseInstruction(const Shape& shape, HloInstruction* operand,
@@ -340,6 +381,18 @@
   // Returns a serialized representation of this instruction.
   HloInstructionProto ToProto() const override;
 
+  // Returns the input tensors to be reduced.
+  tensorflow::gtl::ArraySlice<HloInstruction*> inputs() const {
+    return tensorflow::gtl::ArraySlice<HloInstruction*>(operands(), 0,
+                                                        operand_count() / 2);
+  }
+
+  // Returns the init values of the reduction.
+  tensorflow::gtl::ArraySlice<HloInstruction*> init_values() const {
+    return tensorflow::gtl::ArraySlice<HloInstruction*>(
+        operands(), operand_count() / 2, operand_count());
+  }
+
  private:
   std::vector<string> ExtraAttributesToStringImpl(
       const HloPrintOptions& options) const override;
@@ -830,10 +883,6 @@
   explicit HloInfeedInstruction(const Shape& infeed_shape,
                                 HloInstruction* token_operand,
                                 const string& config);
-  // TODO(b/80000000): Remove this constructor when all uses of infeed are
-  // converted to take tokens.
-  explicit HloInfeedInstruction(const Shape& infeed_shape,
-                                const string& config);
   // Returns the infeed configuration string. The infeed configuration includes
   // any metadata needed for the backend compiler (e.g., infeed buffer address)
   // and is target-dependent.
@@ -872,12 +921,6 @@
                                  HloInstruction* operand,
                                  HloInstruction* token_operand,
                                  tensorflow::StringPiece outfeed_config);
-  // TODO(b/80000000): Remove this constructor when all uses of outfeed are
-  // converted to take tokens.
-  explicit HloOutfeedInstruction(const Shape& outfeed_shape,
-                                 HloInstruction* operand,
-                                 tensorflow::StringPiece outfeed_config);
-
   // Returns the shape for the Outfeed instruction.
   const Shape& outfeed_shape() const {
     TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(outfeed_shape_));
@@ -912,7 +955,8 @@
   explicit HloConvolutionInstruction(
       const Shape& shape, HloInstruction* lhs, HloInstruction* rhs,
       const Window& window,
-      const ConvolutionDimensionNumbers& dimension_numbers);
+      const ConvolutionDimensionNumbers& dimension_numbers,
+      int64 feature_group_count);
   const Window& window() const override { return window_; }
   void set_window(const Window& window) override { window_ = window; }
   const ConvolutionDimensionNumbers& convolution_dimension_numbers() const {
@@ -922,6 +966,9 @@
       const ConvolutionDimensionNumbers& dnums) {
     convolution_dimension_numbers_ = dnums;
   }
+  // The number of feature groups. Must be a divisor of the input feature
+  // dimension and output feature dimension.
+  int64 feature_group_count() const { return feature_group_count_; }
   string ToCategory() const override;
   // Returns a serialized representation of this instruction.
   HloInstructionProto ToProto() const override;
@@ -941,6 +988,9 @@
   Window window_;
   // Describes the dimension numbers used for a convolution.
   ConvolutionDimensionNumbers convolution_dimension_numbers_;
+  // The number of feature groups. Must be a divisor of the input feature
+  // dimension and output feature dimension.
+  int64 feature_group_count_;
 };
 
 class HloReduceWindowInstruction : public HloInstruction {
@@ -1162,15 +1212,15 @@
  public:
   explicit HloGatherInstruction(
       const Shape& shape, HloInstruction* operand,
-      HloInstruction* gather_indices,
+      HloInstruction* start_indices,
       const GatherDimensionNumbers& gather_dim_numbers,
-      tensorflow::gtl::ArraySlice<int64> window_bounds);
+      tensorflow::gtl::ArraySlice<int64> slice_sizes);
   const GatherDimensionNumbers& gather_dimension_numbers() const {
     CHECK(gather_dimension_numbers_ != nullptr);
     return *gather_dimension_numbers_;
   }
-  tensorflow::gtl::ArraySlice<int64> gather_window_bounds() const {
-    return gather_window_bounds_;
+  tensorflow::gtl::ArraySlice<int64> gather_slice_sizes() const {
+    return gather_slice_sizes_;
   }
   // Returns the dump string of the gather dimension numbers.
   string GatherDimensionNumbersToString() const;
@@ -1179,9 +1229,9 @@
 
   // Creates an instance of GatherDimensionNumbers.
   static GatherDimensionNumbers MakeGatherDimNumbers(
-      tensorflow::gtl::ArraySlice<int64> output_window_dims,
-      tensorflow::gtl::ArraySlice<int64> elided_window_dims,
-      tensorflow::gtl::ArraySlice<int64> gather_dims_to_operand_dims,
+      tensorflow::gtl::ArraySlice<int64> offset_dims,
+      tensorflow::gtl::ArraySlice<int64> collapsed_slice_dims,
+      tensorflow::gtl::ArraySlice<int64> start_index_map,
       int64 index_vector_dim);
 
  private:
@@ -1197,7 +1247,7 @@
       HloCloneContext* context) const override;
 
   std::unique_ptr<GatherDimensionNumbers> gather_dimension_numbers_;
-  std::vector<int64> gather_window_bounds_;
+  std::vector<int64> gather_slice_sizes_;
 };
 
 class HloScatterInstruction : public HloInstruction {
diff --git a/tensorflow/compiler/xla/service/hlo_lexer.cc b/tensorflow/compiler/xla/service/hlo_lexer.cc
index 71b4450..8e0d38b 100644
--- a/tensorflow/compiler/xla/service/hlo_lexer.cc
+++ b/tensorflow/compiler/xla/service/hlo_lexer.cc
@@ -143,8 +143,47 @@
         return TokKind::kLparen;
       case ')':
         return TokKind::kRparen;
-      case '/':
-        return LexComment();
+      case '/': {
+        if (PeekCurrentChar() == '*') {
+          // This is the start of a /*...*/ delimited comment. Save the current
+          // location in case the comment is unterminated so the error message
+          // will point to the beginning of the comment.
+          const char* comment_start = current_ptr_;
+          current_ptr_++;
+          // Advance until '*/' is found.
+          while (true) {
+            int current = GetNextChar();
+            if (current == '*' && PeekCurrentChar() == '/') {
+              // End of comment.
+              current_ptr_++;
+              break;
+            }
+            if (current == kEOF) {
+              // Unterminated comment.
+              current_ptr_ = comment_start;
+              return TokKind::kError;
+            }
+          }
+          // Return no token for the comment. Keep lexing.
+          continue;
+        } else if (PeekCurrentChar() == '/') {
+          // This is the start of a '//' delimited comment. Throw away
+          // everything until end of line or file. The end-of-line character(s)
+          // are left unlexed in the buffer which is harmless because these are
+          // skipped later by the lexer. This approach enables support for
+          // different end-of-line encodings.
+          while (true) {
+            int current = PeekCurrentChar();
+            if (current == kEOF || current == '\n' || current == '\r') {
+              break;
+            }
+            current_ptr_++;
+          }
+          continue;
+        }
+        // A lone '/' is an error.
+        return TokKind::kError;
+      }
       case '"':
         return LexString();
     }
@@ -357,16 +396,6 @@
   return StringPieceFromPointers(start, end);
 }
 
-TokKind HloLexer::LexComment() {
-  auto consumable = RegexpStringPieceFromPointers(token_start_, buf_.end());
-  static LazyRE2 comment_pattern = {R"(\/\*.*?\*\/)"};
-  if (RE2::Consume(&consumable, *comment_pattern)) {
-    current_ptr_ = consumable.begin();
-    return TokKind::kComment;
-  }
-  return TokKind::kError;
-}
-
 // Lexes quoted string with escaping characters. If matched, the quoted string
 // will be unescaped and stored to str_val_.
 TokKind HloLexer::LexString() {
@@ -412,8 +441,6 @@
       return "kRparen";
     case TokKind::kArrow:
       return "kArrow";
-    case TokKind::kComment:
-      return "kComment";
     case TokKind::kw_HloModule:
       return "kw_HloModule";
     case TokKind::kw_ENTRY:
diff --git a/tensorflow/compiler/xla/service/hlo_lexer.h b/tensorflow/compiler/xla/service/hlo_lexer.h
index ceb674f..003ac34 100644
--- a/tensorflow/compiler/xla/service/hlo_lexer.h
+++ b/tensorflow/compiler/xla/service/hlo_lexer.h
@@ -105,7 +105,6 @@
   TokKind LexShape();
   TokKind LexConstant();
   TokKind LexNumberOrPattern();
-  TokKind LexComment();
   TokKind LexString();
 
   const tensorflow::StringPiece buf_;
diff --git a/tensorflow/compiler/xla/service/hlo_matchers.h b/tensorflow/compiler/xla/service/hlo_matchers.h
index b57c940..c577b43 100644
--- a/tensorflow/compiler/xla/service/hlo_matchers.h
+++ b/tensorflow/compiler/xla/service/hlo_matchers.h
@@ -231,6 +231,7 @@
 HLO_MATCHER(Trace);
 HLO_MATCHER(Transpose);
 HLO_MATCHER(Tuple);
+HLO_MATCHER(TupleSelect);
 HLO_MATCHER(While);
 
 // The special cases below let you check additional information about the
diff --git a/tensorflow/compiler/xla/service/hlo_matchers_test.cc b/tensorflow/compiler/xla/service/hlo_matchers_test.cc
index 7de59ac..7961aec 100644
--- a/tensorflow/compiler/xla/service/hlo_matchers_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_matchers_test.cc
@@ -157,9 +157,8 @@
   Array<int64> assignment({2});
   assignment.SetValues({0, 1});
   auto sharding = HloSharding::Tuple(
-      tuple_shape,
-      {HloSharding::Tile(ShapeUtil::MakeShape(F32, {5}), assignment),
-       HloSharding::AssignDevice(1), HloSharding::Replicate()});
+      tuple_shape, {HloSharding::Tile(assignment), HloSharding::AssignDevice(1),
+                    HloSharding::Replicate()});
   p2->set_sharding(sharding);
 
   EXPECT_THAT(p0.get(), op::NoSharding());
@@ -172,8 +171,7 @@
 
   EXPECT_THAT(
       p2.get(),
-      op::Sharding(
-          "{{f32[5] devices=[2]0,1}, {maximal device=1}, {replicated}}"));
+      op::Sharding("{{devices=[2]0,1}, {maximal device=1}, {replicated}}"));
 
   EXPECT_THAT(Explain(p0.get(), op::Sharding(HloSharding::AssignDevice(1))),
               "%param.0 = f32[5]{0} parameter(0) has no sharding (expected: "
diff --git a/tensorflow/compiler/xla/service/hlo_module_group_metadata.h b/tensorflow/compiler/xla/service/hlo_module_group_metadata.h
index 84f2d3f..1b256cd 100644
--- a/tensorflow/compiler/xla/service/hlo_module_group_metadata.h
+++ b/tensorflow/compiler/xla/service/hlo_module_group_metadata.h
@@ -166,7 +166,7 @@
   //
   // Precondition: IsCompanionWhile(instruction) is true.
   const std::unordered_set<HloInstruction*>& Companions(
-      HloInstruction* instruction) const {
+      const HloInstruction* instruction) const {
     CHECK_EQ(companion_set_index_.count(instruction), 1);
     return companion_set(companion_set_index_.at(instruction));
   }
@@ -243,7 +243,7 @@
       companion_sets_;
 
   // Map from each companion while instruction to the index into companion_set_.
-  tensorflow::gtl::FlatMap<HloInstruction*, int64> companion_set_index_;
+  tensorflow::gtl::FlatMap<const HloInstruction*, int64> companion_set_index_;
 
   // Map from computation to the instruction using it (a kWhile, kConditional).
   tensorflow::gtl::FlatMap<const HloComputation*, TrackedInstruction>
diff --git a/tensorflow/compiler/xla/service/hlo_module_group_util.cc b/tensorflow/compiler/xla/service/hlo_module_group_util.cc
index 9fd0ade..0dc5676 100644
--- a/tensorflow/compiler/xla/service/hlo_module_group_util.cc
+++ b/tensorflow/compiler/xla/service/hlo_module_group_util.cc
@@ -29,6 +29,7 @@
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
@@ -37,24 +38,38 @@
 
 std::vector<HloInstruction*> HloModuleGroupUtil::GlobalPredecessors(
     HloInstruction* instruction) {
-  std::vector<HloInstruction*> predecessors;
+  std::vector<HloInstruction*>
+      predecessors;  // Use a vector to avoid non-determinism.
+  tensorflow::gtl::FlatSet<HloInstruction*> unique;
 
-  // Adds to the unique predecessors list and also add companion instructions
-  // if the given predecessor has those.
+  // Adds to the unique predecessors list; if the predecessors is a companion
+  // instruction, also add companion instructions; if the predecessors is a
+  // cross-module all-reduce, also add the all-reduce instructions in the same
+  // group.
   auto add_unique_predecessor = [&](HloInstruction* predecessor) {
-    if (std::find(predecessors.begin(), predecessors.end(), predecessor) !=
-        predecessors.end()) {
+    if (unique.find(predecessor) != unique.end()) {
       return;
     }
-    if (!metadata_.IsCompanionInstruction(predecessor)) {
-      predecessors.push_back(predecessor);
+    if (metadata_.IsCompanionInstruction(predecessor)) {
+      for (HloInstruction* instr : metadata_.Companions(predecessor)) {
+        if (unique.insert(instr).second) {
+          predecessors.push_back(instr);
+        }
+      }
       return;
     }
-    for (HloInstruction* companion : metadata_.Companions(predecessor)) {
-      predecessors.push_back(companion);
+    if (predecessor->IsCrossModuleAllReduce()) {
+      for (HloInstruction* instr :
+           metadata_.GetAllReduceGroup(*predecessor->all_reduce_id())) {
+        if (unique.insert(instr).second) {
+          predecessors.push_back(instr);
+        }
+      }
+      return;
     }
+    unique.insert(predecessor);
+    predecessors.push_back(predecessor);
   };
-
   // If the given instruction is a companion instruction, we need to find the
   // predecessors of all of its companion instructions. If the instruction is an
   // all-reduce, we need to find the predecessors of all the peer all-reduce
@@ -98,22 +113,37 @@
 
 std::vector<HloInstruction*> HloModuleGroupUtil::GlobalSuccessors(
     HloInstruction* instruction) {
-  std::vector<HloInstruction*> successors;
+  std::vector<HloInstruction*>
+      successors;  // Use a vector to avoid non-determinism.
+  tensorflow::gtl::FlatSet<HloInstruction*> unique;
 
-  // Adds to the unique successors list and also add companion instructions
-  // if the given successor has those.
+  // Adds to the unique successors list; if the successor is a companion
+  // instruction, also add companion instructions; if the successor is a
+  // cross-module all-reduce, also add the all-reduce instructions in the same
+  // group.
   auto add_unique_successor = [&](HloInstruction* successor) {
-    if (std::find(successors.begin(), successors.end(), successor) !=
-        successors.end()) {
+    if (unique.find(successor) != unique.end()) {
       return;
     }
-    if (!metadata_.IsCompanionInstruction(successor)) {
-      successors.push_back(successor);
+    if (metadata_.IsCompanionInstruction(successor)) {
+      for (HloInstruction* instr : metadata_.Companions(successor)) {
+        if (unique.insert(instr).second) {
+          successors.push_back(instr);
+        }
+      }
       return;
     }
-    for (HloInstruction* companion : metadata_.Companions(successor)) {
-      successors.push_back(companion);
+    if (successor->IsCrossModuleAllReduce()) {
+      for (HloInstruction* instr :
+           metadata_.GetAllReduceGroup(*successor->all_reduce_id())) {
+        if (unique.insert(instr).second) {
+          successors.push_back(instr);
+        }
+      }
+      return;
     }
+    unique.insert(successor);
+    successors.push_back(successor);
   };
 
   // If the given instruction is a companion instruction, we need to find the
diff --git a/tensorflow/compiler/xla/service/hlo_opcode.h b/tensorflow/compiler/xla/service/hlo_opcode.h
index 88531b6..ec27986 100644
--- a/tensorflow/compiler/xla/service/hlo_opcode.h
+++ b/tensorflow/compiler/xla/service/hlo_opcode.h
@@ -47,6 +47,7 @@
 #define HLO_OPCODE_LIST(V)                                   \
   V(kAbs, "abs")                                             \
   V(kAdd, "add")                                             \
+  V(kAllToAll, "all-to-all")                                 \
   V(kAtan2, "atan2")                                         \
   V(kBatchNormGrad, "batch-norm-grad")                       \
   V(kBatchNormInference, "batch-norm-inference")             \
diff --git a/tensorflow/compiler/xla/service/hlo_parser.cc b/tensorflow/compiler/xla/service/hlo_parser.cc
index 93cc884..ab57a8b 100644
--- a/tensorflow/compiler/xla/service/hlo_parser.cc
+++ b/tensorflow/compiler/xla/service/hlo_parser.cc
@@ -125,6 +125,7 @@
     kFloat,
     kString,
     kBracedInt64List,
+    kBracedInt64ListList,
     kHloComputation,
     kFftType,
     kWindow,
@@ -205,6 +206,10 @@
   bool ParseInt64List(const TokKind start, const TokKind end,
                       const TokKind delim,
                       std::vector<tensorflow::int64>* result);
+  // 'parse_and_add_item' is an lambda to parse an element in the list and add
+  // the parsed element to the result. It's supposed to capture the result.
+  bool ParseList(const TokKind start, const TokKind end, const TokKind delim,
+                 const std::function<bool()>& parse_and_add_item);
 
   bool ParseParamListToShape(Shape* shape, LocTy* shape_loc);
   bool ParseParamList();
@@ -619,6 +624,28 @@
       }
       break;
     }
+    case HloOpcode::kAllToAll: {
+      optional<std::vector<std::vector<int64>>> tmp_groups;
+      optional<string> barrier;
+      attrs["replica_groups"] = {/*required=*/false,
+                                 AttrTy::kBracedInt64ListList, &tmp_groups};
+      attrs["barrier"] = {/*required=*/false, AttrTy::kString, &barrier};
+      if (!ParseOperands(&operands) || !ParseAttributes(attrs)) {
+        return false;
+      }
+      std::vector<ReplicaGroup> replica_groups;
+      if (tmp_groups) {
+        c_transform(*tmp_groups, std::back_inserter(replica_groups),
+                    [](const std::vector<int64>& ids) {
+                      ReplicaGroup group;
+                      *group.mutable_replica_ids() = {ids.begin(), ids.end()};
+                      return group;
+                    });
+      }
+      instruction = builder->AddInstruction(HloInstruction::CreateAllToAll(
+          shape, operands, replica_groups, barrier ? *barrier : ""));
+      break;
+    }
     case HloOpcode::kReshape: {
       if (!ParseOperands(&operands, /*expected_size=*/1) ||
           !ParseAttributes(attrs)) {
@@ -798,9 +825,12 @@
     case HloOpcode::kConvolution: {
       optional<Window> window;
       optional<ConvolutionDimensionNumbers> dnums;
+      optional<int64> feature_group_count;
       attrs["window"] = {/*required=*/false, AttrTy::kWindow, &window};
       attrs["dim_labels"] = {/*required=*/true,
                              AttrTy::kConvolutionDimensionNumbers, &dnums};
+      attrs["feature_group_count"] = {/*required=*/false, AttrTy::kInt64,
+                                      &feature_group_count};
       if (!ParseOperands(&operands, /*expected_size=*/2) ||
           !ParseAttributes(attrs)) {
         return false;
@@ -808,8 +838,12 @@
       if (!window) {
         window.emplace();
       }
+      if (!feature_group_count) {
+        feature_group_count = 1;
+      }
       instruction = builder->AddInstruction(HloInstruction::CreateConvolve(
-          shape, /*lhs=*/operands[0], /*rhs=*/operands[1], *window, *dnums));
+          shape, /*lhs=*/operands[0], /*rhs=*/operands[1], *window, *dnums,
+          feature_group_count.value()));
       break;
     }
     case HloOpcode::kFft: {
@@ -1046,7 +1080,8 @@
     case HloOpcode::kInfeed: {
       optional<string> config;
       attrs["infeed_config"] = {/*required=*/false, AttrTy::kString, &config};
-      if (!ParseOperands(&operands) || !ParseAttributes(attrs)) {
+      if (!ParseOperands(&operands, /*expected_size=*/1) ||
+          !ParseAttributes(attrs)) {
         return false;
       }
       // We need to know the infeed data shape to construct the infeed
@@ -1058,41 +1093,21 @@
         return Error(lexer_.GetLoc(),
                      "infeed must have a non-empty tuple shape");
       }
-
-      if (operands.empty()) {
-        // TODO(b/80000000): Remove this when all uses of infeed are
-        // converted to take tokens.
-        instruction = builder->AddInstruction(HloInstruction::CreateInfeed(
-            ShapeUtil::GetTupleElementShape(shape, 0), config ? *config : ""));
-      } else if (operands.size() == 1) {
-        instruction = builder->AddInstruction(HloInstruction::CreateInfeed(
-            ShapeUtil::GetTupleElementShape(shape, 0), operands[0],
-            config ? *config : ""));
-      } else {
-        return Error(lexer_.GetLoc(),
-                     "infeed must have exactly zero or one operands");
-      }
+      instruction = builder->AddInstruction(HloInstruction::CreateInfeed(
+          ShapeUtil::GetTupleElementShape(shape, 0), operands[0],
+          config ? *config : ""));
       break;
     }
     case HloOpcode::kOutfeed: {
       optional<string> config;
       attrs["outfeed_config"] = {/*required=*/false, AttrTy::kString, &config};
-      if (!ParseOperands(&operands) || !ParseAttributes(attrs)) {
+      if (!ParseOperands(&operands, /*expected_size=*/2) ||
+          !ParseAttributes(attrs)) {
         return false;
       }
-      if (operands.size() == 1) {
-        // TODO(b/80000000): Remove this when all uses of outfeed are
-        // converted to take tokens.
-        instruction = builder->AddInstruction(HloInstruction::CreateOutfeed(
-            operands[0]->shape(), operands[0], config ? *config : ""));
-      } else if (operands.size() == 2) {
-        instruction = builder->AddInstruction(
-            HloInstruction::CreateOutfeed(operands[0]->shape(), operands[0],
-                                          operands[1], config ? *config : ""));
-      } else {
-        return Error(lexer_.GetLoc(),
-                     "outfeed must have exactly one or two operands");
-      }
+      instruction = builder->AddInstruction(
+          HloInstruction::CreateOutfeed(operands[0]->shape(), operands[0],
+                                        operands[1], config ? *config : ""));
       break;
     }
     case HloOpcode::kRng: {
@@ -1218,22 +1233,21 @@
       break;
     }
     case HloOpcode::kGather: {
-      optional<std::vector<tensorflow::int64>> output_window_dims;
-      attrs["output_window_dims"] = {
-          /*required=*/true, AttrTy::kBracedInt64List, &output_window_dims};
-      optional<std::vector<tensorflow::int64>> elided_window_dims;
-      attrs["elided_window_dims"] = {
-          /*required=*/true, AttrTy::kBracedInt64List, &elided_window_dims};
-      optional<std::vector<tensorflow::int64>> gather_dims_to_operand_dims;
-      attrs["gather_dims_to_operand_dims"] = {/*required=*/true,
-                                              AttrTy::kBracedInt64List,
-                                              &gather_dims_to_operand_dims};
+      optional<std::vector<tensorflow::int64>> offset_dims;
+      attrs["offset_dims"] = {/*required=*/true, AttrTy::kBracedInt64List,
+                              &offset_dims};
+      optional<std::vector<tensorflow::int64>> collapsed_slice_dims;
+      attrs["collapsed_slice_dims"] = {
+          /*required=*/true, AttrTy::kBracedInt64List, &collapsed_slice_dims};
+      optional<std::vector<tensorflow::int64>> start_index_map;
+      attrs["start_index_map"] = {/*required=*/true, AttrTy::kBracedInt64List,
+                                  &start_index_map};
       optional<tensorflow::int64> index_vector_dim;
       attrs["index_vector_dim"] = {/*required=*/true, AttrTy::kInt64,
                                    &index_vector_dim};
-      optional<std::vector<tensorflow::int64>> window_bounds;
-      attrs["window_bounds"] = {/*required=*/true, AttrTy::kBracedInt64List,
-                                &window_bounds};
+      optional<std::vector<tensorflow::int64>> slice_sizes;
+      attrs["slice_sizes"] = {/*required=*/true, AttrTy::kBracedInt64List,
+                              &slice_sizes};
 
       if (!ParseOperands(&operands, /*expected_size=*/2) ||
           !ParseAttributes(attrs)) {
@@ -1242,14 +1256,14 @@
 
       GatherDimensionNumbers dim_numbers =
           HloGatherInstruction::MakeGatherDimNumbers(
-              /*output_window_dims=*/*output_window_dims,
-              /*elided_window_dims=*/*elided_window_dims,
-              /*gather_dims_to_operand_dims=*/*gather_dims_to_operand_dims,
+              /*offset_dims=*/*offset_dims,
+              /*collapsed_slice_dims=*/*collapsed_slice_dims,
+              /*start_index_map=*/*start_index_map,
               /*index_vector_dim=*/*index_vector_dim);
 
       instruction = builder->AddInstruction(HloInstruction::CreateGather(
-          shape, /*operand=*/operands[0], /*gather_indices=*/operands[1],
-          dim_numbers, *window_bounds));
+          shape, /*operand=*/operands[0], /*start_indices=*/operands[1],
+          dim_numbers, *slice_sizes));
       break;
     }
     case HloOpcode::kScatter: {
@@ -1383,7 +1397,6 @@
   bool replicated = false;
   std::vector<tensorflow::int64> devices;
   std::vector<tensorflow::int64> tile_assignment_dimensions;
-  Shape tile_shape;
   while (lexer_.GetKind() != TokKind::kRbrace) {
     switch (lexer_.GetKind()) {
       case TokKind::kw_maximal:
@@ -1434,7 +1447,8 @@
         break;
       }
       case TokKind::kShape:
-        tile_shape = lexer_.GetShapeVal();
+        // TODO(b/112302613): Left here for backward compatibility to ignore the
+        // removed tile shape data.
         lexer_.Lex();
         break;
       case TokKind::kRbrace:
@@ -1449,19 +1463,12 @@
       return Error(loc,
                    "replicated shardings should not have any devices assigned");
     }
-    if (!ShapeUtil::Equal(tile_shape, Shape())) {
-      return Error(loc,
-                   "replicated shardings should not have any tile shape set");
-    }
     sharding->set_type(OpSharding::Type::OpSharding_Type_REPLICATED);
   } else if (maximal) {
     if (devices.size() != 1) {
       return Error(loc,
                    "maximal shardings should have exactly one device assigned");
     }
-    if (!ShapeUtil::Equal(tile_shape, Shape())) {
-      return Error(loc, "maximal shardings should not have any tile shape set");
-    }
     sharding->set_type(OpSharding::Type::OpSharding_Type_MAXIMAL);
     sharding->add_tile_assignment_devices(devices[0]);
   } else {
@@ -1469,9 +1476,6 @@
       return Error(
           loc, "non-maximal shardings must have more than one device assigned");
     }
-    if (ShapeUtil::Equal(tile_shape, Shape())) {
-      return Error(loc, "non-maximal shardings should have a tile shape set");
-    }
     if (tile_assignment_dimensions.empty()) {
       return Error(
           loc,
@@ -1479,7 +1483,6 @@
           "dimensions");
     }
     sharding->set_type(OpSharding::Type::OpSharding_Type_OTHER);
-    *sharding->mutable_tile_shape() = tile_shape;
     for (tensorflow::int64 dim : tile_assignment_dimensions) {
       sharding->add_tile_assignment_dimensions(dim);
     }
@@ -1808,7 +1811,6 @@
         break;
       }
       case TokKind::kComma:
-      case TokKind::kComment:
         // Skip.
         lexer_.Lex();
         break;
@@ -2255,6 +2257,26 @@
             ->emplace(result);
         return true;
       }
+      case AttrTy::kBracedInt64ListList: {
+        std::vector<std::vector<tensorflow::int64>> result;
+        auto parse_and_add_item = [&]() {
+          std::vector<tensorflow::int64> item;
+          if (!ParseInt64List(TokKind::kLbrace, TokKind::kRbrace,
+                              TokKind::kComma, &item)) {
+            return false;
+          }
+          result.push_back(item);
+          return true;
+        };
+        if (!ParseList(TokKind::kLbrace, TokKind::kRbrace, TokKind::kComma,
+                       parse_and_add_item)) {
+          return false;
+        }
+        static_cast<optional<std::vector<std::vector<tensorflow::int64>>>*>(
+            attr_out_ptr)
+            ->emplace(result);
+        return true;
+      }
       case AttrTy::kSliceRanges: {
         SliceRanges result;
         if (!ParseSliceRanges(&result)) {
@@ -2597,6 +2619,26 @@
       end, StrCat("expects an int64 list to end with ", TokKindToString(end)));
 }
 
+bool HloParser::ParseList(const TokKind start, const TokKind end,
+                          const TokKind delim,
+                          const std::function<bool()>& parse_and_add_item) {
+  if (!ParseToken(start, StrCat("expects a list starting with ",
+                                TokKindToString(start)))) {
+    return false;
+  }
+  if (lexer_.GetKind() == end) {
+    // empty
+  } else {
+    do {
+      if (!parse_and_add_item()) {
+        return false;
+      }
+    } while (EatIfPresent(delim));
+  }
+  return ParseToken(
+      end, StrCat("expects a list to end with ", TokKindToString(end)));
+}
+
 // param_list_to_shape ::= param_list '->' shape
 bool HloParser::ParseParamListToShape(Shape* shape, LocTy* shape_loc) {
   if (!ParseParamList() || !ParseToken(TokKind::kArrow, "expects '->'")) {
diff --git a/tensorflow/compiler/xla/service/hlo_parser_test.cc b/tensorflow/compiler/xla/service/hlo_parser_test.cc
index 7344679..0d79193 100644
--- a/tensorflow/compiler/xla/service/hlo_parser_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_parser_test.cc
@@ -380,7 +380,7 @@
   %input = f32[1,2,1]{2,1,0} parameter(0)
   %copy = f32[1,2,1]{2,0,1} copy(f32[1,2,1]{2,1,0} %input)
   %filter = f32[1,1,1]{2,1,0} parameter(1)
-  ROOT %convolution = f32[1,2,1]{2,0,1} convolution(f32[1,2,1]{2,0,1} %copy, f32[1,1,1]{2,1,0} %filter), window={size=1}, dim_labels=b0f_0io->b0f
+  ROOT %convolution = f32[1,2,1]{2,0,1} convolution(f32[1,2,1]{2,0,1} %copy, f32[1,1,1]{2,1,0} %filter), window={size=1}, dim_labels=b0f_0io->b0f, feature_group_count=1
 }
 
 )"
@@ -393,7 +393,7 @@
 ENTRY %ConvolveR2.v3 (input: f32[1,2], filter: f32[1,1]) -> f32[1,2] {
   %input = f32[1,2]{1,0} parameter(0)
   %filter = f32[1,1]{1,0} parameter(1)
-  ROOT %convolution = f32[1,2]{0,1} convolution(f32[1,2]{1,0} %input, f32[1,1]{1,0} %filter), dim_labels=bf_io->bf
+  ROOT %convolution = f32[1,2]{0,1} convolution(f32[1,2]{1,0} %input, f32[1,1]{1,0} %filter), dim_labels=bf_io->bf, feature_group_count=1
 }
 
 )"
@@ -406,7 +406,7 @@
 ENTRY %ConvolveBackward (input: f32[128,7,7,512], filter: f32[3,3,512,512]) -> f32[128,14,14,512] {
   %input = f32[128,7,7,512]{0,3,2,1} parameter(0)
   %filter = f32[3,3,512,512]{3,2,1,0} parameter(1)
-  ROOT %convolution-base-dilated = f32[128,14,14,512]{0,3,2,1} convolution(f32[128,7,7,512]{0,3,2,1} %input, f32[3,3,512,512]{3,2,1,0} %filter), window={size=3x3 pad=1_2x1_2 lhs_dilate=2x2 rhs_reversal=1x1}, dim_labels=b01f_01oi->b01f
+  ROOT %convolution-base-dilated = f32[128,14,14,512]{0,3,2,1} convolution(f32[128,7,7,512]{0,3,2,1} %input, f32[3,3,512,512]{3,2,1,0} %filter), window={size=3x3 pad=1_2x1_2 lhs_dilate=2x2 rhs_reversal=1x1}, dim_labels=b01f_01oi->b01f, feature_group_count=1
 }
 
 )"
@@ -752,10 +752,10 @@
 "gather",
 R"(HloModule StringifyGather
 
-ENTRY %Gather (input_tensor: f32[50,49,48,47,46], gather_indices: s64[10,9,8,7,5]) -> f32[10,9,8,7,30,29,28,27,26] {
+ENTRY %Gather (input_tensor: f32[50,49,48,47,46], start_indices: s64[10,9,8,7,5]) -> f32[10,9,8,7,30,29,28,27,26] {
   %input_tensor = f32[50,49,48,47,46]{4,3,2,1,0} parameter(0)
-  %gather_indices = s64[10,9,8,7,5]{4,3,2,1,0} parameter(1)
-  ROOT %gather = f32[10,9,8,7,30,29,28,27,26]{8,7,6,5,4,3,2,1,0} gather(f32[50,49,48,47,46]{4,3,2,1,0} %input_tensor, s64[10,9,8,7,5]{4,3,2,1,0} %gather_indices), output_window_dims={4,5,6,7,8}, elided_window_dims={}, gather_dims_to_operand_dims={0,1,2,3,4}, index_vector_dim=4, window_bounds={30,29,28,27,26}
+  %start_indices = s64[10,9,8,7,5]{4,3,2,1,0} parameter(1)
+  ROOT %gather = f32[10,9,8,7,30,29,28,27,26]{8,7,6,5,4,3,2,1,0} gather(f32[50,49,48,47,46]{4,3,2,1,0} %input_tensor, s64[10,9,8,7,5]{4,3,2,1,0} %start_indices), offset_dims={4,5,6,7,8}, collapsed_slice_dims={}, start_index_map={0,1,2,3,4}, index_vector_dim=4, slice_sizes={30,29,28,27,26}
 }
 
 )"
@@ -1030,8 +1030,8 @@
 
 ENTRY Gather {
   input_tensor = f32[50,49,48,47,46]{4,3,2,1,0} parameter(0)
-  gather_indices = s64[10,9,8,7,5]{4,3,2,1,0} parameter(1)
-  ROOT gather = f32[10,9,8,7,30,29,28,27,26]{8,7,6,5,4,3,2,1,0} gather(input_tensor, gather_indices), output_window_dims={4,5,6,7,8}, elided_window_dims={}, gather_dims_to_operand_dims={0,1,2,3,4}, index_vector_dim=4, window_bounds={30,29,28,27,26}
+  start_indices = s64[10,9,8,7,5]{4,3,2,1,0} parameter(1)
+  ROOT gather = f32[10,9,8,7,30,29,28,27,26]{8,7,6,5,4,3,2,1,0} gather(input_tensor, start_indices), offset_dims={4,5,6,7,8}, collapsed_slice_dims={}, start_index_map={0,1,2,3,4}, index_vector_dim=4, slice_sizes={30,29,28,27,26}
 }
 
 )"
@@ -1072,6 +1072,30 @@
 
 )"
 },
+// all-to-all
+{
+"AllToAll",
+R"(HloModule AllToAll
+
+ENTRY AllToAll {
+  input = f32[128,32]{0,1} parameter(0)
+  ROOT a2a = f32[128,32]{0,1} all-to-all(input), replica_groups={}
+}
+
+)"
+},
+// all-to-all with subgroups
+{
+"AllToAllWithSubgroups",
+R"(HloModule AllToAllWithSubgroups
+
+ENTRY AllToAllWithSubgroups {
+  input = f32[128,32]{0,1} parameter(0)
+  ROOT a2a = f32[128,32]{0,1} all-to-all(input), replica_groups={{1,2},{3,0}}, barrier="abc"
+}
+
+)"
+},
 // Iota
 {
 "Iota",
@@ -1346,7 +1370,7 @@
   %input = f32[1,2,1]{2,1,0} parameter(0)
   %copy = f32[1,2,1]{2,0,1} copy(f32[1,2,1]{2,1,0} %input)
   %filter = f32[1,1,1]{2,1,0} parameter(1)
-  ROOT %convolution = f32[1,2,1]{2,0,1} convolution(f32[1,2,1]{2,0,1} %copy, f32[1,1,1]{2,1,0} %filter), sharding={maximal device=1}, backend_config="foo", dim_labels=b0f_0io->b0f, window={pad=1_1 size=2}
+  ROOT %convolution = f32[1,2,1]{2,0,1} convolution(f32[1,2,1]{2,0,1} %copy, f32[1,1,1]{2,1,0} %filter), feature_group_count=1, sharding={maximal device=1}, backend_config="foo", dim_labels=b0f_0io->b0f, window={pad=1_1 size=2}
 }
 
 )";
@@ -1536,6 +1560,81 @@
       "last");
 }
 
+TEST_F(HloParserTest, Comments) {
+  const string original = R"(/* module description. */
+HloModule comments:
+
+ENTRY /*comment*/ c1 {
+  /* blah */
+  ROOT const1 = /*foo*/f32[1]{0} constant({12345 /*bar*/})
+  /* comment */
+}
+
+/* something else */
+
+)";
+  auto module = ParseHloString(original);
+  TF_ASSERT_OK(module.status());
+}
+
+TEST_F(HloParserTest, MultilineComments) {
+  const string original = R"(HloModule multiline_comment:
+ENTRY c1 {
+  /*
+     ROOT foo = f32[1]{0} constant({12345})
+  */
+  ROOT const1 = f32[1]{0} constant({12345})
+/*
+a
+b
+c
+d
+
+*/
+})";
+  auto module = ParseHloString(original);
+  TF_ASSERT_OK(module.status());
+}
+
+TEST_F(HloParserTest, UnterminatedComment) {
+  const string original = R"(HloModule unterminated_comment:
+ENTRY c1 {
+/* unterminated
+  ROOT const1 = f32[1]{0} constant({12345})
+})";
+  // Verify that the error message points to the beginning of the unterminated
+  // comment.
+  ExpectHasSubstr(ParseHloString(original).status().error_message(),
+                  "/* unterminated\n^");
+}
+
+TEST_F(HloParserTest, SlashSlashComments) {
+  const string original = R"(HloModule slash_slash_comment:
+// Garbage
+ENTRY c1 {
+  // Foo bar
+  ROOT const1 = f32[1]{0} constant({12345}) // Something else
+})";
+  auto module = ParseHloString(original);
+  TF_ASSERT_OK(module.status());
+}
+
+TEST_F(HloParserTest, SlashSlashCommentMsDosEolFormat) {
+  const string original =
+      "HloModule slash_slash_comment:\r\n// Garbage\r\nENTRY c1 {\r\n// Foo "
+      "bar\r\nROOT const1 = f32[1]{0} constant({12345}) // Something else\r\n}";
+  auto module = ParseHloString(original);
+  TF_ASSERT_OK(module.status());
+}
+
+TEST_F(HloParserTest, SlashSlashCommentMacEolFormat) {
+  const string original =
+      "HloModule slash_slash_comment:\r// Garbage\rENTRY c1 {\r// Foo "
+      "bar\rROOT const1 = f32[1]{0} constant({12345}) // Something else\r}";
+  auto module = ParseHloString(original);
+  TF_ASSERT_OK(module.status());
+}
+
 TEST_F(HloParserTest, MultipleEntries) {
   const string original = R"(HloModule multiple_entries:
 ENTRY c1 {
diff --git a/tensorflow/compiler/xla/service/hlo_pass_fix.h b/tensorflow/compiler/xla/service/hlo_pass_fix.h
index 28194de..791b1a9 100644
--- a/tensorflow/compiler/xla/service/hlo_pass_fix.h
+++ b/tensorflow/compiler/xla/service/hlo_pass_fix.h
@@ -45,7 +45,7 @@
       ++iteration_count;
       if (iteration_count == limit) {
         LOG(ERROR)
-            << "Unexpectedly number of iterations in HLO passes ("
+            << "Unexpectedly high number of iterations in HLO passes ("
             << iteration_count
             << ")\nIf compilation hangs here, please file a bug with XLA.";
       }
diff --git a/tensorflow/compiler/xla/service/hlo_sharding.cc b/tensorflow/compiler/xla/service/hlo_sharding.cc
index 6399f6e..0cba9eb 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding.cc
+++ b/tensorflow/compiler/xla/service/hlo_sharding.cc
@@ -31,12 +31,9 @@
   CHECK_EQ(1, ShapeUtil::Rank(input_shape));
   CHECK_GT(num_tiles, 1);
   std::vector<int64> dimensions(1, num_tiles);
-  Shape tile_shape = input_shape;
-  auto& tile_dimension = (*tile_shape.mutable_dimensions())[0];
-  tile_dimension = CeilOfRatio(static_cast<int64>(tile_dimension), num_tiles);
   Array<int64> assignment(dimensions);
   std::iota(assignment.begin(), assignment.end(), 0);
-  return HloSharding(tile_shape, assignment);
+  return HloSharding(assignment);
 }
 
 HloSharding HloSharding::Tuple(const ShapeTree<HloSharding>& sub_shardings) {
@@ -104,8 +101,7 @@
     return StrCat(
         "{maximal device=", static_cast<int64>(*tile_assignment_.begin()), "}");
   } else {
-    return StrCat("{", ShapeUtil::HumanString(tile_shape_), " ", "devices=[",
-                  Join(tile_assignment_.dimensions(), ","), "]",
+    return StrCat("{devices=[", Join(tile_assignment_.dimensions(), ","), "]",
                   Join(tile_assignment_, ","), "}");
   }
 }
@@ -145,7 +141,6 @@
 }
 
 std::vector<int64> HloSharding::TileIndexForDevice(int64 device) const {
-  CHECK(!ShapeUtil::IsTuple(tile_shape_));
   CHECK(!maximal_);
   CHECK(!IsTuple());
   std::vector<int64> ret_index;
@@ -165,32 +160,43 @@
   if (maximal_) {
     return *tile_assignment_.begin();
   }
-  CHECK_EQ(ShapeUtil::Rank(tile_shape_), tile_assignment_.dimensions().size());
   return tile_assignment_(index);
 }
 
-std::vector<int64> HloSharding::TileOffsetForDevice(int64 device) const {
+std::vector<int64> HloSharding::TileOffsetForDevice(const Shape& shape,
+                                                    int64 device) const {
   CHECK(!IsTuple());
 
-  std::vector<int64> index = TileIndexForDevice(device);
   if (maximal_) {
-    // Index will always be all zeroes if we're maximal, and tile_shape_ is not
-    // valid.
-    return index;
+    return std::vector<int64>(shape.dimensions_size(), 0);
   }
+
+  CHECK_EQ(shape.dimensions_size(), tile_assignment_.num_dimensions());
+  std::vector<int64> index = TileIndexForDevice(device);
   for (int64 i = 0; i < index.size(); ++i) {
-    index[i] *= tile_shape_.dimensions(i);
+    const int64 shape_dim = shape.dimensions(i);
+    index[i] = std::min(
+        index[i] * CeilOfRatio(shape_dim, tile_assignment_.dim(i)), shape_dim);
   }
   return index;
 }
 
-std::vector<int64> HloSharding::TileLimitForDevice(int64 device) const {
+std::vector<int64> HloSharding::TileLimitForDevice(const Shape& shape,
+                                                   int64 device) const {
   CHECK(!IsTuple());
-  CHECK(!maximal_);  // Maximal shardings do not have a valid tile shape.
 
+  if (maximal_) {
+    return std::vector<int64>(shape.dimensions().begin(),
+                              shape.dimensions().end());
+  }
+
+  CHECK_EQ(shape.dimensions_size(), tile_assignment_.num_dimensions());
   std::vector<int64> index = TileIndexForDevice(device);
   for (int64 i = 0; i < index.size(); ++i) {
-    index[i] = (index[i] + 1) * tile_shape_.dimensions(i);
+    const int64 shape_dim = shape.dimensions(i);
+    index[i] = std::min(
+        (index[i] + 1) * CeilOfRatio(shape_dim, tile_assignment_.dim(i)),
+        shape_dim);
   }
   return index;
 }
@@ -336,11 +342,12 @@
     return Status::OK();
   }
 
-  // The tile rank must be the same as the input rank.
-  if (ShapeUtil::Rank(shape) != ShapeUtil::Rank(tile_shape_)) {
+  // The tile assignment tensor must have the same rank as the input.
+  if (ShapeUtil::Rank(shape) != tile_assignment_.num_dimensions()) {
     return tensorflow::errors::InvalidArgument(
-        "Tile rank is different to the input rank. sharding=", ToString(),
-        ", input_shape=", ShapeUtil::HumanString(shape));
+        "Number of tile assignment dimensions is different to the input rank. "
+        "sharding=",
+        ToString(), ", input_shape=", ShapeUtil::HumanString(shape));
   }
 
   // The correct constructor have to be used to create tile maximal shardings.
@@ -350,20 +357,6 @@
         "sharding was intended, use HloSharding::Replicated(). If a device "
         "placement was intended, use HloSharding::AssignDevice()");
   }
-
-  // The tile assignment tensor must contain enough element to cover the full
-  // shape with tiles of the specified size.
-  for (int64 i = 0, e = tile_assignment_.dimensions().size(); i != e; ++i) {
-    int64 total_tile_size = tile_assignment_.dim(i) * tile_shape_.dimensions(i);
-    if (shape.dimensions(i) > total_tile_size) {
-      return tensorflow::errors::InvalidArgument(
-          StrCat("Tile assignment tensor has too few element to cover the full "
-                 "shape. Dimension ",
-                 i, ", shape ", shape.dimensions(i), ", total size ",
-                 total_tile_size));
-    }
-  }
-
   return Status::OK();
 }
 
@@ -393,7 +386,7 @@
                          proto.tile_assignment_dimensions().end()));
   std::copy(proto.tile_assignment_devices().begin(),
             proto.tile_assignment_devices().end(), tile_assignment.begin());
-  return HloSharding(proto.tile_shape(), tile_assignment);
+  return HloSharding(tile_assignment);
 }
 
 OpSharding HloSharding::ToProto() const {
@@ -407,7 +400,6 @@
     return result;
   }
 
-  *result.mutable_tile_shape() = tile_shape_;
   for (int64 dim : tile_assignment_.dimensions()) {
     result.add_tile_assignment_dimensions(dim);
   }
@@ -424,30 +416,16 @@
   return result;
 }
 
-HloSharding HloSharding::TransformShardedTileShape(
-    const Shape& new_shape,
-    const std::function<int64(int64, int64)>& transform) const {
-  CHECK(!IsTuple());
+Shape HloSharding::TileShape(const Shape& shape) const {
   if (IsTileMaximal()) {
-    return *this;
+    return shape;
   }
-  CHECK_EQ(ShapeUtil::Rank(new_shape), ShapeUtil::Rank(tile_shape()));
-  Shape new_tile_shape;
-  new_tile_shape.set_element_type(tile_shape().element_type());
-  for (int64 i = 0; i < ShapeUtil::Rank(new_shape); ++i) {
-    int64 dim;
-    if (tile_assignment().dim(i) == 1) {
-      dim = new_shape.dimensions(i);
-    } else if (transform) {
-      dim = transform(i, tile_shape().dimensions(i));
-    } else {
-      dim = tile_shape().dimensions(i);
-    }
-    new_tile_shape.add_dimensions(dim);
+  Shape result_shape = shape;
+  for (int64 i = 0; i < shape.dimensions_size(); ++i) {
+    (*result_shape.mutable_dimensions())[i] =
+        CeilOfRatio<int64>(shape.dimensions(i), tile_assignment_.dim(i));
   }
-  TF_CHECK_OK(
-      LayoutUtil::CopyLayoutBetweenShapes(tile_shape_, &new_tile_shape));
-  return HloSharding::Tile(new_tile_shape, tile_assignment());
+  return result_shape;
 }
 
 HloSharding HloSharding::GetSubSharding(const Shape& shape,
@@ -475,7 +453,7 @@
 }
 
 size_t HloSharding::Hash() const {
-  if (!tuple_) {
+  if (tuple_) {
     size_t h = 0;
     for (const auto& element : tuple_elements_) {
       h = tensorflow::Hash64Combine(h, element.Hash());
@@ -489,9 +467,6 @@
   for (uint32 v : tile_assignment_) {
     h = tensorflow::Hash64Combine(h, std::hash<uint32>{}(v));
   }
-  for (uint32 v : tile_shape_.dimensions()) {
-    h = tensorflow::Hash64Combine(h, std::hash<uint32>{}(v));
-  }
   return h;
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_sharding.h b/tensorflow/compiler/xla/service/hlo_sharding.h
index 28575c0..894783e 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding.h
+++ b/tensorflow/compiler/xla/service/hlo_sharding.h
@@ -48,22 +48,10 @@
   // the input shape (one tile) assigned to a single device.
   static HloSharding AssignDevice(int64 device_id);
 
-  // Creates a new sharding which splits a shape into tiles each with shape
-  // `tile_shape`. Each tile is assigned to one device, which is specified by
-  // `tile_assignment`. Any tensor not a multiple of the tile size in any
-  // dimension is implicitly padded to the tile size.
-  //
-  // e.g. Tile({2, 2}, {0, 1}) on a tensor of shape {3, 2} would look like:
-  //      2     1 padding
-  //   <------><->
-  //   +----+----+
-  //   | 0  |  1 |
-  //   +----+----+
-  //
-  // Split into two tiles, one of which is implicitly padded by one.
-  static HloSharding Tile(const Shape& tile_shape,
-                          const Array<int64>& tile_assignment) {
-    return HloSharding(tile_shape, tile_assignment);
+  // Creates a new sharding which splits a shape into tiles amongst the devices
+  // specified by `tile_assignment`.
+  static HloSharding Tile(const Array<int64>& tile_assignment) {
+    return HloSharding(tile_assignment);
   }
 
   // Creates a new sharding which splits a one-dimensional input shape into
@@ -146,17 +134,18 @@
   // REQUIRES: !IsTuple()
   int64 DeviceForTileIndex(tensorflow::gtl::ArraySlice<int64> index) const;
 
-  // Given a device ID, returns the offset within the input space of the
+  // Given a device ID, returns the offset within the specified shape of the
   // tile that should be executed on the given core. This returns the lower
   // extent of the tile in the input space.
   // REQUIRES: !IsTuple()
-  std::vector<int64> TileOffsetForDevice(int64 device) const;
+  std::vector<int64> TileOffsetForDevice(const Shape& shape,
+                                         int64 device) const;
 
-  // Given a device ID, returns the limit within the input space of the
+  // Given a device ID, returns the limit within the specified shape of the
   // tile that should be executed on the given core. This returns the upper
   // extent of the tile in the input space.
   // REQUIRES: !IsTuple()
-  std::vector<int64> TileLimitForDevice(int64 device) const;
+  std::vector<int64> TileLimitForDevice(const Shape& shape, int64 device) const;
 
   // Returns the single device this op operates on. If the sharding does not
   // span a single device, the return value will be empty.
@@ -197,7 +186,6 @@
 
   bool operator==(const HloSharding& other) const {
     return replicated_ == other.replicated_ && maximal_ == other.maximal_ &&
-           ShapeUtil::Compatible(tile_shape_, other.tile_shape_) &&
            tile_assignment_ == other.tile_assignment_ &&
            tuple_elements_ == other.tuple_elements_;
   }
@@ -211,9 +199,6 @@
     }
   };
 
-  // Gets the tile shape.
-  // REQUIRES: !IsTileMaximal() && !IsTuple()
-  const Shape& tile_shape() const { return tile_shape_; }
   // Gets the tile assignment tensor.
   // REQUIRES: !IsReplicated() && !IsTuple()
   const Array<int64>& tile_assignment() const { return tile_assignment_; }
@@ -225,25 +210,15 @@
     return tuple_elements_;
   }
 
-  // Return a new sharding that can apply to the given new shape.
-  // If this sharding is tile-maximal, the returned sharding will be the same as
-  // this sharding. If this sharding is not tile-maximal, the returned
-  // sharding's tile size will differ:
-  //   - Non-sharded dimensions will be adapted to be the same as `new_shape`;
-  //     tile_dimension(i) = new_shape.dimensions(i);
-  //   - Sharded dimensions will be kept the same unless `transform` is supplied
-  //     in which case tile_dimension(i) = transform(i, tile_dimension(i));
-  // REQUIRES: !IsTuple().
-  HloSharding TransformShardedTileShape(
-      const Shape& new_shape,
-      const std::function<int64(int64, int64)>& transform = nullptr) const;
+  // Gets the tile shape.
+  // REQUIRES: !IsTuple()
+  Shape TileShape(const Shape& shape) const;
 
  private:
   HloSharding()
       : replicated_(true),
         maximal_(true),
         tuple_(false),
-        tile_shape_(),
         tile_assignment_({0}) {}
   // device_id values:
   // -2: magic number to mean unassigned device, used by spatial partitioning
@@ -255,15 +230,13 @@
       : replicated_(false),
         maximal_(true),
         tuple_(false),
-        tile_shape_(),
         tile_assignment_({1}, device_id) {}
-  HloSharding(const Shape& tile_shape, const Array<int64>& tile_assignment)
+  explicit HloSharding(const Array<int64>& tile_assignment)
       : replicated_(false),
         maximal_(false),
         tuple_(false),
-        tile_shape_(tile_shape),
         tile_assignment_(tile_assignment) {}
-  HloSharding(const std::vector<HloSharding>& tuple_shardings)
+  explicit HloSharding(const std::vector<HloSharding>& tuple_shardings)
       : replicated_(false),
         maximal_(false),
         tuple_(true),
@@ -286,7 +259,6 @@
   bool replicated_;
   bool maximal_;
   bool tuple_;
-  Shape tile_shape_;
   Array<int64> tile_assignment_;
   // Only non-empty when tuple_ is true, but because empty tuples are allowed
   // may also be empty even then. This is a flattened list of all the leaf
diff --git a/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc b/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc
index 94f5a3b..a2c1d39 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc
+++ b/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc
@@ -158,7 +158,6 @@
 const HloSharding* GetOperandSharding(const HloInstruction* operand,
                                       const DomainMetadata::Domain& domain,
                                       const HloSharding& sharding) {
-  DCHECK_EQ(domain.reach_set.count(const_cast<HloInstruction*>(operand)), 1);
   // Here the user of operand is within the domain instruction set, and since it
   // is user of operand, we need to look into the enter_domains set. If this is
   // not a kDomain within the user domains set, then return the operand
@@ -203,10 +202,17 @@
       for (int64 i = 0; i < instruction->operand_count(); ++i) {
         const HloSharding* operand_sharding =
             GetOperandSharding(instruction->operand(i), domain, sharding);
-        if (operand_sharding != nullptr &&
-            shape_tree.element({i}) != *operand_sharding) {
-          *shape_tree.mutable_element({i}) = *operand_sharding;
-          ++tuple_assigned;
+        if (operand_sharding != nullptr) {
+          HloSharding operand_subsharding = HloSharding::Replicate();
+          if (operand_sharding == &sharding) {
+            operand_subsharding =
+                sharding.GetSubSharding(instruction->shape(), {i});
+            operand_sharding = &operand_subsharding;
+          }
+          if (shape_tree.element({i}) != *operand_sharding) {
+            *shape_tree.mutable_element({i}) = *operand_sharding;
+            ++tuple_assigned;
+          }
         }
       }
       if (tuple_assigned > 0) {
diff --git a/tensorflow/compiler/xla/service/hlo_sharding_test.cc b/tensorflow/compiler/xla/service/hlo_sharding_test.cc
index aebda56..45fc300 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_sharding_test.cc
@@ -39,7 +39,6 @@
 class HloShardingTest : public HloTestBase {};
 
 TEST_F(HloShardingTest, Replicate) {
-  Shape tile_shape = ShapeUtil::MakeShape(U32, {4});
   HloSharding sharding = HloSharding::Replicate();
   EXPECT_TRUE(sharding.IsReplicated());
   EXPECT_TRUE(sharding.IsTileMaximal());
@@ -79,37 +78,22 @@
 TEST_F(HloShardingTest, Tile) {
   {
     // Test should fail because of a duplicate tile assignment.
-    Shape tile_shape = ShapeUtil::MakeShape(U32, {2, 3});
-    HloSharding sharding =
-        HloSharding::Tile(tile_shape, MakeArray({2, 2}, {0, 0, 2, 3}));
+    HloSharding sharding = HloSharding::Tile(MakeArray({2, 2}, {0, 0, 2, 3}));
     EXPECT_IS_NOT_OK(sharding.Validate(ShapeUtil::MakeShape(F32, {4, 6}),
                                        /*num_devices=*/4));
   }
 
   {
     // Test should fail because of more devices used then `num_device`.
-    Shape tile_shape = ShapeUtil::MakeShape(U32, {2, 3});
-    HloSharding sharding =
-        HloSharding::Tile(tile_shape, MakeArray({2, 2}, {0, 1, 2, 3}));
+    HloSharding sharding = HloSharding::Tile(MakeArray({2, 2}, {0, 1, 2, 3}));
     EXPECT_IS_NOT_OK(sharding.Validate(ShapeUtil::MakeShape(U32, {4, 6}),
                                        /*num_devices=*/2));
   }
 
   {
-    // Test should fail because the total tiled size in dimension 0 is 4 but we
-    // have 6 elements along that dimensions.
-    Shape tile_shape = ShapeUtil::MakeShape(U32, {2, 3});
-    HloSharding sharding =
-        HloSharding::Tile(tile_shape, MakeArray({2, 2}, {0, 1, 2, 3}));
-    EXPECT_IS_NOT_OK(sharding.Validate(ShapeUtil::MakeShape(F32, {6, 3}),
-                                       /*num_devices=*/4));
-  }
-
-  {
     // Test should pass.
-    Shape tile_shape = ShapeUtil::MakeShape(U32, {2, 3});
-    HloSharding sharding =
-        HloSharding::Tile(tile_shape, MakeArray({2, 2}, {0, 3, 2, 1}));
+    Shape shape = ShapeUtil::MakeShape(U32, {4, 5});
+    HloSharding sharding = HloSharding::Tile(MakeArray({2, 2}, {0, 3, 2, 1}));
     EXPECT_IS_OK(sharding.Validate(ShapeUtil::MakeShape(F32, {3, 5}),
                                    /*num_devices=*/5));
 
@@ -118,10 +102,14 @@
     EXPECT_EQ(2, sharding.DeviceForTileIndex({1, 0}));
     EXPECT_EQ(1, sharding.DeviceForTileIndex({1, 1}));
 
-    EXPECT_EQ(sharding.TileOffsetForDevice(0), (std::vector<int64>{0, 0}));
-    EXPECT_EQ(sharding.TileOffsetForDevice(3), (std::vector<int64>{0, 3}));
-    EXPECT_EQ(sharding.TileOffsetForDevice(2), (std::vector<int64>{2, 0}));
-    EXPECT_EQ(sharding.TileOffsetForDevice(1), (std::vector<int64>{2, 3}));
+    EXPECT_EQ(sharding.TileOffsetForDevice(shape, 0),
+              (std::vector<int64>{0, 0}));
+    EXPECT_EQ(sharding.TileOffsetForDevice(shape, 3),
+              (std::vector<int64>{0, 3}));
+    EXPECT_EQ(sharding.TileOffsetForDevice(shape, 2),
+              (std::vector<int64>{2, 0}));
+    EXPECT_EQ(sharding.TileOffsetForDevice(shape, 1),
+              (std::vector<int64>{2, 3}));
 
     EXPECT_FALSE(sharding.HasUniqueDevice());
   }
@@ -135,8 +123,7 @@
       ShapeUtil::MakeShape(F32, {4, 6}),
   });
 
-  HloSharding tiled_sharding = HloSharding::Tile(
-      ShapeUtil::MakeShape(F32, {4, 3}), Array<int64>({{0, 1}}));
+  HloSharding tiled_sharding = HloSharding::Tile(Array<int64>({{0, 1}}));
   OpSharding proto;
   proto.set_type(OpSharding::Type::OpSharding_Type_TUPLE);
   *proto.add_tuple_shardings() = HloSharding::Replicate().ToProto();
@@ -187,32 +174,11 @@
   }
 
   {
-    Shape tile_shape = ShapeUtil::MakeShape(U32, {2, 3});
-    HloSharding sharding1 =
-        HloSharding::Tile(tile_shape, MakeArray({2, 2}, {0, 3, 2, 1}));
-    HloSharding sharding2 = HloSharding::Tile(ShapeUtil::MakeShape(U32, {2, 3}),
-                                              MakeArray({2, 2}, {0, 3, 2, 1}));
+    HloSharding sharding1 = HloSharding::Tile(MakeArray({2, 2}, {0, 3, 2, 1}));
+    HloSharding sharding2 = HloSharding::Tile(MakeArray({2, 2}, {0, 3, 2, 1}));
     EXPECT_TRUE(hash_compare_equal(sharding1, sharding2));
   }
 
-  {
-    Shape tile_shape = ShapeUtil::MakeShape(U32, {2, 3});
-    HloSharding sharding1 =
-        HloSharding::Tile(tile_shape, MakeArray({2, 2}, {0, 3, 2, 1}));
-    HloSharding sharding2 = HloSharding::Tile(ShapeUtil::MakeShape(U32, {2, 3}),
-                                              MakeArray({2, 2}, {0, 3, 2, 1}));
-    EXPECT_TRUE(hash_compare_equal(sharding1, sharding2));
-  }
-
-  {
-    Shape tile_shape = ShapeUtil::MakeShape(U32, {2, 3});
-    HloSharding sharding1 =
-        HloSharding::Tile(tile_shape, MakeArray({2, 2}, {0, 3, 2, 1}));
-    HloSharding sharding2 = HloSharding::Tile(ShapeUtil::MakeShape(U32, {2, 3}),
-                                              MakeArray({2, 2}, {0, 3, 1, 2}));
-    EXPECT_FALSE(hash_compare_equal(sharding1, sharding2));
-  }
-
   HloSharding default_sharding = HloSharding::Replicate();
   {
     ShapeTree<HloSharding> shape_tree(ShapeUtil::MakeTupleShape({}),
@@ -259,19 +225,6 @@
   }
 }
 
-TEST_F(HloShardingTest, TransformShardedTileShapeTest) {
-  HloSharding sharding =
-      HloSharding::Tile(ShapeUtil::MakeShape(F32, {3, 5, 7, 11}),
-                        Array4D<int64>({{{{0, 1}, {2, 3}}}}));
-  HloSharding result = sharding.TransformShardedTileShape(
-      ShapeUtil::MakeShape(F32, {13, 15, 17, 19}),
-      [](int dim, int value) { return dim * 111; });
-  HloSharding expected =
-      HloSharding::Tile(ShapeUtil::MakeShape(F32, {13, 15, 222, 333}),
-                        Array4D<int64>({{{{0, 1}, {2, 3}}}}));
-  EXPECT_EQ(result, expected);
-}
-
 TEST_F(HloShardingTest, ToStringReplicatedTest) {
   HloSharding sharding = HloSharding::Replicate();
   EXPECT_EQ(sharding.ToString(), "{replicated}");
@@ -284,9 +237,8 @@
 
 TEST_F(HloShardingTest, ToStringTiledTest) {
   HloSharding sharding =
-      HloSharding::Tile(ShapeUtil::MakeShape(S32, {7, 11, 13}),
-                        Array3D<int64>({{{2, 3}}, {{5, 7}}}));
-  EXPECT_EQ(sharding.ToString(), "{s32[7,11,13] devices=[2,1,2]2,3,5,7}");
+      HloSharding::Tile(Array3D<int64>({{{2, 3}}, {{5, 7}}}));
+  EXPECT_EQ(sharding.ToString(), "{devices=[2,1,2]2,3,5,7}");
 }
 
 TEST_F(HloShardingTest, ToStringTupleTest) {
@@ -294,21 +246,18 @@
       ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {3, 5}),
                                  ShapeUtil::MakeShape(U32, {7, 25}),
                                  ShapeUtil::MakeShape(S32, {9, 11})}),
-      {HloSharding::Replicate(),
-       HloSharding::Tile(ShapeUtil::MakeShape(U32, {7, 13}),
-                         Array2D<int64>({{3, 5}})),
+      {HloSharding::Replicate(), HloSharding::Tile(Array2D<int64>({{3, 5}})),
        HloSharding::AssignDevice(3)});
   EXPECT_EQ(sharding.ToString(),
-            "{{replicated}, {u32[7,13] devices=[1,2]3,5}, {maximal device=3}}");
+            "{{replicated}, {devices=[1,2]3,5}, {maximal device=3}}");
 }
 
 TEST_F(HloShardingTest, OstreamTest) {
   HloSharding sharding =
-      HloSharding::Tile(ShapeUtil::MakeShape(F32, {3, 5, 7, 11}),
-                        Array4D<int64>({{{{0, 1}, {2, 3}}}}));
+      HloSharding::Tile(Array4D<int64>({{{{0, 1}, {2, 3}}}}));
   std::ostringstream oss;
   oss << sharding;
-  EXPECT_EQ(oss.str(), "{f32[3,5,7,11] devices=[1,1,2,2]0,1,2,3}");
+  EXPECT_EQ(oss.str(), "{devices=[1,1,2,2]0,1,2,3}");
 }
 
 TEST_F(HloShardingTest, ParseHloString) {
@@ -319,8 +268,7 @@
   };
   check(HloSharding::Replicate());
   check(HloSharding::AssignDevice(2));
-  check(HloSharding::Tile(ShapeUtil::MakeShape(F32, {3, 1, 3, 7}),
-                          Array4D<int64>({{{{0}, {1}}}})));
+  check(HloSharding::Tile(Array4D<int64>({{{{0}, {1}}}})));
   // Empty tuple. One sharding is required for empty tuples, as we need to be
   // able to assign sharding to them, even though they have no leaves.
   check(HloSharding::Tuple(ShapeUtil::MakeTupleShape({}),
@@ -332,8 +280,7 @@
                                    ShapeUtil::MakeShape(F32, {3, 5, 7}),
                                    ShapeUtil::MakeShape(F32, {3, 7})});
     check(HloSharding::Tuple(
-        tuple_shape, {HloSharding::Tile(ShapeUtil::MakeShape(F32, {3, 1, 3, 7}),
-                                        Array4D<int64>({{{{0}, {1}}}})),
+        tuple_shape, {HloSharding::Tile(Array4D<int64>({{{{0}, {1}}}})),
                       HloSharding::Replicate(), HloSharding::AssignDevice(1)}));
   }
   {
@@ -343,8 +290,7 @@
          ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {3, 5, 7}),
                                     ShapeUtil::MakeShape(F32, {3, 7})})});
     std::vector<HloSharding> leaf_shardings = {
-        HloSharding::Tile(ShapeUtil::MakeShape(F32, {3, 1, 3, 7}),
-                          Array4D<int64>({{{{0}, {1}}}})),
+        HloSharding::Tile(Array4D<int64>({{{{0}, {1}}}})),
         HloSharding::Replicate(), HloSharding::AssignDevice(1)};
     ShapeTree<HloSharding> sharding_tree(tuple_shape, HloSharding::Replicate());
     // Assign leaf_shardings to sharding_tree leaves.
diff --git a/tensorflow/compiler/xla/service/hlo_token.h b/tensorflow/compiler/xla/service/hlo_token.h
index 5334296..4458c25 100644
--- a/tensorflow/compiler/xla/service/hlo_token.h
+++ b/tensorflow/compiler/xla/service/hlo_token.h
@@ -44,7 +44,6 @@
   kRparen,  // (  )
 
   kArrow,    // ->
-  kComment,  // /*xxx*/
 
   // Keywords
   kw_HloModule,
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc
index 1a8c206..ac1a663 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier.cc
@@ -84,7 +84,8 @@
       const Shape expected,
       ShapeInference::InferConvolveShape(
           convolution->operand(0)->shape(), convolution->operand(1)->shape(),
-          convolution->window(), convolution->convolution_dimension_numbers()));
+          convolution->window(), convolution->convolution_dimension_numbers(),
+          convolution->feature_group_count()));
   return CheckShape(convolution, expected);
 }
 
@@ -105,6 +106,15 @@
                     ShapeInference::InferCrossReplicaSumShape(operand_shapes));
 }
 
+Status ShapeVerifier::HandleAllToAll(HloInstruction* hlo) {
+  std::vector<const Shape*> operand_shapes;
+  for (const HloInstruction* operand : hlo->operands()) {
+    operand_shapes.push_back(&operand->shape());
+  }
+  return CheckShape(hlo,
+                    ShapeInference::InferAllToAllTupleShape(operand_shapes));
+}
+
 Status ShapeVerifier::HandleReducePrecision(HloInstruction* reduce_precision) {
   return CheckShape(reduce_precision, ShapeInference::InferReducePrecisionShape(
                                           reduce_precision->operand(0)->shape(),
@@ -147,11 +157,7 @@
 
 Status ShapeVerifier::HandleInfeed(HloInstruction* instruction) {
   HloInfeedInstruction* infeed = Cast<HloInfeedInstruction>(instruction);
-  // Infeed has an optional single token operand.
-  // TODO(b/80000000): Update when token is not optional.
-  if (infeed->operand_count() == 1) {
-    TF_RETURN_IF_ERROR(CheckIsTokenOperand(instruction, 0));
-  }
+  TF_RETURN_IF_ERROR(CheckIsTokenOperand(instruction, 0));
 
   // The output of infeed is a tuple containing the data value and a token.
   return CheckShape(infeed,
@@ -161,11 +167,7 @@
 
 Status ShapeVerifier::HandleOutfeed(HloInstruction* instruction) {
   HloOutfeedInstruction* outfeed = Cast<HloOutfeedInstruction>(instruction);
-  // Outfeed has an optional token operand (operand 1).
-  // TODO(b/80000000): Update when token is not optional.
-  if (outfeed->operand_count() == 2) {
-    TF_RETURN_IF_ERROR(CheckIsTokenOperand(instruction, 1));
-  }
+  TF_RETURN_IF_ERROR(CheckIsTokenOperand(instruction, 1));
 
   // Outfeed has a separate shape field for the value which is outfed to the
   // host. The shape of the instruction itself is always a token.
@@ -185,7 +187,67 @@
   return Status::OK();
 }
 
-Status ShapeVerifier::HandleRng(HloInstruction*) { return Status::OK(); }
+bool ShapeVerifier::HasCompatibleElementTypes(const Shape& shape_0,
+                                              const Shape& shape_1,
+                                              const Shape& result_shape) {
+  return ShapeUtil::SameElementType(shape_0, shape_1) &&
+         (ShapeUtil::SameElementType(shape_0, result_shape) ||
+          (allow_mixed_precision_ &&
+           ShapeUtil::SameElementTypeIgnoringFpPrecision(shape_0,
+                                                         result_shape)));
+}
+
+Status ShapeVerifier::HandleRng(HloInstruction* instruction) {
+  if (instruction->operand_count() != 2) {
+    return InternalError("Expected two operands for Rng instruction: %s",
+                         instruction->ToString().c_str());
+  }
+
+  const Shape& shape_0 = instruction->operand(0)->shape();
+  const Shape& shape_1 = instruction->operand(1)->shape();
+  if (!ShapeUtil::IsScalar(shape_0) || !ShapeUtil::IsScalar(shape_1)) {
+    return InternalError(
+        "Expected scalar types for the two operands of Rng instruction: %s",
+        instruction->ToString().c_str());
+  }
+
+  if (!HasCompatibleElementTypes(shape_0, shape_1, instruction->shape())) {
+    return InternalError(
+        "Expected compatible element types for the result and the two operands"
+        " of Rng instruction: %s",
+        instruction->ToString().c_str());
+  }
+
+  PrimitiveType element_type = shape_0.element_type();
+  switch (instruction->random_distribution()) {
+    case RNG_UNIFORM:
+      if (!primitive_util::IsFloatingPointType(element_type) &&
+          !primitive_util::IsIntegralType(element_type) &&
+          element_type != PRED) {
+        return InternalError(
+            "Element type not supported."
+            " Expected element to be of floating point type, integral type or"
+            " predicate type for RngUniform: %s",
+            instruction->ToString().c_str());
+      }
+      break;
+
+    case RNG_NORMAL:
+      if (!primitive_util::IsFloatingPointType(element_type)) {
+        return InternalError(
+            "Element type not supported."
+            " Expected element to be FloatingPointType for RngNormal: %s",
+            instruction->ToString().c_str());
+      }
+      break;
+    default:
+      return InternalError(
+          "Invalid Rng distribution %s",
+          RandomDistribution_Name(instruction->random_distribution()).c_str());
+  }
+
+  return Status::OK();
+}
 
 Status ShapeVerifier::HandleReverse(HloInstruction* reverse) {
   return CheckShape(
@@ -454,9 +516,9 @@
 // inputs.
 Status CheckMixedPrecisionOperands(const HloInstruction* instruction) {
   switch (instruction->opcode()) {
-    // White list the following opcodes for mixed-precision check, because they
-    // involve data pass through or grouping via tuples, where the precisions
-    // of buffers can be different.
+    // White list the following opcodes for mixed-precision check, because
+    // they involve data pass through or grouping via tuples, where the
+    // precisions of buffers can be different.
     case HloOpcode::kCall:
     case HloOpcode::kConditional:
     case HloOpcode::kConstant:
@@ -510,7 +572,7 @@
       gather,
       ShapeInference::InferGatherShape(
           gather->operand(0)->shape(), gather->operand(1)->shape(),
-          gather->gather_dimension_numbers(), gather->gather_window_bounds()));
+          gather->gather_dimension_numbers(), gather->gather_slice_sizes()));
 }
 
 Status ShapeVerifier::HandleScatter(HloInstruction* scatter) {
@@ -638,7 +700,8 @@
 
 // Verifies various invariants about the structure of the HLO:
 //
-// (1) each instruction has a non-null parent() set to the HloComputation which
+// (1) each instruction has a non-null parent() set to the HloComputation
+// which
 //     contains it.
 //
 // (2) each computation has a non-null parent() set to the HloModule which
@@ -672,9 +735,9 @@
   }
 
   // Check that operands are in the same computation separately from verifying
-  // parent() correctness so conditions like a null HloInstruction::parent() are
-  // identified and reported explicitly above rather than reporting a mismatched
-  // operand.
+  // parent() correctness so conditions like a null HloInstruction::parent()
+  // are identified and reported explicitly above rather than reporting a
+  // mismatched operand.
   for (const HloComputation* computation : module->computations()) {
     for (const HloInstruction* instruction : computation->instructions()) {
       for (int i = 0; i < instruction->operand_count(); ++i) {
@@ -698,13 +761,14 @@
   HloComputation* fused_computation = fusion->fused_instructions_computation();
   if (fusion != fused_computation->FusionInstruction()) {
     return InternalError(
-        "Instruction of fused computation does not match expected instruction "
+        "Instruction of fused computation does not match expected "
+        "instruction "
         "%s.",
         fusion->ToString().c_str());
   }
 
-  // Fused root instruction and fused parameters must all be owned by the fusion
-  // computation.
+  // Fused root instruction and fused parameters must all be owned by the
+  // fusion computation.
   bool root_owned = false;
   const std::vector<HloInstruction*>& fused_parameters =
       fusion->fused_parameters();
@@ -746,8 +810,8 @@
                          fusion->ToString().c_str());
   }
 
-  // All uses of fused instructions must be in the fusion computation, and every
-  // non-root instruction must have at least one use.
+  // All uses of fused instructions must be in the fusion computation, and
+  // every non-root instruction must have at least one use.
   for (auto* instruction :
        fusion->fused_instructions_computation()->instructions()) {
     if (instruction != fused_root) {
@@ -791,7 +855,8 @@
     if (!ShapeUtil::Compatible(fused_param->shape(),
                                fusion->operand(param_no)->shape())) {
       return InternalError(
-          "Shape mismatch between parameter number %lld and its operand in %s.",
+          "Shape mismatch between parameter number %lld and its operand in "
+          "%s.",
           param_no, fusion->ToString().c_str());
     }
   }
@@ -909,8 +974,9 @@
   return Status::OK();
 }
 
-// Checks if the given two instructions have the same is_host_transfer attribute
-// value. Intsructions must be send/recv instructions or their 'done' variant.
+// Checks if the given two instructions have the same is_host_transfer
+// attribute value. Intsructions must be send/recv instructions or their
+// 'done' variant.
 Status CheckSameIsHostTransfer(const HloInstruction* instr1,
                                const HloInstruction* instr2) {
   const HloSendRecvInstruction* send_recv1 =
@@ -921,7 +987,8 @@
   TF_RET_CHECK(send_recv2 != nullptr);
   if (send_recv1->is_host_transfer() != send_recv2->is_host_transfer()) {
     return InternalError(
-        "Expected instructions to have the same is-host-transfer property: %s, "
+        "Expected instructions to have the same is-host-transfer property: "
+        "%s, "
         "%s ",
         instr1->ToString().c_str(), instr2->ToString().c_str());
   }
@@ -940,7 +1007,8 @@
           host_channels.insert({sendrecv->channel_id(), sendrecv});
       if (!it_inserted.second) {
         return FailedPrecondition(
-            "Channel %lld is used for multiple host send/recv instructions: %s "
+            "Channel %lld is used for multiple host send/recv instructions: "
+            "%s "
             "and "
             "%s",
             sendrecv->channel_id(), sendrecv->ToString().c_str(),
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.h b/tensorflow/compiler/xla/service/hlo_verifier.h
index 7feddae..c942fab 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.h
+++ b/tensorflow/compiler/xla/service/hlo_verifier.h
@@ -45,6 +45,7 @@
   Status HandleConvolution(HloInstruction* convolution) override;
   Status HandleFft(HloInstruction* fft) override;
   Status HandleCrossReplicaSum(HloInstruction* crs) override;
+  Status HandleAllToAll(HloInstruction* hlo) override;
   Status HandleReducePrecision(HloInstruction* reduce_precision) override;
   Status HandleInfeed(HloInstruction*) override;
   Status HandleOutfeed(HloInstruction*) override;
@@ -105,6 +106,13 @@
   Status CheckVariadicShape(const HloInstruction* instruction);
 
  private:
+  // Return true if the shapes of the two operands have the same element type,
+  // and the result shape either has the same element type as the operand
+  // shapes or mixed precision is allowed and the result shape and the operand
+  // shapes have floating point element types.
+  bool HasCompatibleElementTypes(const Shape& shape_0, const Shape& shape_1,
+                                 const Shape& result_shape);
+
   // Whether the inputs and output of an instruction can contain both F32s and
   // BF16s. Tuples that include both F32s and BF16s are allowed regardless of
   // this flag.
diff --git a/tensorflow/compiler/xla/service/hlo_verifier_test.cc b/tensorflow/compiler/xla/service/hlo_verifier_test.cc
index 04c6ba3..d764964 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier_test.cc
@@ -34,7 +34,17 @@
 
 using ::testing::HasSubstr;
 
-using HloVerifierTest = HloTestBase;
+class HloVerifierTest : public HloTestBase {
+ public:
+  HloVerifierTest()
+      : HloTestBase(/*allow_mixed_precision_in_hlo_verifier=*/false) {}
+};
+
+class HloVerifierTestAllowMixedPrecision : public HloTestBase {
+ public:
+  HloVerifierTestAllowMixedPrecision()
+      : HloTestBase(/*allow_mixed_precision_in_hlo_verifier=*/true) {}
+};
 
 TEST_F(HloVerifierTest, NullInstructionParent) {
   HloComputation::Builder builder(TestName());
@@ -174,5 +184,96 @@
               HasSubstr("shape does not match parameter"));
 }
 
+TEST_F(HloVerifierTest, RngOpnd0NotScalar) {
+  const char* const hlo_string = R"(
+  HloModule Module
+
+  ENTRY RngOpnd0NotScalar {
+   constant.0 = f32[] constant(0)
+   constant.1 = f16[2] constant({1, 3})
+   ROOT rng.0 = f32[10]{0} rng(f32[] constant.0, f16[2] constant.1),
+    distribution=rng_uniform
+  }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(hlo_string));
+
+  auto status = verifier().Run(module.get()).status();
+  ASSERT_FALSE(status.ok());
+  EXPECT_THAT(status.error_message(), HasSubstr("Expected scalar type"));
+}
+
+TEST_F(HloVerifierTest, RngOperandElementTypesDoNotMatch) {
+  const char* const hlo_string = R"(
+  HloModule Module
+
+  ENTRY RngOperandElementTypesNotMatch {
+   constant.0 = f32[] constant(0)
+   constant.1 = f16[] constant(1)
+   ROOT rng.0 = f32[10]{0} rng(f32[] constant.0, f16[] constant.1),
+    distribution=rng_normal
+  }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(hlo_string));
+
+  auto status = verifier().Run(module.get()).status();
+  ASSERT_FALSE(status.ok());
+  EXPECT_THAT(status.error_message(),
+              HasSubstr("Expected compatible element types"));
+}
+
+TEST_F(HloVerifierTest, RngMixedPrecisionNotAllowed) {
+  const char* const hlo_string = R"(
+  HloModule Module
+
+  ENTRY RngResultElementTypeNotMatch {
+   constant.0 = f32[] constant(0)
+   constant.1 = f32[] constant(1)
+   ROOT rng.0 = f16[10]{0} rng(f32[] constant.0, f32[] constant.1),
+    distribution=rng_normal
+  }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(hlo_string));
+
+  auto status = verifier().Run(module.get()).status();
+  ASSERT_FALSE(status.ok());
+  EXPECT_THAT(status.error_message(),
+              HasSubstr("Expected compatible element types"));
+}
+
+TEST_F(HloVerifierTestAllowMixedPrecision, RngMixedPrecisionAllowed) {
+  const char* const hlo_string = R"(
+  HloModule Module
+
+  ENTRY RngResultElementTypeNotMatch {
+   constant.0 = f32[] constant(0)
+   constant.1 = f32[] constant(1)
+   ROOT rng.0 = f16[10]{0} rng(f32[] constant.0, f32[] constant.1),
+    distribution=rng_normal
+  }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(hlo_string));
+
+  auto status = verifier().Run(module.get()).status();
+  ASSERT_TRUE(status.ok());
+}
+
+TEST_F(HloVerifierTest, RngElementTypeNotSupported) {
+  const char* const hlo_string = R"(
+  HloModule Module
+
+  ENTRY RngElementTypeNotSupported {
+   constant.0 = s32[] constant(0)
+   constant.1 = s32[] constant(1)
+   ROOT rng.0 = s32[10]{0} rng(s32[] constant.0, s32[] constant.1),
+    distribution=rng_normal
+  }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(hlo_string));
+
+  auto status = verifier().Run(module.get()).status();
+  ASSERT_FALSE(status.ok());
+  EXPECT_THAT(status.error_message(), HasSubstr("Element type not supported"));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/indexed_array_analysis.cc b/tensorflow/compiler/xla/service/indexed_array_analysis.cc
index 8b2df32..8d17c03 100644
--- a/tensorflow/compiler/xla/service/indexed_array_analysis.cc
+++ b/tensorflow/compiler/xla/service/indexed_array_analysis.cc
@@ -153,7 +153,7 @@
     TF_ASSIGN_OR_RETURN(
         computed_array,
         ComputeArrayForGather(instr->shape(), instr->gather_dimension_numbers(),
-                              instr->gather_window_bounds(),
+                              instr->gather_slice_sizes(),
                               FindOrDie(cache_, instr->operand(0)),
                               FindOrDie(cache_, instr->operand(1))));
   } else if (instr->opcode() == HloOpcode::kReshape) {
@@ -251,24 +251,23 @@
 
 StatusOr<Analysis::Array*> IndexedArrayAnalysis::ComputeArrayForGather(
     const Shape& shape, const GatherDimensionNumbers& dim_numbers,
-    tensorflow::gtl::ArraySlice<int64> window_bounds, Array* source,
+    tensorflow::gtl::ArraySlice<int64> slice_sizes, Array* source,
     Array* indices) {
   if (dim_numbers.index_vector_dim() != indices->shape().dimensions_size()) {
     VLOG(3) << "ComputeArrayForGather: indices are not scalar";
     return nullptr;
   }
 
-  CHECK_EQ(dim_numbers.gather_dims_to_operand_dims_size(), 1);
+  CHECK_EQ(dim_numbers.start_index_map_size(), 1);
 
-  // We can also handle dim_numbers.elided_window_dims_size() == 0 here, should
-  // it become relevant.
+  // We can also handle dim_numbers.collapsed_slice_dims_size() == 0 here,
+  // should it become relevant.
 
-  if (dim_numbers.elided_window_dims_size() != 1 ||
-      dim_numbers.elided_window_dims(0) !=
-          dim_numbers.gather_dims_to_operand_dims(0)) {
+  if (dim_numbers.collapsed_slice_dims_size() != 1 ||
+      dim_numbers.collapsed_slice_dims(0) != dim_numbers.start_index_map(0)) {
     VLOG(3) << "ComputeArrayForGather: gather operations must elide "
-               "gather_dims_to_operand_dims[0] and "
-               "gather_dims_to_operand_dims[0] only";
+               "start_index_map[0] and "
+               "start_index_map[0] only";
     return nullptr;
   }
 
@@ -277,21 +276,21 @@
   // arrays from an array of size [7,4,6].  We check that condition down below:
 
   for (int64 i = 0, e = source->shape().dimensions_size(); i < e; i++) {
-    if (i != dim_numbers.elided_window_dims(0) &&
-        source->shape().dimensions(i) != window_bounds[i]) {
-      VLOG(3) << "ComputeArrayForGather: window_bounds[" << i
+    if (i != dim_numbers.collapsed_slice_dims(0) &&
+        source->shape().dimensions(i) != slice_sizes[i]) {
+      VLOG(3) << "ComputeArrayForGather: slice_sizes[" << i
               << "] != source->shape().dimensions(" << i << ") -- "
-              << source->shape().dimensions(i) << " vs. " << window_bounds[i]
-              << " with dim_numbers.elided_window_dims(0) = "
-              << dim_numbers.elided_window_dims(0);
+              << source->shape().dimensions(i) << " vs. " << slice_sizes[i]
+              << " with dim_numbers.collapsed_slice_dims(0) = "
+              << dim_numbers.collapsed_slice_dims(0);
       return nullptr;
     }
   }
 
-  int64 source_dim = dim_numbers.gather_dims_to_operand_dims(0);
+  int64 source_dim = dim_numbers.start_index_map(0);
   std::vector<int64> output_dims;
   for (int64 i = 0, e = shape.dimensions_size(); i < e; i++) {
-    if (!c_binary_search(dim_numbers.output_window_dims(), i)) {
+    if (!c_binary_search(dim_numbers.offset_dims(), i)) {
       output_dims.push_back(i);
     }
   }
@@ -447,7 +446,7 @@
 
   int64 indexed_source_subarray_size =
       std::accumulate(operand_shape.begin() + source_passthrough_dim + 1,
-                      operand_shape.end(), 1, std::multiplies<int64>());
+                      operand_shape.end(), 1LL, std::multiplies<int64>());
 
   return FindSuffixWithProduct(result_shape, indexed_source_subarray_size);
 }
@@ -735,11 +734,11 @@
   //   operand = s32[3,5,2] constant({...})
   //   indices = s32[7] parameter(0)
   //   gather = s32[3,2,7] gather(operand, indices),
-  //       output_window_dims={0,1},
-  //       elided_window_dims={1},
-  //       gather_dims_to_operand_dims={1},
+  //       offset_dims={0,1},
+  //       collapsed_slice_dims={1},
+  //       start_index_map={1},
   //       index_vector_dim=1,
-  //       window_bounds={3,1,2}
+  //       slice_sizes={3,1,2}
   //   reshape = s32[6,7] reshape(gather)
   //
   // In this case the gather maps to:
@@ -764,7 +763,7 @@
       &new_scalar_indexed_source_shape, source_dim_for_new_scalar_indexed_node,
       scalar_indexed_source_shape.dimensions(scalar_indexed->source_dim()));
 
-  CHECK_EQ(c_accumulate(new_scalar_indexed_source_shape, 1l,
+  CHECK_EQ(c_accumulate(new_scalar_indexed_source_shape, 1LL,
                         std::multiplies<int64>()),
            ShapeUtil::ElementsIn(scalar_indexed_source_shape));
 
diff --git a/tensorflow/compiler/xla/service/indexed_array_analysis.h b/tensorflow/compiler/xla/service/indexed_array_analysis.h
index e923dc3..675eb31 100644
--- a/tensorflow/compiler/xla/service/indexed_array_analysis.h
+++ b/tensorflow/compiler/xla/service/indexed_array_analysis.h
@@ -265,7 +265,7 @@
 
   StatusOr<Array*> ComputeArrayForGather(
       const Shape& shape, const GatherDimensionNumbers& dim_numbers,
-      tensorflow::gtl::ArraySlice<int64> window_bounds, Array* source,
+      tensorflow::gtl::ArraySlice<int64> slice_sizes, Array* source,
       Array* indices);
 
   StatusOr<Array*> ComputeArrayForDotWithIndexedLhs(
diff --git a/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc b/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc
index 5f4b427..97052ed 100644
--- a/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc
@@ -82,11 +82,11 @@
   operand = s32[3,3] parameter(0)
   indices = s32[5] parameter(1)
   ROOT gather = s32[5,3] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=1,
-      window_bounds={1,3}
+      slice_sizes={1,3}
 }
 )";
 
@@ -102,11 +102,11 @@
   operand = s32[3,3] constant(s32[3,3]{{1,2,3},{1,2,3},{1,2,3}})
   indices = s32[5] parameter(0)
   ROOT gather = s32[5,3] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=1,
-      window_bounds={1,3}
+      slice_sizes={1,3}
 }
 )";
 
@@ -122,11 +122,11 @@
   operand = s32[3,3] constant(s32[3,3]{{1,2,3},{1,2,3},{1,2,3}})
   indices = s32[5,2] parameter(0)
   ROOT gather = s32[5] gather(operand, indices),
-      output_window_dims={},
-      elided_window_dims={0,1},
-      gather_dims_to_operand_dims={0,1},
+      offset_dims={},
+      collapsed_slice_dims={0,1},
+      start_index_map={0,1},
       index_vector_dim=1,
-      window_bounds={1,1}
+      slice_sizes={1,1}
 }
 )";
 
@@ -141,11 +141,11 @@
   operand = s32[3,3,1] parameter(0)
   indices = s32[5] parameter(1)
   ROOT gather = s32[5,3] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0,2},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1},
+      collapsed_slice_dims={0,2},
+      start_index_map={0},
       index_vector_dim=1,
-      window_bounds={1,3,1}
+      slice_sizes={1,3,1}
 }
 )";
 
@@ -160,11 +160,11 @@
   operand = s32[3,3,1] parameter(0)
   indices = s32[5] parameter(1)
   ROOT gather = s32[5,2,3] gather(operand, indices),
-      output_window_dims={1,2},
-      elided_window_dims={2},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1,2},
+      collapsed_slice_dims={2},
+      start_index_map={0},
       index_vector_dim=1,
-      window_bounds={2,3,1}
+      slice_sizes={2,3,1}
 }
 )";
 
@@ -179,11 +179,11 @@
   operand = s32[3,3] parameter(0)
   indices = s32[5] parameter(1)
   ROOT gather = s32[5,2] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=1,
-      window_bounds={1,2}
+      slice_sizes={1,2}
 }
 )";
 
@@ -199,17 +199,17 @@
   indices_a = s32[5] parameter(0)
   indices_b = s32[2] parameter(1)
   gather_a = s32[5,3] gather(operand, indices_a),
-      output_window_dims={1},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=1,
-      window_bounds={1,3}
+      slice_sizes={1,3}
   ROOT gather_b = s32[2,3] gather(gather_a, indices_b),
-      output_window_dims={1},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=1,
-      window_bounds={1,3}
+      slice_sizes={1,3}
 }
 )";
 
@@ -228,17 +228,17 @@
   indices_a = s32[5,7] parameter(1)
   indices_b = s32[2] parameter(2)
   gather_a = s32[5,3,7] gather(operand, indices_a),
-      output_window_dims={1},
-      elided_window_dims={1},
-      gather_dims_to_operand_dims={1},
+      offset_dims={1},
+      collapsed_slice_dims={1},
+      start_index_map={1},
       index_vector_dim=2,
-      window_bounds={3,1}
+      slice_sizes={3,1}
   ROOT gather_b = s32[5,3,2] gather(gather_a, indices_b),
-      output_window_dims={0,1},
-      elided_window_dims={2},
-      gather_dims_to_operand_dims={2},
+      offset_dims={0,1},
+      collapsed_slice_dims={2},
+      start_index_map={2},
       index_vector_dim=1,
-      window_bounds={5,3,1}
+      slice_sizes={5,3,1}
 }
 )";
 
@@ -256,17 +256,17 @@
   indices_a = s32[2] parameter(1)
   indices_b = s32[5,7] parameter(2)
   gather_a = s32[2,6] gather(operand, indices_a),
-      output_window_dims={1},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=1,
-      window_bounds={1,6}
+      slice_sizes={1,6}
   ROOT gather_b = s32[5,6,7] gather(gather_a, indices_b),
-      output_window_dims={1},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=2,
-      window_bounds={1,6}
+      slice_sizes={1,6}
 }
 )";
 
@@ -284,17 +284,17 @@
   indices_a = s32[5,7] parameter(1)
   indices_b = s32[4,8] parameter(2)
   gather_a = s32[5,3,7] gather(operand, indices_a),
-      output_window_dims={1},
-      elided_window_dims={1},
-      gather_dims_to_operand_dims={1},
+      offset_dims={1},
+      collapsed_slice_dims={1},
+      start_index_map={1},
       index_vector_dim=2,
-      window_bounds={3,1}
+      slice_sizes={3,1}
   ROOT gather_b = s32[4,5,3,8] gather(gather_a, indices_b),
-      output_window_dims={1,2},
-      elided_window_dims={2},
-      gather_dims_to_operand_dims={2},
+      offset_dims={1,2},
+      collapsed_slice_dims={2},
+      start_index_map={2},
       index_vector_dim=2,
-      window_bounds={5,3,1}
+      slice_sizes={5,3,1}
 }
 )";
 
@@ -312,11 +312,11 @@
   operand = s32[3,4] constant(s32[3,4]{{1,2,3,4},{1,2,3,4},{1,2,3,4}})
   indices = s32[5] parameter(0)
   gather = s32[5,4] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=1,
-      window_bounds={1,4}
+      slice_sizes={1,4}
   ROOT reshape = s32[5,2,2] reshape(gather)
 }
 )";
@@ -333,11 +333,11 @@
   operand = s32[3,4] constant(s32[3,4]{{1,2,3,4},{1,2,3,4},{1,2,3,4}})
   indices = s32[5,7] parameter(0)
   gather = s32[5,4,7] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=2,
-      window_bounds={1,4}
+      slice_sizes={1,4}
   ROOT reshape = s32[5,2,2,7] reshape(gather)
 }
 )";
@@ -358,11 +358,11 @@
       {{1,2,3,4,5,6},{1,2,3,4,5,6}}})
   indices = s32[5,7] parameter(0)
   gather = s32[5,2,6,7] gather(operand, indices),
-      output_window_dims={1,2},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1,2},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=2,
-      window_bounds={1,2,6}
+      slice_sizes={1,2,6}
   ROOT reshape = s32[5,3,4,7] reshape(gather)
 }
 )";
@@ -381,11 +381,11 @@
       {1,2,3,4,5,6},{1,2,3,4,5,6}})
   indices = s32[1] parameter(0)
   gather = s32[1,6] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=1,
-      window_bounds={1,6}
+      slice_sizes={1,6}
   ROOT reshape = s32[1,1,6] reshape(gather)
 }
 )";
@@ -408,14 +408,14 @@
   operand = s32[2,3]{1,0} constant(s32[2,3] { { 1, 2, 3 }, { 1, 2, 3 } })
 
   i.0 = s64[1,3]{1,0} parameter(0)
-  g.0 = s32[1,3,3]{2,1,0} gather(operand, i.0), output_window_dims={2},
-    elided_window_dims={0}, gather_dims_to_operand_dims={0},
-    index_vector_dim=2, window_bounds={1,3}
+  g.0 = s32[1,3,3]{2,1,0} gather(operand, i.0), offset_dims={2},
+    collapsed_slice_dims={0}, start_index_map={0},
+    index_vector_dim=2, slice_sizes={1,3}
 
   i.1 = s64[1] parameter(1)
-  g.1 = s32[1,1,3]{2,1,0} gather(g.0, i.1), output_window_dims={0,2},
-    elided_window_dims={1}, gather_dims_to_operand_dims={1},
-    index_vector_dim=1, window_bounds={1,1,3}
+  g.1 = s32[1,1,3]{2,1,0} gather(g.0, i.1), offset_dims={0,2},
+    collapsed_slice_dims={1}, start_index_map={1},
+    index_vector_dim=1, slice_sizes={1,1,3}
 
   ROOT reshape = s32[1,3]{1,0} reshape(g.1)
 }
@@ -441,11 +441,11 @@
   operand = s32[1,6] constant(s32[1,6]{{1,2,3,4,5,6}})
   indices = s32[1] parameter(0)
   gather = s32[1,6] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=1,
-      window_bounds={1,6}
+      slice_sizes={1,6}
   ROOT reshape = s32[1,1,6] reshape(gather)
 }
 )";
@@ -469,11 +469,11 @@
       {1,2,3,4,5,6},{1,2,3,4,5,6}}})
   indices = s32[1] parameter(0)
   gather = s32[1,1,6] gather(operand, indices),
-      output_window_dims={1,2},
-      elided_window_dims={1},
-      gather_dims_to_operand_dims={1},
+      offset_dims={1,2},
+      collapsed_slice_dims={1},
+      start_index_map={1},
       index_vector_dim=1,
-      window_bounds={1,1,6}
+      slice_sizes={1,1,6}
   ROOT reshape = s32[1,1,1,6] reshape(gather)
 }
 )";
@@ -500,11 +500,11 @@
       {1,2,3,4,5,6},{1,2,3,4,5,6}})
   indices = s32[1,5] parameter(0)
   gather = s32[1,5,6] gather(operand, indices),
-      output_window_dims={2},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={2},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=2,
-      window_bounds={1,6}
+      slice_sizes={1,6}
   ROOT reshape = s32[1,1,5,6] reshape(gather)
 }
 )";
@@ -530,11 +530,11 @@
   operand = s32[3,4] constant(s32[3,4]{{1,2,3,4},{1,2,3,4},{1,2,3,4}})
   indices = s32[5,6] parameter(0)
   gather = s32[5,4,6] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=2,
-      window_bounds={1,4}
+      slice_sizes={1,4}
   ROOT reshape = s32[5,2,2,2,3] reshape(gather)
 }
 )";
@@ -562,11 +562,11 @@
       {{1,2},{3,4},{5,6},{7,8},{9,10}}})
   indices = s32[7] parameter(0)
   gather = s32[3,2,7] gather(operand, indices),
-      output_window_dims={0,1},
-      elided_window_dims={1},
-      gather_dims_to_operand_dims={1},
+      offset_dims={0,1},
+      collapsed_slice_dims={1},
+      start_index_map={1},
       index_vector_dim=1,
-      window_bounds={3,1,2}
+      slice_sizes={3,1,2}
   ROOT reshape = s32[6,7] reshape(gather)
 }
 )";
@@ -594,11 +594,11 @@
     {{1},{2},{3},{4}}})
   indices = s32[5,6] parameter(0)
   gather = s32[5,4,6,1] gather(operand, indices),
-      output_window_dims={1,3},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1,3},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=2,
-      window_bounds={1,4,1}
+      slice_sizes={1,4,1}
   ROOT reshape = s32[5,2,2,2,3,1] reshape(gather)
 }
 )";
@@ -623,11 +623,11 @@
   operand = f32[3,4] constant(f32[3,4]{{1,2,3,4},{1,3,2,4},{4,3,2,1}})
   indices = s32[5] parameter(0)
   gather = f32[5,4] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=1,
-      window_bounds={1,4}
+      slice_sizes={1,4}
   ROOT tanh = f32[5,4] tanh(gather)
 }
 )";
@@ -650,11 +650,11 @@
   constant_broadcasted = s32[5,4] broadcast(constant), dimensions={}
   indices = s32[5] parameter(0)
   gather = s32[5,4] gather(gather_operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=1,
-      window_bounds={1,4}
+      slice_sizes={1,4}
   ROOT add = s32[5,4] add(gather, constant_broadcasted)
 }
 )";
@@ -678,11 +678,11 @@
   constant_broadcasted = s32[5,4] broadcast(constant), dimensions={}
   indices = s32[5] parameter(0)
   gather = s32[5,4] gather(gather_operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=1,
-      window_bounds={1,4}
+      slice_sizes={1,4}
   ROOT sub = s32[5,4] subtract(gather, constant_broadcasted)
 }
 )";
@@ -706,11 +706,11 @@
   constant_broadcasted = s32[5,4] broadcast(constant), dimensions={}
   indices = s32[5] parameter(0)
   gather = s32[5,4] gather(gather_operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=1,
-      window_bounds={1,4}
+      slice_sizes={1,4}
   ROOT sub = s32[5,4] subtract(constant_broadcasted, gather)
 }
 )";
@@ -733,11 +733,11 @@
   constant_broadcasted = s32[5,4] broadcast(constant_vect), dimensions={1}
   indices = s32[5] parameter(0)
   gather = s32[5,4] gather(gather_operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=1,
-      window_bounds={1,4}
+      slice_sizes={1,4}
   ROOT add = s32[5,4] add(gather, constant_broadcasted)
 }
 )";
@@ -760,11 +760,11 @@
   constant_broadcasted = s32[5,4] broadcast(constant_vect), dimensions={0}
   indices = s32[5] parameter(0)
   gather = s32[5,4] gather(gather_operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=1,
-      window_bounds={1,4}
+      slice_sizes={1,4}
   ROOT add = s32[5,4] add(gather, constant_broadcasted)
 }
 )";
@@ -808,11 +808,11 @@
   dot_rhs_constant = s32[4,3] constant(s32[4,3]{{1,2,3},{4,5,6},{7,8,9},{10,11,12}})
   indices = s32[5] parameter(0)
   dot_lhs = s32[5,4] gather(gather_operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=1,
-      window_bounds={1,4}
+      slice_sizes={1,4}
   ROOT dot = s32[5,3] dot(dot_lhs, dot_rhs_constant), lhs_contracting_dims={1}, rhs_contracting_dims={0}
 }
 )";
@@ -835,11 +835,11 @@
   dot_rhs_constant = s32[3,3] constant(s32[3,3]{{1,2,3},{4,5,6},{7,8,9}})
   indices = s32[5] parameter(0)
   dot_lhs = s32[3,5] gather(gather_operand, indices),
-      output_window_dims={0},
-      elided_window_dims={1},
-      gather_dims_to_operand_dims={1},
+      offset_dims={0},
+      collapsed_slice_dims={1},
+      start_index_map={1},
       index_vector_dim=1,
-      window_bounds={3,1}
+      slice_sizes={3,1}
   ROOT dot = s32[5,3] dot(dot_lhs, dot_rhs_constant), lhs_contracting_dims={0}, rhs_contracting_dims={0}
 }
 )";
@@ -863,11 +863,11 @@
   dot_lhs_constant = s32[4,3] constant(s32[4,3]{{1,2,3},{4,5,6},{7,8,9},{10,11,12}})
   indices = s32[5] parameter(0)
   dot_rhs = s32[3,5] gather(gather_operand, indices),
-      output_window_dims={0},
-      elided_window_dims={1},
-      gather_dims_to_operand_dims={1},
+      offset_dims={0},
+      collapsed_slice_dims={1},
+      start_index_map={1},
       index_vector_dim=1,
-      window_bounds={3,1}
+      slice_sizes={3,1}
   ROOT dot = s32[4,5] dot(dot_lhs_constant, dot_rhs), lhs_contracting_dims={1}, rhs_contracting_dims={0}
 }
 )";
@@ -892,11 +892,11 @@
   dot_lhs_constant = s32[4,3] constant(s32[4,3]{{1,2,3},{4,5,6},{7,8,9},{10,11,12}})
   indices = s32[5] parameter(0)
   dot_rhs = s32[5,3] gather(gather_operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=1,
-      window_bounds={1,3}
+      slice_sizes={1,3}
   ROOT dot = s32[4,5] dot(dot_lhs_constant, dot_rhs), lhs_contracting_dims={1}, rhs_contracting_dims={1}
 }
 )";
@@ -921,11 +921,11 @@
   dot_lhs_constant = s32[2,2,3] constant(s32[2,2,3]{{{1,2,3},{4,5,6}},{{7,8,9},{10,11,12}}})
   indices = s32[4] parameter(0)
   dot_rhs = s32[2,3,4] gather(gather_operand, indices),
-      output_window_dims={0,1},
-      elided_window_dims={2},
-      gather_dims_to_operand_dims={2},
+      offset_dims={0,1},
+      collapsed_slice_dims={2},
+      start_index_map={2},
       index_vector_dim=1,
-      window_bounds={2,3,1}
+      slice_sizes={2,3,1}
   ROOT dot = s32[2,2,4] dot(dot_lhs_constant, dot_rhs),
       lhs_contracting_dims={2}, rhs_contracting_dims={1},
       lhs_batch_dims={0}, rhs_batch_dims={0}
@@ -952,11 +952,11 @@
   dot_rhs_constant = s32[2,3] constant(s32[2,3]{{1,2,3},{4,5,6}})
   indices = s32[2] parameter(0)
   dot_lhs = s32[3,2] gather(gather_operand, indices),
-      output_window_dims={0},
-      elided_window_dims={1},
-      gather_dims_to_operand_dims={1},
+      offset_dims={0},
+      collapsed_slice_dims={1},
+      start_index_map={1},
       index_vector_dim=1,
-      window_bounds={3,1}
+      slice_sizes={3,1}
   ROOT dot = s32[3,3] dot(dot_lhs, dot_rhs_constant), lhs_contracting_dims={1}, rhs_contracting_dims={0}
 }
 )";
diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc
index e2191ae..f33942d 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/instruction_fusion.cc
@@ -120,6 +120,7 @@
     case HloOpcode::kConditional:
     case HloOpcode::kConvolution:
     case HloOpcode::kCrossReplicaSum:
+    case HloOpcode::kAllToAll:
     case HloOpcode::kCustomCall:
     case HloOpcode::kDivide:
     case HloOpcode::kDomain:
diff --git a/tensorflow/compiler/xla/service/interpreter/executor.h b/tensorflow/compiler/xla/service/interpreter/executor.h
index 9b10902..db6b910 100644
--- a/tensorflow/compiler/xla/service/interpreter/executor.h
+++ b/tensorflow/compiler/xla/service/interpreter/executor.h
@@ -104,7 +104,7 @@
   }
 
   // No "synchronize all activity" implemented for this platform at the moment.
-  bool SynchronizeAllActivity() override { return false; }
+  bool SynchronizeAllActivity() override { return true; }
   bool SynchronousMemZero(DeviceMemoryBase *location, uint64 size) override {
     return false;
   }
diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc
index b5a9d6e..805fdb2 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment.cc
@@ -1563,7 +1563,7 @@
   // and the computation result. The latter two are specified in
   // computation_layout, so we only need to keep the existing layouts for
   // infeeds.  Clearing the layouts here avoids hiding potential bugs in the
-  // layout assignment pass that may accidently use the existing layout.
+  // layout assignment pass that may accidentally use the existing layout.
   for (HloInstruction* instruction : computation->instructions()) {
     if (instruction->opcode() == HloOpcode::kBitcast) {
       // bitcasts are inherently layout sensitive and so a bitcast instruction
diff --git a/tensorflow/compiler/xla/service/reshape_mover_test.cc b/tensorflow/compiler/xla/service/reshape_mover_test.cc
index ad3b662..ccb9fb3 100644
--- a/tensorflow/compiler/xla/service/reshape_mover_test.cc
+++ b/tensorflow/compiler/xla/service/reshape_mover_test.cc
@@ -76,9 +76,13 @@
 TEST_F(ReshapeMoverTest, 1ConstantAnd1ReshapesOnRngNotMoved) {
   HloComputation::Builder builder(TestName());
   auto root_shape = ShapeUtil::MakeShape(F32, {8, 7});
-  auto rng0 = builder.AddInstruction(
-      HloInstruction::CreateRng(ShapeUtil::MakeShape(F32, {1, 8, 1, 7, 1}),
-                                RandomDistribution::RNG_UNIFORM, {}));
+  auto rng0 = builder.AddInstruction(HloInstruction::CreateRng(
+      ShapeUtil::MakeShape(F32, {1, 8, 1, 7, 1}),
+      RandomDistribution::RNG_UNIFORM,
+      {builder.AddInstruction(
+           HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0f))),
+       builder.AddInstruction(HloInstruction::CreateConstant(
+           LiteralUtil::CreateR0<float>(1.0f)))}));
   auto reshape0 =
       builder.AddInstruction(HloInstruction::CreateReshape(root_shape, rng0));
 
diff --git a/tensorflow/compiler/xla/service/scatter_expander.cc b/tensorflow/compiler/xla/service/scatter_expander.cc
new file mode 100644
index 0000000..45ca731
--- /dev/null
+++ b/tensorflow/compiler/xla/service/scatter_expander.cc
@@ -0,0 +1,350 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/scatter_expander.h"
+
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_creation_utils.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/while_util.h"
+#include "tensorflow/compiler/xla/statusor.h"
+
+namespace xla {
+
+using tensorflow::gtl::ArraySlice;
+
+// Transposes the given scatter_indices such that the index_vector_dim becomes
+// the most-minor dimension.
+static StatusOr<HloInstruction*> TransposeIndexVectorDimToLast(
+    HloInstruction* scatter_indices, int64 index_vector_dim) {
+  const Shape& scatter_indices_shape = scatter_indices->shape();
+
+  if (scatter_indices_shape.dimensions_size() == index_vector_dim) {
+    return scatter_indices;
+  }
+
+  if (index_vector_dim == (scatter_indices_shape.dimensions_size() - 1)) {
+    return scatter_indices;
+  }
+
+  std::vector<int64> permutation;
+  permutation.reserve(scatter_indices_shape.dimensions_size());
+  for (int64 i = 0, e = scatter_indices_shape.dimensions_size(); i < e; i++) {
+    if (i != index_vector_dim) {
+      permutation.push_back(i);
+    }
+  }
+  permutation.push_back(index_vector_dim);
+  return MakeTransposeHlo(scatter_indices, permutation);
+}
+
+// Canonicalizes the scatter_indices tensor in order to keep them uniform while
+// performing the scatter operation.
+static StatusOr<HloInstruction*> CanonicalizeScatterIndices(
+    HloInstruction* scatter_indices, int64 index_vector_dim) {
+  // Transpose the non-index-vector dimensions to the front.
+  TF_ASSIGN_OR_RETURN(
+      HloInstruction * transposed_scatter_indices,
+      TransposeIndexVectorDimToLast(scatter_indices, index_vector_dim));
+  bool indices_are_scalar =
+      index_vector_dim == scatter_indices->shape().dimensions_size();
+
+  // The number of dimensions in scatter_indices that are index dimensions.
+  const int64 index_dims_in_scatter_indices = indices_are_scalar ? 0 : 1;
+
+  // If there is only one index (i.e. scatter_indices has rank 1 and this
+  // scatter is really just a dynamic update slice) add a leading degenerate
+  // dimension for uniformity.  Otherwise create a "collapsed" leading dimension
+  // that subsumes all of the non-index-vector dimensions.
+  const Shape& shape = transposed_scatter_indices->shape();
+  if (shape.dimensions_size() == index_dims_in_scatter_indices) {
+    return PrependDegenerateDims(transposed_scatter_indices, 1);
+  } else {
+    // Collapse all but the dimensions (0 or 1) in scatter_indices containing
+    // the index vectors.
+    return CollapseFirstNDims(
+        transposed_scatter_indices,
+        shape.dimensions_size() - index_dims_in_scatter_indices);
+  }
+}
+
+// Permutes the `updates` tensor such that all the scatter dims appear in the
+// major dimensions and all the window dimensions appear in the minor
+// dimensions.
+static StatusOr<HloInstruction*> PermuteScatterAndWindowDims(
+    HloInstruction* updates, ArraySlice<int64> update_window_dims) {
+  std::vector<int64> permutation;
+  const int64 updates_rank = ShapeUtil::Rank(updates->shape());
+  permutation.reserve(updates_rank);
+
+  for (int64 i = 0; i < updates_rank; ++i) {
+    bool is_scatter_dim = !c_binary_search(update_window_dims, i);
+    if (is_scatter_dim) {
+      permutation.push_back(i);
+    }
+  }
+  for (auto window_dim : update_window_dims) {
+    permutation.push_back(window_dim);
+  }
+
+  return MakeTransposeHlo(updates, permutation);
+}
+
+// Expands or contracts the scatter indices in the updates tensor.
+static StatusOr<HloInstruction*> AdjustScatterDims(
+    const Shape& scatter_indices_shape, HloInstruction* updates,
+    int64 index_vector_dim) {
+  int64 num_scatter_dims = scatter_indices_shape.dimensions_size();
+  if (index_vector_dim < scatter_indices_shape.dimensions_size()) {
+    --num_scatter_dims;
+  }
+  if (num_scatter_dims == 0) {
+    // If there are no scatter dims, this must be a dynamic-update-slice kind of
+    // scatter. In this case, we prepend a degenerate dimension to work
+    // uniformly in the while loop.
+    return PrependDegenerateDims(updates, 1);
+  }
+  return CollapseFirstNDims(updates, num_scatter_dims);
+}
+
+// Expands an index vector from the scatter_indices tensor into a vector that
+// can be used to dynamic-update-slice to perform the scatter update.
+static StatusOr<HloInstruction*> ExpandIndexVectorIntoOperandSpace(
+    HloInstruction* index_vector, const ScatterDimensionNumbers& dim_numbers,
+    int64 operand_rank) {
+  HloComputation* computation = index_vector->parent();
+  const Shape& index_shape = index_vector->shape();
+  HloInstruction* zero =
+      computation->AddInstruction(HloInstruction::CreateConstant(
+          LiteralUtil::CreateFromDimensions(index_shape.element_type(), {1})));
+
+  // We extract out individual components from the smaller index and concatenate
+  // them (interspersing zeros as needed) into the larger index.
+  std::vector<HloInstruction*> expanded_index_components;
+
+  for (int i = 0; i < operand_rank; i++) {
+    int64 index_vector_dim_index =
+        FindIndex(dim_numbers.scatter_dims_to_operand_dims(), i);
+    if (index_vector_dim_index !=
+        dim_numbers.scatter_dims_to_operand_dims_size()) {
+      TF_ASSIGN_OR_RETURN(
+          HloInstruction * component_to_concat,
+          MakeSliceHlo(index_vector, /*start_indices=*/{index_vector_dim_index},
+                       /*limit_indices=*/{index_vector_dim_index + 1},
+                       /*strides=*/{1}));
+      expanded_index_components.push_back(component_to_concat);
+    } else {
+      expanded_index_components.push_back(zero);
+    }
+  }
+
+  return MakeConcatHlo(expanded_index_components, /*dimension=*/0);
+}
+
+// Body of the while loop that performs the scatter operation using other HLOs.
+static StatusOr<std::vector<HloInstruction*>> ScatterLoopBody(
+    HloInstruction* scatter, HloInstruction* induction_var,
+    const std::vector<HloInstruction*>& loop_state) {
+  const ScatterDimensionNumbers& dim_numbers =
+      scatter->scatter_dimension_numbers();
+  CHECK_EQ(loop_state.size(), 3);
+  HloInstruction* operand = loop_state[0];
+  HloInstruction* scatter_indices = loop_state[1];
+  HloInstruction* updates = loop_state[2];
+
+  bool has_scalar_indices = scatter_indices->shape().dimensions_size() == 1;
+  CHECK_EQ(has_scalar_indices,
+           dim_numbers.index_vector_dim() ==
+               scatter->operand(1)->shape().dimensions_size());
+
+  // Build a vector form of the induction variable of the while loop.
+  TF_ASSIGN_OR_RETURN(
+      HloInstruction * induction_var_as_vector,
+      MakeBroadcastHlo(induction_var, /*broadcast_dimensions=*/{},
+                       /*result_shape_bounds=*/{1}));
+
+  // Pick the index to scatter from scatter_indices based on the induction_var
+  // and transform that to an index into the `operand` space.
+  HloInstruction* index_vector;
+  if (has_scalar_indices) {
+    TF_ASSIGN_OR_RETURN(
+        index_vector,
+        MakeDynamicSliceHlo(scatter_indices, induction_var_as_vector, {1}));
+  } else {
+    TF_ASSIGN_OR_RETURN(
+        HloInstruction * index_into_scatter_indices,
+        PadVectorWithZeros(induction_var_as_vector,
+                           /*zeros_to_prepend=*/0, /*zeros_to_append=*/1));
+    int index_vector_size = scatter_indices->shape().dimensions(1);
+    TF_ASSIGN_OR_RETURN(
+        HloInstruction * index_vector_2d,
+        MakeDynamicSliceHlo(scatter_indices, index_into_scatter_indices,
+                            {1, index_vector_size}));
+    TF_ASSIGN_OR_RETURN(index_vector,
+                        ElideDegenerateDims(index_vector_2d, {0}));
+  }
+  TF_ASSIGN_OR_RETURN(
+      HloInstruction * scatter_slice_start,
+      ExpandIndexVectorIntoOperandSpace(index_vector, dim_numbers,
+                                        operand->shape().dimensions_size()));
+
+  // Extract the slice to be used to update from `updates` tensor for the
+  // induction_var corresponding to this iteration of the while loop.
+  TF_ASSIGN_OR_RETURN(
+      HloInstruction * index_into_updates,
+      PadVectorWithZeros(
+          induction_var_as_vector, /*zeros_to_prepend=*/0,
+          /*zeros_to_append=*/updates->shape().dimensions_size() - 1));
+  std::vector<int64> update_slice_bounds(updates->shape().dimensions().begin(),
+                                         updates->shape().dimensions().end());
+  update_slice_bounds[0] = 1;
+  TF_ASSIGN_OR_RETURN(
+      HloInstruction * update_slice,
+      MakeDynamicSliceHlo(updates, index_into_updates, update_slice_bounds));
+  TF_ASSIGN_OR_RETURN(HloInstruction * update_slice_for_scatter,
+                      ElideDegenerateDims(update_slice, {0}));
+  TF_ASSIGN_OR_RETURN(
+      HloInstruction * update_slice_with_dims_inserted,
+      InsertDegenerateDims(update_slice_for_scatter,
+                           AsInt64Slice(dim_numbers.inserted_window_dims())));
+
+  // Extract the slice to update from `operand` tensor.
+  const Shape& update_slice_shape = update_slice_with_dims_inserted->shape();
+  TF_ASSIGN_OR_RETURN(
+      HloInstruction * operand_slice_to_update,
+      MakeDynamicSliceHlo(operand, scatter_slice_start,
+                          AsInt64Slice(update_slice_shape.dimensions())));
+
+  // Compute the new value for the slice to be updated in `operand` tensor by
+  // combining the existing value and the update value using the update
+  // computation.
+  TF_ASSIGN_OR_RETURN(
+      HloInstruction * updated_operand_slice,
+      MakeMapHlo({operand_slice_to_update, update_slice_with_dims_inserted},
+                 scatter->to_apply()));
+
+  // Write the updated value of the slice into `operand` tensor.
+  TF_ASSIGN_OR_RETURN(HloInstruction * updated_operand,
+                      MakeDynamicUpdateSliceHlo(operand, updated_operand_slice,
+                                                scatter_slice_start));
+
+  return StatusOr<std::vector<HloInstruction*>>{
+      {updated_operand, scatter_indices, updates}};
+}
+
+// High Level Algorithm.
+//
+// 1. Canonicalize the scatter_indices tensor such that it has rank 2, where
+//    each row is an index into the operand.
+// 2. Canonicalize the updates tensor such that it has rank `num_window_dims+1`
+//    and the scatter dim is the most-major dimension.
+// 3. Iterate over the set of indices in the canonicalized scatter_indices
+//    tensor using a while loop, updating the operand for each such index. Each
+//    iteration of this while loop performs the following:
+//      a. Pick the index from scatter_indices for this iteration.
+//      b. Transform this index into an index into the operand space.
+//      c. Extract the slice to be used to update from the updates tensor.
+//      d. Extract the slice to update from the operand tensor.
+//      e. Compute the new value for the slice to update by combining the slices
+//         from c. and d. using the update_computation of scatter.
+//      f. Write the updated value of the slice into the operand tensor.
+
+StatusOr<HloInstruction*> ScatterExpander::ExpandScatter(
+    HloInstruction* scatter) {
+  HloInstruction* operand = scatter->mutable_operand(0);
+  HloInstruction* scatter_indices = scatter->mutable_operand(1);
+  HloInstruction* updates = scatter->mutable_operand(2);
+  const ScatterDimensionNumbers& dim_numbers =
+      scatter->scatter_dimension_numbers();
+
+  // If the updates tensor is empty, there is no need to update the operand. We
+  // can return the operand as is.
+  if (ShapeUtil::IsZeroElementArray(updates->shape())) {
+    return operand;
+  }
+
+  // Compute the trip count for the while loop to be used for scatter. This
+  // should be the number of indices we should scatter into the operand.
+  const Shape& scatter_indices_shape = scatter_indices->shape();
+  int64 scatter_loop_trip_count = 1;
+  for (int64 i = 0, e = scatter_indices_shape.dimensions_size(); i < e; i++) {
+    if (i != dim_numbers.index_vector_dim()) {
+      scatter_loop_trip_count *= scatter_indices_shape.dimensions(i);
+    }
+  }
+  if (!IsInt32(scatter_loop_trip_count)) {
+    return Unimplemented(
+        "Scatter operations with more than 2147483647 scatter indices are not "
+        "supported. This error occurred for %s.",
+        scatter->ToString().c_str());
+  }
+
+  // Canonicalize the scatter_indices, after which the size of its most-major
+  // dimension must be same as the while loop trip count.
+  TF_ASSIGN_OR_RETURN(HloInstruction * canonical_scatter_indices,
+                      CanonicalizeScatterIndices(
+                          scatter_indices, dim_numbers.index_vector_dim()));
+  CHECK_EQ(scatter_loop_trip_count,
+           canonical_scatter_indices->shape().dimensions(0));
+
+  // Canonicalize the updates, after which the size of its most-major dimension
+  // must be same as the while loop trip count.
+  TF_ASSIGN_OR_RETURN(
+      HloInstruction * canonical_updates,
+      PermuteScatterAndWindowDims(
+          updates, AsInt64Slice(dim_numbers.update_window_dims())));
+  TF_ASSIGN_OR_RETURN(
+      HloInstruction * adjusted_canonical_updates,
+      AdjustScatterDims(scatter_indices->shape(), canonical_updates,
+                        dim_numbers.index_vector_dim()));
+  CHECK_EQ(scatter_loop_trip_count,
+           adjusted_canonical_updates->shape().dimensions(0));
+
+  // The while loop that implements the scatter operation.
+  StatusOr<std::vector<HloInstruction*>> scatter_loop_result_status =
+      WhileUtil::MakeCountedLoop(
+          scatter->parent(), scatter_loop_trip_count,
+          {operand, canonical_scatter_indices, adjusted_canonical_updates},
+          [&](HloInstruction* induction_var,
+              const std::vector<HloInstruction*>& loop_state) {
+            return ScatterLoopBody(scatter, induction_var, loop_state);
+          });
+  TF_ASSIGN_OR_RETURN(std::vector<HloInstruction*> scatter_loop_result,
+                      scatter_loop_result_status);
+  return scatter_loop_result.front();
+}
+
+StatusOr<bool> ScatterExpander::Run(HloModule* module) {
+  std::vector<HloInstruction*> scatter_instrs;
+  for (HloComputation* computation : module->MakeNonfusionComputations()) {
+    for (HloInstruction* instr : computation->instructions()) {
+      if (instr->opcode() == HloOpcode::kScatter) {
+        scatter_instrs.push_back(instr);
+      }
+    }
+  }
+
+  for (auto instr : scatter_instrs) {
+    TF_ASSIGN_OR_RETURN(HloInstruction * expanded_root, ExpandScatter(instr));
+    TF_RETURN_IF_ERROR(
+        instr->parent()->ReplaceInstruction(instr, expanded_root));
+  }
+
+  return !scatter_instrs.empty();
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/scatter_expander.h b/tensorflow/compiler/xla/service/scatter_expander.h
new file mode 100644
index 0000000..8f735e8
--- /dev/null
+++ b/tensorflow/compiler/xla/service/scatter_expander.h
@@ -0,0 +1,34 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_SCATTER_EXPANDER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_SCATTER_EXPANDER_H_
+
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+
+namespace xla {
+
+class ScatterExpander : public HloPassInterface {
+ public:
+  tensorflow::StringPiece name() const override { return "scatter_expander"; }
+  StatusOr<bool> Run(HloModule* module) override;
+
+ private:
+  StatusOr<HloInstruction*> ExpandScatter(HloInstruction* scatter);
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_SCATTER_EXPANDER_H_
diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index 212db06..1dbf540 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -53,6 +53,7 @@
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/ptr_util.h"
 
 using ::tensorflow::strings::Printf;
 using ::tensorflow::strings::StrCat;
@@ -408,7 +409,7 @@
       streams.push_back(std::move(stream));
 
       if (replica == 0 && profile != nullptr) {
-        timers.emplace_back(new se::Timer(streams.back()->parent()));
+        timers.push_back(MakeUnique<se::Timer>(streams.back()->parent()));
         streams.back()
             ->InitTimer(timers.back().get())
             .ThenStartTimer(timers.back().get());
@@ -440,7 +441,7 @@
         streams.back()->ThenStopTimer(timers.back().get());
       }
 
-      result_buffers.emplace_back(std::move(result));
+      result_buffers.push_back(std::move(result));
     }
     TF_ASSIGN_OR_RETURN(GlobalDataHandle handle,
                         allocation_tracker_.RegisterReplicatedBuffers(
@@ -558,7 +559,7 @@
   std::vector<tensorflow::gtl::ArraySlice<const ShapedBuffer*>>
       replicated_arguments;
   for (const auto& arg : arguments) {
-    replicated_arguments.emplace_back(arg);
+    replicated_arguments.push_back(arg);
   }
 
   TF_ASSIGN_OR_RETURN(auto results, executable->ExecuteOnStreams(
@@ -1052,11 +1053,12 @@
     executor = replicas[arg->replica_id()];
   }
 
-  Literal literal;
+  auto literal = Literal::CreateFromShape(arg->shape_with_layout());
+
   TF_RETURN_IF_ERROR(
       execute_backend_->transfer_manager()->TransferLiteralFromOutfeed(
-          executor, arg->shape_with_layout(), &literal));
-  *result->mutable_literal() = literal.ToProto();
+          executor, arg->shape_with_layout(), *literal));
+  *result->mutable_literal() = literal->ToProto();
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc
index c888bbf..cc1ec17 100644
--- a/tensorflow/compiler/xla/service/shape_inference.cc
+++ b/tensorflow/compiler/xla/service/shape_inference.cc
@@ -1530,7 +1530,7 @@
 
 /* static */ StatusOr<Shape> ShapeInference::InferConvolveShape(
     const Shape& lhs, const Shape& rhs, const Window& window,
-    const ConvolutionDimensionNumbers& dnums) {
+    const ConvolutionDimensionNumbers& dnums, int64 feature_group_count) {
   TF_RETURN_IF_ERROR(ExpectArray(lhs, "lhs of convolution"));
   TF_RETURN_IF_ERROR(ExpectArray(rhs, "rhs of convolution"));
 
@@ -1640,12 +1640,13 @@
   const int64 kernel_output_features =
       rhs.dimensions(dnums.kernel_output_feature_dimension());
 
-  if (input_features != kernel_input_features) {
+  if (input_features != kernel_input_features * feature_group_count) {
     return InvalidArgument(
         "Expected LHS feature dimension (value %lld) to match RHS "
-        "input feature dimension (value %lld); got <conv>(%s, %s)\n"
+        "input feature dimension * feature_group_count (value %lld); "
+        "got <conv>(%s, %s)\n"
         "Dimension numbers: {%s}.",
-        input_features, kernel_input_features,
+        input_features, kernel_input_features * feature_group_count,
         ShapeUtil::HumanString(lhs).c_str(),
         ShapeUtil::HumanString(rhs).c_str(), dnums.DebugString().c_str());
   }
@@ -1779,6 +1780,51 @@
   return ShapeUtil::MakeTupleShape(operand_shape_values);
 }
 
+/* static */ StatusOr<Shape> ShapeInference::InferAllToAllShape(
+    const Shape& shape, int64 split_dimension, int64 concat_dimension,
+    int64 split_count) {
+  TF_RET_CHECK(split_count > 0);
+  if (split_dimension >= ShapeUtil::Rank(shape) || split_dimension < 0) {
+    return InvalidArgument(
+        "AllToAll split_dimension %lld is out-of-bounds in shape %s.",
+        split_dimension, ShapeUtil::HumanString(shape).c_str());
+  }
+  if (concat_dimension >= ShapeUtil::Rank(shape) || concat_dimension < 0) {
+    return InvalidArgument(
+        "AllToAll concat_dimension %lld is out-of-bounds in shape %s.",
+        concat_dimension, ShapeUtil::HumanString(shape).c_str());
+  }
+  if (shape.dimensions(split_dimension) % split_count != 0) {
+    return InvalidArgument(
+        "AllToAll split dimension size %lld must be divisible by split_count "
+        "%lld.",
+        shape.dimensions(split_dimension), split_count);
+  }
+  std::vector<int64> new_dimensions(shape.dimensions().begin(),
+                                    shape.dimensions().end());
+  new_dimensions[split_dimension] /= split_count;
+  new_dimensions[concat_dimension] *= split_count;
+  return ShapeUtil::MakeShape(shape.element_type(), new_dimensions);
+}
+
+/* static */ StatusOr<Shape> ShapeInference::InferAllToAllTupleShape(
+    tensorflow::gtl::ArraySlice<const Shape*> operand_shapes) {
+  // An Alltoall HLO instruction receives N operands (with the same shape) and
+  // returns a tuple that contains N array shapes.
+  TF_RET_CHECK(!operand_shapes.empty());
+  for (int i = 0; i < operand_shapes.size(); i++) {
+    if (!ShapeUtil::Equal(*operand_shapes[0], *operand_shapes[i])) {
+      return InvalidArgument(
+          "HLO all-to-all has operands with different shapes: the 0th "
+          "operand has shape %s, but the %dth operand has shape %s.",
+          ShapeUtil::HumanString(*operand_shapes[0]).c_str(), i,
+          ShapeUtil::HumanString(*operand_shapes[i]).c_str());
+    }
+  }
+
+  return InferVariadicOpShape(HloOpcode::kTuple, operand_shapes);
+}
+
 /* static */ StatusOr<Shape> ShapeInference::InferReduceShape(
     tensorflow::gtl::ArraySlice<const Shape*> arg_shapes,
     tensorflow::gtl::ArraySlice<int64> dimensions_to_reduce,
@@ -2446,201 +2492,196 @@
 
 static Status ValidateGatherDimensionNumbers(
     const Shape& input_shape,
-    tensorflow::gtl::ArraySlice<int64> gather_indices_shape,
+    tensorflow::gtl::ArraySlice<int64> start_indices_shape,
     const GatherDimensionNumbers& dim_numbers) {
-  if (!c_is_sorted(dim_numbers.output_window_dims())) {
+  if (!c_is_sorted(dim_numbers.offset_dims())) {
     return InvalidArgument(
         "Output window dimensions in gather op must be ascending; got: %s.",
-        Join(dim_numbers.output_window_dims(), ", ").c_str());
+        Join(dim_numbers.offset_dims(), ", ").c_str());
   }
 
-  if (c_adjacent_find(dim_numbers.output_window_dims()) !=
-      dim_numbers.output_window_dims().end()) {
+  if (c_adjacent_find(dim_numbers.offset_dims()) !=
+      dim_numbers.offset_dims().end()) {
     return InvalidArgument(
         "Output window dimensions in gather op must not repeat; got: %s.",
-        Join(dim_numbers.output_window_dims(), ", ").c_str());
+        Join(dim_numbers.offset_dims(), ", ").c_str());
   }
 
-  const int64 output_window_dim_count = dim_numbers.output_window_dims_size();
+  const int64 output_offset_dim_count = dim_numbers.offset_dims_size();
   const int64 output_shape_rank =
-      output_window_dim_count + gather_indices_shape.size() - 1;
+      output_offset_dim_count + start_indices_shape.size() - 1;
 
-  for (int i = 0; i < dim_numbers.output_window_dims_size(); ++i) {
-    int64 window_index = dim_numbers.output_window_dims(i);
-    if (window_index < 0 || window_index >= output_shape_rank) {
+  for (int i = 0; i < dim_numbers.offset_dims_size(); ++i) {
+    int64 offset_dim = dim_numbers.offset_dims(i);
+    if (offset_dim < 0 || offset_dim >= output_shape_rank) {
       return InvalidArgument(
-          "Window index %d in gather op is out of bounds; got %lld, but should "
+          "Offset dimension %d in gather op is out of bounds; got %lld, but "
+          "should "
           "have been in [0,%lld).",
-          i, window_index, output_shape_rank);
+          i, offset_dim, output_shape_rank);
     }
   }
 
-  if (dim_numbers.gather_dims_to_operand_dims_size() !=
-      gather_indices_shape[dim_numbers.index_vector_dim()]) {
+  if (dim_numbers.start_index_map_size() !=
+      start_indices_shape[dim_numbers.index_vector_dim()]) {
     return InvalidArgument(
-        "Gather op has %d elements in gather_dims_to_operand_dims and the "
-        "bound of dimension index_vector_dim=%lld of gather_indices is "
+        "Gather op has %d elements in start_index_map and the "
+        "bound of dimension index_vector_dim=%lld of start_indices is "
         "%lld. These two numbers must be equal.",
-        dim_numbers.gather_dims_to_operand_dims_size(),
-        dim_numbers.index_vector_dim(),
-        gather_indices_shape[dim_numbers.index_vector_dim()]);
+        dim_numbers.start_index_map_size(), dim_numbers.index_vector_dim(),
+        start_indices_shape[dim_numbers.index_vector_dim()]);
   }
 
-  for (int i = 0; i < dim_numbers.gather_dims_to_operand_dims_size(); i++) {
-    int64 gather_dim_to_input_dim = dim_numbers.gather_dims_to_operand_dims(i);
-    if (gather_dim_to_input_dim < 0 ||
-        gather_dim_to_input_dim >= input_shape.dimensions_size()) {
+  for (int i = 0; i < dim_numbers.start_index_map_size(); i++) {
+    int64 operand_dim_for_start_index_i = dim_numbers.start_index_map(i);
+    if (operand_dim_for_start_index_i < 0 ||
+        operand_dim_for_start_index_i >= input_shape.dimensions_size()) {
       return InvalidArgument(
-          "Invalid gather_dims_to_operand_dims mapping; domain is [0, %d), "
-          "got: %d->%lld.",
-          input_shape.dimensions_size(), i, gather_dim_to_input_dim);
+          "Invalid start_index_map; domain is [0, %d), got: %d->%lld.",
+          input_shape.dimensions_size(), i, operand_dim_for_start_index_i);
     }
   }
 
-  std::vector<int64> sorted_gather_dims_to_operand_dims(
-      dim_numbers.gather_dims_to_operand_dims().begin(),
-      dim_numbers.gather_dims_to_operand_dims().end());
+  std::vector<int64> sorted_start_index_map(
+      dim_numbers.start_index_map().begin(),
+      dim_numbers.start_index_map().end());
 
-  c_sort(sorted_gather_dims_to_operand_dims);
+  c_sort(sorted_start_index_map);
 
-  if (c_adjacent_find(sorted_gather_dims_to_operand_dims) !=
-      sorted_gather_dims_to_operand_dims.end()) {
+  if (c_adjacent_find(sorted_start_index_map) != sorted_start_index_map.end()) {
     return InvalidArgument(
-        "Repeated dimensions are not allowed in gather_dims_to_operand_dims; "
+        "Repeated dimensions are not allowed in start_index_map; "
         "got: %s.",
-        Join(dim_numbers.gather_dims_to_operand_dims(), ", ").c_str());
+        Join(dim_numbers.start_index_map(), ", ").c_str());
   }
 
-  for (int64 elided_dim : dim_numbers.elided_window_dims()) {
-    if (elided_dim < 0 || elided_dim >= input_shape.dimensions_size()) {
+  for (int64 collapsed_dim : dim_numbers.collapsed_slice_dims()) {
+    if (collapsed_dim < 0 || collapsed_dim >= input_shape.dimensions_size()) {
       return InvalidArgument(
-          "Invalid elided_window_dims set in gather op; valid range is [0, "
+          "Invalid collapsed_slice_dims set in gather op; valid range is [0, "
           "%d), got: %lld.",
-          input_shape.dimensions_size(), elided_dim);
+          input_shape.dimensions_size(), collapsed_dim);
     }
   }
 
-  if (!c_is_sorted(dim_numbers.elided_window_dims())) {
+  if (!c_is_sorted(dim_numbers.collapsed_slice_dims())) {
     return InvalidArgument(
-        "elided_window_dims in gather op must be sorted; got: %s",
-        Join(dim_numbers.elided_window_dims(), ", ").c_str());
+        "collapsed_slice_dims in gather op must be sorted; got: %s",
+        Join(dim_numbers.collapsed_slice_dims(), ", ").c_str());
   }
 
-  if (c_adjacent_find(dim_numbers.elided_window_dims()) !=
-      dim_numbers.elided_window_dims().end()) {
+  if (c_adjacent_find(dim_numbers.collapsed_slice_dims()) !=
+      dim_numbers.collapsed_slice_dims().end()) {
     return InvalidArgument(
-        "Repeated dimensions not allowed in elided_window_dims in gather op; "
+        "Repeated dimensions not allowed in collapsed_slice_dims in gather op; "
         "got: %s.",
-        Join(dim_numbers.elided_window_dims(), ", ").c_str());
+        Join(dim_numbers.collapsed_slice_dims(), ", ").c_str());
   }
 
   return Status::OK();
 }
 
 /*static*/ StatusOr<Shape> ShapeInference::InferGatherShape(
-    const Shape& input_shape, const Shape& gather_indices_shape,
+    const Shape& input_shape, const Shape& start_indices_shape,
     const GatherDimensionNumbers& gather_dim_numbers,
-    tensorflow::gtl::ArraySlice<int64> window_bounds) {
+    tensorflow::gtl::ArraySlice<int64> slice_sizes) {
   TF_RETURN_IF_ERROR(
       ExpectArray(input_shape, "input tensor operand gather op"));
   TF_RETURN_IF_ERROR(
-      ExpectArray(gather_indices_shape, "gather indices operand of gather op"));
+      ExpectArray(start_indices_shape, "gather indices operand of gather op"));
 
-  if (!ShapeUtil::ElementIsIntegral(gather_indices_shape)) {
+  if (!ShapeUtil::ElementIsIntegral(start_indices_shape)) {
     return InvalidArgument(
         "Gather indices parameter must be an integral tensor; got %s.",
-        ShapeUtil::HumanString(gather_indices_shape).c_str());
+        ShapeUtil::HumanString(start_indices_shape).c_str());
   }
 
   // We implicitly reshape gather indices of shape P[A,B,C] to P[A,B,C,1] if
   // index_vector_dim is rank(P).  The bounds of this expanded shape is
-  // stored in expanded_gather_indices_shape.
+  // stored in expanded_start_indices_shape.
 
-  if (gather_indices_shape.dimensions_size() <
+  if (start_indices_shape.dimensions_size() <
           gather_dim_numbers.index_vector_dim() ||
       gather_dim_numbers.index_vector_dim() < 0) {
     return InvalidArgument(
-        "Gather index leaf dimension must be within [0, rank(gather_indices) + "
-        "1). rank(gather_indices) is %d and gather index leaf dimension is "
+        "Gather index leaf dimension must be within [0, rank(start_indices) + "
+        "1). rank(start_indices) is %d and gather index leaf dimension is "
         "%lld.",
-        gather_indices_shape.dimensions_size(),
+        start_indices_shape.dimensions_size(),
         gather_dim_numbers.index_vector_dim());
   }
 
-  std::vector<int64> expanded_gather_indices_shape;
-  expanded_gather_indices_shape.reserve(gather_indices_shape.dimensions_size());
-  c_copy(gather_indices_shape.dimensions(),
-         std::back_inserter(expanded_gather_indices_shape));
-  if (expanded_gather_indices_shape.size() ==
+  std::vector<int64> expanded_start_indices_shape;
+  expanded_start_indices_shape.reserve(start_indices_shape.dimensions_size());
+  c_copy(start_indices_shape.dimensions(),
+         std::back_inserter(expanded_start_indices_shape));
+  if (expanded_start_indices_shape.size() ==
       gather_dim_numbers.index_vector_dim()) {
-    expanded_gather_indices_shape.push_back(1);
+    expanded_start_indices_shape.push_back(1);
   }
 
   TF_RETURN_IF_ERROR(ValidateGatherDimensionNumbers(
-      input_shape, expanded_gather_indices_shape, gather_dim_numbers));
+      input_shape, expanded_start_indices_shape, gather_dim_numbers));
 
-  if (window_bounds.size() != input_shape.dimensions_size()) {
+  if (slice_sizes.size() != input_shape.dimensions_size()) {
     return InvalidArgument(
-        "Gather op must have one window bound for every input dimension; got: "
-        "len(window_bounds)=%lu, input_shape.rank=%d.",
-        window_bounds.size(), input_shape.dimensions_size());
+        "Gather op must have one slice size for every input dimension; got: "
+        "len(slice_sizes)=%lu, input_shape.rank=%d.",
+        slice_sizes.size(), input_shape.dimensions_size());
   }
 
-  if (window_bounds.size() !=
-      gather_dim_numbers.output_window_dims_size() +
-          gather_dim_numbers.elided_window_dims_size()) {
+  if (slice_sizes.size() !=
+      gather_dim_numbers.offset_dims_size() +
+          gather_dim_numbers.collapsed_slice_dims_size()) {
     return InvalidArgument(
-        "All components of the window index in a gather op must either be a "
-        "output window index or explicitly elided; got len(window_bounds)=%lu, "
-        "output_window_bounds=%s, elided_window_bounds=%s.",
-        window_bounds.size(),
-        Join(gather_dim_numbers.output_window_dims(), ",").c_str(),
-        Join(gather_dim_numbers.elided_window_dims(), ",").c_str());
+        "All components of the offset index in a gather op must either be a "
+        "offset dimension or explicitly collapsed; got len(slice_sizes)=%lu, "
+        "output_slice_sizes=%s, collapsed_slice_dims=%s.",
+        slice_sizes.size(), Join(gather_dim_numbers.offset_dims(), ",").c_str(),
+        Join(gather_dim_numbers.collapsed_slice_dims(), ",").c_str());
   }
 
-  for (int i = 0; i < window_bounds.size(); i++) {
-    int64 window_bound = window_bounds[i];
-    int64 corresponding_input_bound = input_shape.dimensions(i);
-    if (window_bound < 0 || window_bound > corresponding_input_bound) {
+  for (int i = 0; i < slice_sizes.size(); i++) {
+    int64 slice_size = slice_sizes[i];
+    int64 corresponding_input_size = input_shape.dimensions(i);
+    if (slice_size < 0 || slice_size > corresponding_input_size) {
       return InvalidArgument(
-          "Window bound at index %d in gather op is out of range, must be "
-          "within "
-          "[0, %lld), got %lld.",
-          i, corresponding_input_bound + 1, window_bound);
+          "Slice size at index %d in gather op is out of range, must be "
+          "within [0, %lld), got %lld.",
+          i, corresponding_input_size + 1, slice_size);
     }
   }
 
-  for (int i = 0; i < gather_dim_numbers.elided_window_dims_size(); i++) {
-    if (window_bounds[gather_dim_numbers.elided_window_dims(i)] != 1) {
+  for (int i = 0; i < gather_dim_numbers.collapsed_slice_dims_size(); i++) {
+    if (slice_sizes[gather_dim_numbers.collapsed_slice_dims(i)] != 1) {
       return InvalidArgument(
-          "Gather op can only elide window indices with bound 1, but bound is "
+          "Gather op can only collapse slice dims with bound 1, but bound is "
           "%lld for index %lld at position %d.",
-          window_bounds[gather_dim_numbers.elided_window_dims(i)],
-          gather_dim_numbers.elided_window_dims(i), i);
+          slice_sizes[gather_dim_numbers.collapsed_slice_dims(i)],
+          gather_dim_numbers.collapsed_slice_dims(i), i);
     }
   }
 
-  int64 result_rank = gather_dim_numbers.output_window_dims_size() +
-                      (expanded_gather_indices_shape.size() - 1);
-  int64 window_dims_seen = 0;
+  int64 result_rank = gather_dim_numbers.offset_dims_size() +
+                      (expanded_start_indices_shape.size() - 1);
+  int64 offset_dims_seen = 0;
   int64 gather_dims_seen = 0;
   std::vector<int64> output_dim_bounds;
   output_dim_bounds.reserve(result_rank);
   for (int64 i = 0; i < result_rank; i++) {
     int64 current_bound;
-    bool is_window_index =
-        c_binary_search(gather_dim_numbers.output_window_dims(), i);
+    bool is_window_index = c_binary_search(gather_dim_numbers.offset_dims(), i);
     if (is_window_index) {
-      while (c_binary_search(gather_dim_numbers.elided_window_dims(),
-                             window_dims_seen)) {
-        window_dims_seen++;
+      while (c_binary_search(gather_dim_numbers.collapsed_slice_dims(),
+                             offset_dims_seen)) {
+        offset_dims_seen++;
       }
-      current_bound = window_bounds[window_dims_seen++];
+      current_bound = slice_sizes[offset_dims_seen++];
     } else {
       if (gather_dims_seen == gather_dim_numbers.index_vector_dim()) {
         gather_dims_seen++;
       }
-      current_bound = expanded_gather_indices_shape[gather_dims_seen++];
+      current_bound = expanded_start_indices_shape[gather_dims_seen++];
     }
 
     output_dim_bounds.push_back(current_bound);
@@ -2791,25 +2832,25 @@
       scatter_dim_numbers));
 
   int64 inserted_dims_seen = 0;
-  std::vector<int64> max_update_window_bounds;
+  std::vector<int64> max_update_slice_sizes;
   for (int i = 0; i < operand_shape.dimensions_size(); ++i) {
     if (inserted_dims_seen < scatter_dim_numbers.inserted_window_dims_size() &&
         scatter_dim_numbers.inserted_window_dims(inserted_dims_seen) == i) {
       ++inserted_dims_seen;
     } else {
-      max_update_window_bounds.push_back(operand_shape.dimensions(i));
+      max_update_slice_sizes.push_back(operand_shape.dimensions(i));
     }
   }
   for (int i = 0; i < scatter_dim_numbers.update_window_dims_size(); ++i) {
     auto update_window_dim = scatter_dim_numbers.update_window_dims(i);
     if (updates_shape.dimensions(update_window_dim) >
-        max_update_window_bounds[i]) {
+        max_update_slice_sizes[i]) {
       return InvalidArgument(
           "Bounds of the window dimensions of updates must not exceed the "
           "bounds of the corresponding dimensions of operand. For dimension "
           "%lld, updates bound is %lld, operand bound is %lld.",
           update_window_dim, updates_shape.dimensions(update_window_dim),
-          max_update_window_bounds[i]);
+          max_update_slice_sizes[i]);
     }
   }
 
diff --git a/tensorflow/compiler/xla/service/shape_inference.h b/tensorflow/compiler/xla/service/shape_inference.h
index 33da323..4974ac9 100644
--- a/tensorflow/compiler/xla/service/shape_inference.h
+++ b/tensorflow/compiler/xla/service/shape_inference.h
@@ -112,18 +112,30 @@
   // filter (rhs) to lhs in the way specified by the fields on window.
   static StatusOr<Shape> InferConvolveShape(
       const Shape& lhs, const Shape& rhs, const Window& window,
-      const ConvolutionDimensionNumbers& dimension_numbers);
+      const ConvolutionDimensionNumbers& dimension_numbers,
+      int64 feature_group_count = 1);
 
   // Infers the shape produced by the given FFT type on the given operand.
   static StatusOr<Shape> InferFftShape(
       const Shape& in, FftType fft_type,
       tensorflow::gtl::ArraySlice<int64> fft_length);
 
-  // Infers the shape produced a cross replica sum with the given operand
+  // Infers the shape produced by a cross replica sum with the given operand
   // shapes.
   static StatusOr<Shape> InferCrossReplicaSumShape(
       tensorflow::gtl::ArraySlice<const Shape*> operand_shapes);
 
+  // Infers final shape of an Alltoall operation that is created by the xla
+  // builder.
+  static StatusOr<Shape> InferAllToAllShape(const Shape& shape,
+                                            int64 split_dimension,
+                                            int64 concat_dimension,
+                                            int64 split_count);
+
+  // Infers the shape of an HLO all-to-all instruction.
+  static StatusOr<Shape> InferAllToAllTupleShape(
+      tensorflow::gtl::ArraySlice<const Shape*> operand_shapes);
+
   // Infers the shape produced by applying the given reduction computation
   // shape to the given input operand shape.
   //
@@ -264,9 +276,9 @@
   // with the given input shape, gather indices shape and gather dimension
   // numbers.
   static StatusOr<Shape> InferGatherShape(
-      const Shape& input_shape, const Shape& gather_indices_shape,
+      const Shape& input_shape, const Shape& start_indices_shape,
       const GatherDimensionNumbers& gather_dim_numbers,
-      tensorflow::gtl::ArraySlice<int64> window_bounds);
+      tensorflow::gtl::ArraySlice<int64> slice_sizes);
 
   // Helper that validates the given input shape, scatter indices shape, updates
   // shape, and scatter dimension numbers that constitute a scatter operation,
diff --git a/tensorflow/compiler/xla/service/shape_inference_test.cc b/tensorflow/compiler/xla/service/shape_inference_test.cc
index a73fa18..4ed8fc6 100644
--- a/tensorflow/compiler/xla/service/shape_inference_test.cc
+++ b/tensorflow/compiler/xla/service/shape_inference_test.cc
@@ -1654,11 +1654,11 @@
                           ShapeInference::InferGatherShape(
                               matrix_64_48_, s64_vector_32_,
                               HloGatherInstruction::MakeGatherDimNumbers(
-                                  /*output_window_dims=*/{0},
-                                  /*elided_window_dims=*/{1},
-                                  /*gather_dims_to_operand_dims=*/{1},
+                                  /*offset_dims=*/{0},
+                                  /*collapsed_slice_dims=*/{1},
+                                  /*start_index_map=*/{1},
                                   /*index_vector_dim=*/1),
-                              /*window_bounds=*/{64, 1}));
+                              /*slice_sizes=*/{64, 1}));
   EXPECT_TRUE(
       ShapeUtil::Equal(gather_shape, ShapeUtil::MakeShape(F32, {64, 32})))
       << ShapeUtil::HumanString(gather_shape);
@@ -1669,11 +1669,11 @@
                           ShapeInference::InferGatherShape(
                               matrix_64_48_, s64_vector_32_,
                               HloGatherInstruction::MakeGatherDimNumbers(
-                                  /*output_window_dims=*/{1},
-                                  /*elided_window_dims=*/{0},
-                                  /*gather_dims_to_operand_dims=*/{0},
+                                  /*offset_dims=*/{1},
+                                  /*collapsed_slice_dims=*/{0},
+                                  /*start_index_map=*/{0},
                                   /*index_vector_dim=*/1),
-                              /*window_bounds=*/{1, 48}));
+                              /*slice_sizes=*/{1, 48}));
   EXPECT_TRUE(
       ShapeUtil::Equal(gather_shape, ShapeUtil::MakeShape(F32, {32, 48})))
       << ShapeUtil::HumanString(gather_shape);
@@ -1684,11 +1684,11 @@
                           ShapeInference::InferGatherShape(
                               matrix_64_48_, s64_4d_tensor_10_9_8_7_1_,
                               HloGatherInstruction::MakeGatherDimNumbers(
-                                  /*output_window_dims=*/{4},
-                                  /*elided_window_dims=*/{0},
-                                  /*gather_dims_to_operand_dims=*/{0},
+                                  /*offset_dims=*/{4},
+                                  /*collapsed_slice_dims=*/{0},
+                                  /*start_index_map=*/{0},
                                   /*index_vector_dim=*/4),
-                              /*window_bounds=*/{1, 48}));
+                              /*slice_sizes=*/{1, 48}));
   EXPECT_TRUE(ShapeUtil::Equal(gather_shape,
                                ShapeUtil::MakeShape(F32, {10, 9, 8, 7, 48})))
       << ShapeUtil::HumanString(gather_shape);
@@ -1700,11 +1700,11 @@
       ShapeInference::InferGatherShape(
           f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_,
           HloGatherInstruction::MakeGatherDimNumbers(
-              /*output_window_dims=*/{4, 5, 6, 7, 8},
-              /*elided_window_dims=*/{},
-              /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
+              /*offset_dims=*/{4, 5, 6, 7, 8},
+              /*collapsed_slice_dims=*/{},
+              /*start_index_map=*/{0, 1, 2, 3, 4},
               /*index_vector_dim=*/4),
-          /*window_bounds=*/{30, 29, 28, 27, 26}));
+          /*slice_sizes=*/{30, 29, 28, 27, 26}));
   EXPECT_TRUE(ShapeUtil::Equal(
       gather_shape,
       ShapeUtil::MakeShape(F32, {10, 9, 8, 7, 30, 29, 28, 27, 26})))
@@ -1717,11 +1717,11 @@
       ShapeInference::InferGatherShape(
           f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_5_7_6_,
           HloGatherInstruction::MakeGatherDimNumbers(
-              /*output_window_dims=*/{4, 5, 6, 7, 8},
-              /*elided_window_dims=*/{},
-              /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
+              /*offset_dims=*/{4, 5, 6, 7, 8},
+              /*collapsed_slice_dims=*/{},
+              /*start_index_map=*/{0, 1, 2, 3, 4},
               /*index_vector_dim=*/2),
-          /*window_bounds=*/{30, 29, 28, 27, 26}));
+          /*slice_sizes=*/{30, 29, 28, 27, 26}));
 
   EXPECT_TRUE(ShapeUtil::Equal(
       gather_shape,
@@ -1735,11 +1735,11 @@
       ShapeInference::InferGatherShape(
           f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_5_10_9_7_6_,
           HloGatherInstruction::MakeGatherDimNumbers(
-              /*output_window_dims=*/{4, 5, 6, 7, 8},
-              /*elided_window_dims=*/{},
-              /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
+              /*offset_dims=*/{4, 5, 6, 7, 8},
+              /*collapsed_slice_dims=*/{},
+              /*start_index_map=*/{0, 1, 2, 3, 4},
               /*index_vector_dim=*/0),
-          /*window_bounds=*/{30, 29, 28, 27, 26}));
+          /*slice_sizes=*/{30, 29, 28, 27, 26}));
 
   EXPECT_TRUE(ShapeUtil::Equal(
       gather_shape,
@@ -1749,16 +1749,15 @@
 
 TEST_F(ScatterGatherShapeInferenceTest, NoOutputGatherDims) {
   // This is equivalent to a dynamic slice.
-  TF_ASSERT_OK_AND_ASSIGN(
-      Shape gather_shape,
-      ShapeInference::InferGatherShape(
-          f32_5d_tensor_50_49_48_47_46_, s64_vector_5_,
-          HloGatherInstruction::MakeGatherDimNumbers(
-              /*output_window_dims=*/{0, 1, 2, 3, 4},
-              /*elided_window_dims=*/{},
-              /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
-              /*index_vector_dim=*/0),
-          /*window_bounds=*/{30, 29, 28, 27, 26}));
+  TF_ASSERT_OK_AND_ASSIGN(Shape gather_shape,
+                          ShapeInference::InferGatherShape(
+                              f32_5d_tensor_50_49_48_47_46_, s64_vector_5_,
+                              HloGatherInstruction::MakeGatherDimNumbers(
+                                  /*offset_dims=*/{0, 1, 2, 3, 4},
+                                  /*collapsed_slice_dims=*/{},
+                                  /*start_index_map=*/{0, 1, 2, 3, 4},
+                                  /*index_vector_dim=*/0),
+                              /*slice_sizes=*/{30, 29, 28, 27, 26}));
 
   EXPECT_TRUE(ShapeUtil::Equal(gather_shape,
                                ShapeUtil::MakeShape(F32, {30, 29, 28, 27, 26})))
@@ -1772,11 +1771,11 @@
                           ShapeInference::InferGatherShape(
                               f32_5d_tensor_50_49_48_47_46_, s64_scalar_,
                               HloGatherInstruction::MakeGatherDimNumbers(
-                                  /*output_window_dims=*/{0, 1, 2, 3},
-                                  /*elided_window_dims=*/{0},
-                                  /*gather_dims_to_operand_dims=*/{0},
+                                  /*offset_dims=*/{0, 1, 2, 3},
+                                  /*collapsed_slice_dims=*/{0},
+                                  /*start_index_map=*/{0},
                                   /*index_vector_dim=*/0),
-                              /*window_bounds=*/{1, 30, 29, 28, 27}));
+                              /*slice_sizes=*/{1, 30, 29, 28, 27}));
 
   EXPECT_TRUE(ShapeUtil::Equal(gather_shape,
                                ShapeUtil::MakeShape(F32, {30, 29, 28, 27})))
@@ -1787,11 +1786,11 @@
   StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       tuple_shape_, s64_vector_32_,
       HloGatherInstruction::MakeGatherDimNumbers(
-          /*output_window_dims=*/{0},
-          /*elided_window_dims=*/{1},
-          /*gather_dims_to_operand_dims=*/{1},
+          /*offset_dims=*/{0},
+          /*collapsed_slice_dims=*/{1},
+          /*start_index_map=*/{1},
           /*index_vector_dim=*/1),
-      /*window_bounds=*/{64, 1});
+      /*slice_sizes=*/{64, 1});
   ASSERT_FALSE(statusor.ok());
   EXPECT_THAT(statusor.status().error_message(),
               HasSubstr("Expected array argument for input"))
@@ -1802,11 +1801,11 @@
   StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       s64_vector_32_, tuple_shape_,
       HloGatherInstruction::MakeGatherDimNumbers(
-          /*output_window_dims=*/{0},
-          /*elided_window_dims=*/{1},
-          /*gather_dims_to_operand_dims=*/{1},
+          /*offset_dims=*/{0},
+          /*collapsed_slice_dims=*/{1},
+          /*start_index_map=*/{1},
           /*index_vector_dim=*/0),
-      /*window_bounds=*/{64, 1});
+      /*slice_sizes=*/{64, 1});
   ASSERT_FALSE(statusor.ok());
   EXPECT_THAT(statusor.status().error_message(),
               HasSubstr("Expected array argument for gather indices"))
@@ -1817,11 +1816,11 @@
   StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       s64_vector_32_, vector_32_,
       HloGatherInstruction::MakeGatherDimNumbers(
-          /*output_window_dims=*/{0},
-          /*elided_window_dims=*/{1},
-          /*gather_dims_to_operand_dims=*/{1},
+          /*offset_dims=*/{0},
+          /*collapsed_slice_dims=*/{1},
+          /*start_index_map=*/{1},
           /*index_vector_dim=*/0),
-      /*window_bounds=*/{64, 1});
+      /*slice_sizes=*/{64, 1});
   ASSERT_FALSE(statusor.ok());
   EXPECT_THAT(statusor.status().error_message(),
               HasSubstr("Gather indices parameter must be an integral tensor"))
@@ -1833,11 +1832,11 @@
   StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_,
       HloGatherInstruction::MakeGatherDimNumbers(
-          /*output_window_dims=*/{4, 5, 6, 8, 7},
-          /*elided_window_dims=*/{},
-          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
+          /*offset_dims=*/{4, 5, 6, 8, 7},
+          /*collapsed_slice_dims=*/{},
+          /*start_index_map=*/{0, 1, 2, 3, 4},
           /*index_vector_dim=*/4),
-      /*window_bounds=*/{30, 29, 28, 27, 26});
+      /*slice_sizes=*/{30, 29, 28, 27, 26});
   ASSERT_FALSE(statusor.ok());
   EXPECT_THAT(
       statusor.status().error_message(),
@@ -1850,11 +1849,11 @@
   StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_,
       HloGatherInstruction::MakeGatherDimNumbers(
-          /*output_window_dims=*/{4, 5, 6, 7, 7},
-          /*elided_window_dims=*/{},
-          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
+          /*offset_dims=*/{4, 5, 6, 7, 7},
+          /*collapsed_slice_dims=*/{},
+          /*start_index_map=*/{0, 1, 2, 3, 4},
           /*index_vector_dim=*/4),
-      /*window_bounds=*/{30, 29, 28, 27, 26});
+      /*slice_sizes=*/{30, 29, 28, 27, 26});
   ASSERT_FALSE(statusor.ok());
   EXPECT_THAT(
       statusor.status().error_message(),
@@ -1867,14 +1866,14 @@
   StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_,
       HloGatherInstruction::MakeGatherDimNumbers(
-          /*output_window_dims=*/{4, 5, 99, 100, 101},
-          /*elided_window_dims=*/{},
-          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
+          /*offset_dims=*/{4, 5, 99, 100, 101},
+          /*collapsed_slice_dims=*/{},
+          /*start_index_map=*/{0, 1, 2, 3, 4},
           /*index_vector_dim=*/4),
-      /*window_bounds=*/{30, 29, 28, 27, 26});
+      /*slice_sizes=*/{30, 29, 28, 27, 26});
   ASSERT_FALSE(statusor.ok());
   EXPECT_THAT(statusor.status().error_message(),
-              HasSubstr("Window index 2 in gather op is out of bounds"))
+              HasSubstr("Offset dimension 2 in gather op is out of bounds"))
       << statusor.status();
 }
 
@@ -1883,14 +1882,14 @@
   StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_,
       HloGatherInstruction::MakeGatherDimNumbers(
-          /*output_window_dims=*/{4, 5, 6, 7, 9},
-          /*elided_window_dims=*/{},
-          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
+          /*offset_dims=*/{4, 5, 6, 7, 9},
+          /*collapsed_slice_dims=*/{},
+          /*start_index_map=*/{0, 1, 2, 3, 4},
           /*index_vector_dim=*/4),
-      /*window_bounds=*/{30, 29, 28, 27, 26});
+      /*slice_sizes=*/{30, 29, 28, 27, 26});
   ASSERT_FALSE(statusor.ok());
   EXPECT_THAT(statusor.status().error_message(),
-              HasSubstr("Window index 4 in gather op is out of bounds"))
+              HasSubstr("Offset dimension 4 in gather op is out of bounds"))
       << statusor.status();
 }
 
@@ -1899,16 +1898,16 @@
   StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_,
       HloGatherInstruction::MakeGatherDimNumbers(
-          /*output_window_dims=*/{4, 5, 6, 7, 8},
-          /*elided_window_dims=*/{4},
-          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
+          /*offset_dims=*/{4, 5, 6, 7, 8},
+          /*collapsed_slice_dims=*/{4},
+          /*start_index_map=*/{0, 1, 2, 3, 4},
           /*index_vector_dim=*/4),
-      /*window_bounds=*/{30, 29, 28, 27, 26});
+      /*slice_sizes=*/{30, 29, 28, 27, 26});
   ASSERT_FALSE(statusor.ok());
   EXPECT_THAT(
       statusor.status().error_message(),
-      HasSubstr("All components of the window index in a gather op must either "
-                "be a output window index or explicitly elided"))
+      HasSubstr("All components of the offset index in a gather op must either "
+                "be a offset dimension or explicitly collapsed"))
       << statusor.status();
 }
 
@@ -1917,14 +1916,14 @@
   StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_,
       HloGatherInstruction::MakeGatherDimNumbers(
-          /*output_window_dims=*/{4, 5, 6, 7, 8},
-          /*elided_window_dims=*/{0, 1, 2, 3, 19},
-          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
+          /*offset_dims=*/{4, 5, 6, 7, 8},
+          /*collapsed_slice_dims=*/{0, 1, 2, 3, 19},
+          /*start_index_map=*/{0, 1, 2, 3, 4},
           /*index_vector_dim=*/4),
-      /*window_bounds=*/{30, 29, 28, 27, 26});
+      /*slice_sizes=*/{30, 29, 28, 27, 26});
   ASSERT_FALSE(statusor.ok());
   EXPECT_THAT(statusor.status().error_message(),
-              HasSubstr("Invalid elided_window_dims set in gather op; valid "
+              HasSubstr("Invalid collapsed_slice_dims set in gather op; valid "
                         "range is [0, 5), got: 19"))
       << statusor.status();
 }
@@ -1934,16 +1933,15 @@
   StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_,
       HloGatherInstruction::MakeGatherDimNumbers(
-          /*output_window_dims=*/{4, 5, 6, 7, 8},
-          /*elided_window_dims=*/{0, 1, 2, 3, 3},
-          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
+          /*offset_dims=*/{4, 5, 6, 7, 8},
+          /*collapsed_slice_dims=*/{0, 1, 2, 3, 3},
+          /*start_index_map=*/{0, 1, 2, 3, 4},
           /*index_vector_dim=*/4),
-      /*window_bounds=*/{30, 29, 28, 27, 26});
+      /*slice_sizes=*/{30, 29, 28, 27, 26});
   ASSERT_FALSE(statusor.ok());
-  EXPECT_THAT(
-      statusor.status().error_message(),
-      HasSubstr(
-          "Repeated dimensions not allowed in elided_window_dims in gather op"))
+  EXPECT_THAT(statusor.status().error_message(),
+              HasSubstr("Repeated dimensions not allowed in "
+                        "collapsed_slice_dims in gather op"))
       << statusor.status();
 }
 
@@ -1952,17 +1950,16 @@
   StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_,
       HloGatherInstruction::MakeGatherDimNumbers(
-          /*output_window_dims=*/{4, 5, 6, 7, 8},
-          /*elided_window_dims=*/{},
-          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3},
+          /*offset_dims=*/{4, 5, 6, 7, 8},
+          /*collapsed_slice_dims=*/{},
+          /*start_index_map=*/{0, 1, 2, 3},
           /*index_vector_dim=*/4),
-      /*window_bounds=*/{30, 29, 28, 27, 26});
+      /*slice_sizes=*/{30, 29, 28, 27, 26});
   ASSERT_FALSE(statusor.ok());
-  EXPECT_THAT(
-      statusor.status().error_message(),
-      HasSubstr("Gather op has 4 elements in gather_dims_to_operand_dims and "
-                "the bound of dimension index_vector_dim=4 of "
-                "gather_indices is 5. These two numbers must be equal."))
+  EXPECT_THAT(statusor.status().error_message(),
+              HasSubstr("Gather op has 4 elements in start_index_map and "
+                        "the bound of dimension index_vector_dim=4 of "
+                        "start_indices is 5. These two numbers must be equal."))
       << statusor.status();
 }
 
@@ -1971,16 +1968,14 @@
   StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_,
       HloGatherInstruction::MakeGatherDimNumbers(
-          /*output_window_dims=*/{4, 5, 6, 7, 8},
-          /*elided_window_dims=*/{},
-          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 7},
+          /*offset_dims=*/{4, 5, 6, 7, 8},
+          /*collapsed_slice_dims=*/{},
+          /*start_index_map=*/{0, 1, 2, 3, 7},
           /*index_vector_dim=*/4),
-      /*window_bounds=*/{30, 29, 28, 27, 26});
+      /*slice_sizes=*/{30, 29, 28, 27, 26});
   ASSERT_FALSE(statusor.ok());
-  EXPECT_THAT(
-      statusor.status().error_message(),
-      HasSubstr("Invalid gather_dims_to_operand_dims mapping; domain is "
-                "[0, 5), got: 4->7"))
+  EXPECT_THAT(statusor.status().error_message(),
+              HasSubstr("Invalid start_index_map; domain is [0, 5), got: 4->7"))
       << statusor.status();
 }
 
@@ -1989,16 +1984,15 @@
   StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_,
       HloGatherInstruction::MakeGatherDimNumbers(
-          /*output_window_dims=*/{4, 5, 6, 7, 8},
-          /*elided_window_dims=*/{},
-          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 3},
+          /*offset_dims=*/{4, 5, 6, 7, 8},
+          /*collapsed_slice_dims=*/{},
+          /*start_index_map=*/{0, 1, 2, 3, 3},
           /*index_vector_dim=*/4),
-      /*window_bounds=*/{30, 29, 28, 27, 26});
+      /*slice_sizes=*/{30, 29, 28, 27, 26});
   ASSERT_FALSE(statusor.ok());
   EXPECT_THAT(
       statusor.status().error_message(),
-      HasSubstr(
-          "Repeated dimensions are not allowed in gather_dims_to_operand_dims"))
+      HasSubstr("Repeated dimensions are not allowed in start_index_map"))
       << statusor.status();
 }
 
@@ -2007,14 +2001,14 @@
   StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_,
       HloGatherInstruction::MakeGatherDimNumbers(
-          /*output_window_dims=*/{4, 5, 6, 7, 8},
-          /*elided_window_dims=*/{2, 1},
-          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
+          /*offset_dims=*/{4, 5, 6, 7, 8},
+          /*collapsed_slice_dims=*/{2, 1},
+          /*start_index_map=*/{0, 1, 2, 3, 4},
           /*index_vector_dim=*/4),
-      /*window_bounds=*/{1, 1, 28, 27, 26});
+      /*slice_sizes=*/{1, 1, 28, 27, 26});
   ASSERT_FALSE(statusor.ok());
   EXPECT_THAT(statusor.status().error_message(),
-              HasSubstr("elided_window_dims in gather op must be sorted"))
+              HasSubstr("collapsed_slice_dims in gather op must be sorted"))
       << statusor.status();
 }
 
@@ -2023,15 +2017,15 @@
   StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_,
       HloGatherInstruction::MakeGatherDimNumbers(
-          /*output_window_dims=*/{4, 5, 6, 7},
-          /*elided_window_dims=*/{2},
-          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
+          /*offset_dims=*/{4, 5, 6, 7},
+          /*collapsed_slice_dims=*/{2},
+          /*start_index_map=*/{0, 1, 2, 3, 4},
           /*index_vector_dim=*/4),
-      /*window_bounds=*/{30, 29, 1, 300, 26});
+      /*slice_sizes=*/{30, 29, 1, 300, 26});
   ASSERT_FALSE(statusor.ok());
   EXPECT_THAT(statusor.status().error_message(),
-              HasSubstr("Window bound at index 3 in gather op is out of range, "
-                        "must be within [0, 48), got 300"))
+              HasSubstr("Slice size at index 3 in gather op is out of range, "
+                        "must be within [0, 48), got 300."))
       << statusor.status();
 }
 
@@ -2040,16 +2034,15 @@
   StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_,
       HloGatherInstruction::MakeGatherDimNumbers(
-          /*output_window_dims=*/{4, 5, 6, 7, 8},
-          /*elided_window_dims=*/{},
-          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
+          /*offset_dims=*/{4, 5, 6, 7, 8},
+          /*collapsed_slice_dims=*/{},
+          /*start_index_map=*/{0, 1, 2, 3, 4},
           /*index_vector_dim=*/4),
-      /*window_bounds=*/{30, 29, 28, 26});
+      /*slice_sizes=*/{30, 29, 28, 26});
   ASSERT_FALSE(statusor.ok());
   EXPECT_THAT(
       statusor.status().error_message(),
-      HasSubstr(
-          "Gather op must have one window bound for every input dimension"))
+      HasSubstr("Gather op must have one slice size for every input dimension"))
       << statusor.status();
 }
 
@@ -2058,15 +2051,15 @@
   StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_,
       HloGatherInstruction::MakeGatherDimNumbers(
-          /*output_window_dims=*/{4, 5, 6, 7},
-          /*elided_window_dims=*/{1},
-          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
+          /*offset_dims=*/{4, 5, 6, 7},
+          /*collapsed_slice_dims=*/{1},
+          /*start_index_map=*/{0, 1, 2, 3, 4},
           /*index_vector_dim=*/4),
-      /*window_bounds=*/{30, 29, 28, 26, 20});
+      /*slice_sizes=*/{30, 29, 28, 26, 20});
   ASSERT_FALSE(statusor.ok());
   EXPECT_THAT(statusor.status().error_message(),
-              HasSubstr("Gather op can only elide window indices with bound 1, "
-                        "but bound is 29 for index 1 at position 0"))
+              HasSubstr("Gather op can only collapse slice dims with bound 1, "
+                        "but bound is 29 for index 1 at position 0."))
       << statusor.status();
 }
 
@@ -2074,16 +2067,16 @@
   StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_5_7_6_,
       HloGatherInstruction::MakeGatherDimNumbers(
-          /*output_window_dims=*/{4, 5, 6, 7, 8},
-          /*elided_window_dims=*/{},
-          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
+          /*offset_dims=*/{4, 5, 6, 7, 8},
+          /*collapsed_slice_dims=*/{},
+          /*start_index_map=*/{0, 1, 2, 3, 4},
           /*index_vector_dim=*/32),
-      /*window_bounds=*/{30, 29, 28, 27, 26});
+      /*slice_sizes=*/{30, 29, 28, 27, 26});
 
   ASSERT_FALSE(statusor.ok());
   EXPECT_THAT(statusor.status().error_message(),
               HasSubstr("Gather index leaf dimension must be within [0, "
-                        "rank(gather_indices) + 1)"))
+                        "rank(start_indices) + 1)"))
       << statusor.status();
 }
 
diff --git a/tensorflow/compiler/xla/service/transfer_manager.cc b/tensorflow/compiler/xla/service/transfer_manager.cc
index 7232c65..32d368a 100644
--- a/tensorflow/compiler/xla/service/transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/transfer_manager.cc
@@ -43,15 +43,39 @@
 StatusOr<std::unique_ptr<Literal>> TransferManager::TransferLiteralFromDevice(
     se::Stream* stream, const ShapedBuffer& device_buffer) {
   StatusOr<std::unique_ptr<Literal>> ret;
+
   se::Stream* substream = stream->GetOrCreateSubStream();
   substream->ThenWaitFor(stream);
   auto cleanup = tensorflow::gtl::MakeCleanup(
       [&]() { stream->ReturnSubStream(substream); });
 
   tensorflow::Notification n;
-  TransferLiteralFromDevice(substream, device_buffer,
-                            [&](StatusOr<std::unique_ptr<Literal>> arg) {
-                              ret = std::move(arg);
+  Status s;
+  Literal literal(device_buffer.on_host_shape());
+  TransferLiteralFromDevice(substream, device_buffer, literal,
+                            [&](Status status) {
+                              s = status;
+                              n.Notify();
+                            });
+  n.WaitForNotification();
+  if (!s.ok()) {
+    return s;
+  }
+  return MakeUnique<Literal>(std::move(literal));
+}
+
+Status TransferManager::TransferLiteralFromDevice(
+    se::Stream* stream, const ShapedBuffer& device_buffer,
+    const MutableBorrowingLiteral& literal) {
+  se::Stream* substream = stream->GetOrCreateSubStream();
+  auto cleanup = tensorflow::gtl::MakeCleanup(
+      [&]() { stream->ReturnSubStream(substream); });
+
+  Status ret;
+  tensorflow::Notification n;
+  TransferLiteralFromDevice(substream, device_buffer, literal,
+                            [&](Status status) {
+                              ret = status;
                               n.Notify();
                             });
   n.WaitForNotification();
@@ -76,22 +100,27 @@
 StatusOr<std::unique_ptr<Literal>> TransferManager::TransferArrayFromDevice(
     se::Stream* stream, const Shape& shape,
     const se::DeviceMemoryBase& source) {
+  StatusOr<std::unique_ptr<Literal>> ret;
   // Implement the synchronous version by waiting on the asynchronous version.
   // Use a substream so that if we are called from a HostCallback we don't
   // deadlock.
-  StatusOr<std::unique_ptr<Literal>> ret;
   se::Stream* substream = stream->GetOrCreateSubStream();
   auto cleanup = tensorflow::gtl::MakeCleanup(
       [&]() { stream->ReturnSubStream(substream); });
 
   tensorflow::Notification n;
-  TransferArrayFromDevice(substream, shape, source,
-                          [&](StatusOr<std::unique_ptr<Literal>> arg) {
-                            ret = std::move(arg);
+  Literal literal(shape);
+  Status s;
+  TransferArrayFromDevice(substream, shape, source, literal,
+                          [&](Status status) {
+                            s = status;
                             n.Notify();
                           });
   n.WaitForNotification();
-  return ret;
+  if (!s.ok()) {
+    return s;
+  }
+  return MakeUnique<Literal>(std::move(literal));
 }
 
 Status TransferManager::TransferArrayToDevice(
@@ -130,7 +159,7 @@
 
 void TransferManager::TransferArrayFromDevice(
     se::Stream* stream, const Shape& shape, const se::DeviceMemoryBase& source,
-    std::function<void(StatusOr<std::unique_ptr<Literal>>)> done) {
+    const MutableBorrowingLiteral& literal, std::function<void(Status)> done) {
   if (!ShapeUtil::Equal(HostShapeToDeviceShape(shape), shape)) {
     auto error = StrCat("Shape ", ShapeUtil::HumanString(shape),
                         " has a differently shaped representation on-device: ",
@@ -147,7 +176,8 @@
                              stream->parent()->platform(),
                              stream->parent()->device_ordinal());
   shaped_buffer.set_buffer(source, /*index=*/{});
-  return TransferLiteralFromDevice(stream, shaped_buffer, std::move(done));
+  return TransferLiteralFromDevice(stream, shaped_buffer, literal,
+                                   std::move(done));
 }
 
 /* static */ void TransferManager::RegisterTransferManager(
diff --git a/tensorflow/compiler/xla/service/transfer_manager.h b/tensorflow/compiler/xla/service/transfer_manager.h
index 82c599e..475a2e5 100644
--- a/tensorflow/compiler/xla/service/transfer_manager.h
+++ b/tensorflow/compiler/xla/service/transfer_manager.h
@@ -59,6 +59,9 @@
   // This function should be avoided in favor of the asynchronous version below.
   virtual StatusOr<std::unique_ptr<Literal>> TransferLiteralFromDevice(
       se::Stream* stream, const ShapedBuffer& device_buffer);
+  virtual Status TransferLiteralFromDevice(
+      se::Stream* stream, const ShapedBuffer& device_buffer,
+      const MutableBorrowingLiteral& literal);
 
   // Begins transferring a literal containing the data held in the given
   // ShapedBuffer using the provided executor.
@@ -69,9 +72,10 @@
   //
   // device_buffer is copied by reference and must live at least until done() is
   // invoked.
-  virtual void TransferLiteralFromDevice(
-      se::Stream* stream, const ShapedBuffer& device_buffer,
-      std::function<void(StatusOr<std::unique_ptr<Literal>>)> done) = 0;
+  virtual void TransferLiteralFromDevice(se::Stream* stream,
+                                         const ShapedBuffer& device_buffer,
+                                         MutableBorrowingLiteral literal,
+                                         std::function<void(Status)> done) = 0;
 
   // Transfers the given literal into the previously allocated device memory
   // represented by the given ShapedBuffer using the given executor. The shape
@@ -101,10 +105,10 @@
   // transfer an array at a known address.
   Status TransferArrayToDevice(se::Stream* stream, const LiteralSlice& literal,
                                const se::DeviceMemoryBase& dest);
-  void TransferArrayFromDevice(
-      se::Stream* stream, const Shape& shape,
-      const se::DeviceMemoryBase& source,
-      std::function<void(StatusOr<std::unique_ptr<Literal>>)> done);
+  void TransferArrayFromDevice(se::Stream* stream, const Shape& shape,
+                               const se::DeviceMemoryBase& source,
+                               const MutableBorrowingLiteral& literal,
+                               std::function<void(Status)> done);
 
   Status TransferArrayToDeviceAsync(se::Stream* stream,
                                     const LiteralSlice& literal,
@@ -120,9 +124,9 @@
 
   // Transfers the given literal from the Outfeed interface of the device,
   // using the given executor.
-  virtual Status TransferLiteralFromOutfeed(se::StreamExecutor* executor,
-                                            const Shape& literal_shape,
-                                            Literal* literal) = 0;
+  virtual Status TransferLiteralFromOutfeed(
+      se::StreamExecutor* executor, const Shape& literal_shape,
+      MutableBorrowingLiteral literal) = 0;
 
   // Resets the devices associated with this transfer manager.
   virtual Status ResetDevices(
diff --git a/tensorflow/compiler/xla/service/while_loop_analysis.cc b/tensorflow/compiler/xla/service/while_loop_analysis.cc
new file mode 100644
index 0000000..af2cb6d
--- /dev/null
+++ b/tensorflow/compiler/xla/service/while_loop_analysis.cc
@@ -0,0 +1,238 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/while_loop_analysis.h"
+#include "tensorflow/compiler/xla/service/hlo_evaluator.h"
+
+namespace xla {
+
+using tensorflow::gtl::nullopt;
+using tensorflow::gtl::optional;
+
+// Finds and returns the non-constant operand in instr.
+//
+// CHECK-fails if instr doesn't have exactly one unique non-constant operand.
+static const HloInstruction* NonConstantOperand(const HloInstruction* instr) {
+  const HloInstruction* result = nullptr;
+  for (const HloInstruction* operand : instr->operands()) {
+    if (!operand->IsConstant()) {
+      if (result != nullptr) {
+        CHECK_EQ(result, operand);
+      }
+      result = operand;
+    }
+  }
+  CHECK_NE(result, nullptr);
+  return result;
+}
+
+// If all of instr's operands are either constants or have the form
+//   get-tuple-element(gte_operand, N)
+// for the same value N, returns N.  Otherwise, returns nullopt.
+static optional<int64> GetGTEOperandIndex(const HloInstruction* instr,
+                                          const HloInstruction* gte_operand) {
+  VLOG(2) << "GetGTEOperandIndex(" << instr->ToString() << ", "
+          << gte_operand->ToString() << ")";
+  optional<int64> tuple_idx;
+  for (const HloInstruction* operand : instr->operands()) {
+    if (operand->IsConstant()) {
+      continue;
+    }
+    // Look through copies.
+    // TODO(b/68830972): We wouldn't need this if for loop matching on the GPU
+    // would run before copy insertion.
+    if (operand->opcode() == HloOpcode::kCopy) {
+      operand = operand->operand(0);
+    }
+    if (operand->opcode() != HloOpcode::kGetTupleElement) {
+      VLOG(2) << "instr uses something other than gte(gte_operand): "
+              << operand->ToString();
+      return nullopt;
+    }
+    if (operand->operand(0) != gte_operand) {
+      VLOG(2) << "instr has gte whose operand is not gte_operand: "
+              << operand->ToString();
+      return nullopt;
+    }
+    if (tuple_idx && tuple_idx != operand->tuple_index()) {
+      VLOG(2) << "instr has operands with conflicting gte indices, "
+              << *tuple_idx << " vs " << operand->tuple_index();
+      return nullopt;
+    }
+
+    tuple_idx = operand->tuple_index();
+  }
+  return tuple_idx;
+}
+
+// Tries to get the tuple index of the induction variable of a while loop.
+//
+// Checks that the loop condition and root both plumb the induction variable
+// through the same tuple index, and that they both apply exactly one op to the
+// induction variable before  deciding whether to do another loop iteration (in
+// the loop condition's case) or packing the induction variable into the result
+// tuple (in the loop body's case).
+//
+// Specifically, checks that the loop condition has structure
+//
+//   root = op(constants, get-tuple-elem(param0, N), constants)
+//
+// and the loop body has the structure
+//
+//   inc = op(constants, get-tuple-elem(param0, N), constants)
+//   root = tuple(..., inc, ...)  // inc is N'th operand of tuple().
+//
+// If so, returns N.  Otherwise, returns nullopt.
+static optional<int64> GetLoopInductionVarTupleIdx(
+    const HloInstruction* while_op) {
+  CHECK_EQ(while_op->opcode(), HloOpcode::kWhile);
+  VLOG(2) << "Finding induction variable for loop "
+          << while_op->ToShortString();
+
+  // The while_cond computation should have the form
+  //
+  //   while_cond_root =
+  //       op(constants, get-tuple-elem(while_cond_param, N), constants).
+  //
+  // If it does, set indvar_tuple_idx to N.
+  auto* while_cond = while_op->while_condition();
+  auto* while_cond_root = while_cond->root_instruction();
+  auto* while_cond_param = while_cond->parameter_instruction(0);
+  optional<int64> indvar_tuple_idx =
+      GetGTEOperandIndex(while_cond_root, while_cond_param);
+  if (!indvar_tuple_idx) {
+    VLOG(2) << "Induction variable not found in loop condition: "
+            << while_cond->root_instruction()->ToString();
+    return nullopt;
+  }
+
+  // The while_body computation should have the form
+  //
+  //   while_body_inc =
+  //       op(constants, get-tuple-elem(while_body_param, N), constants)
+  //   while_body_root = tuple(..., while_body_inc, ...)
+  //
+  // where while_body_inc is operand N of while_body_root.
+  auto* while_body = while_op->while_body();
+  auto* while_body_root = while_body->root_instruction();
+  if (while_body_root->opcode() != HloOpcode::kTuple) {
+    VLOG(2) << "While body's root is not a tuple instruction: "
+            << while_body_root->ToString();
+    return nullopt;
+  }
+
+  auto* while_body_inc = while_body_root->operand(*indvar_tuple_idx);
+  auto* while_body_param = while_body->parameter_instruction(0);
+  optional<int64> while_body_indvar_tuple_idx =
+      GetGTEOperandIndex(while_body_inc, while_body_param);
+  if (!while_body_indvar_tuple_idx) {
+    VLOG(2)
+        << "Induction variable not found in while body increment instruction: "
+        << while_body_inc->ToString();
+    return nullopt;
+  }
+  if (while_body_indvar_tuple_idx != indvar_tuple_idx) {
+    VLOG(2) << "Tuple index of induction variable does not match between loop "
+               "condition ("
+            << *indvar_tuple_idx << ") and while body ("
+            << *while_body_indvar_tuple_idx << ")";
+    return nullopt;
+  }
+
+  // Finally, check that the while loop's initial value is a tuple with enough
+  // elements.
+  auto* while_init = while_op->operand(0);
+  if (while_init->opcode() != HloOpcode::kTuple) {
+    VLOG(2) << "While init expected to be a tuple: " << while_init->ToString();
+    return nullopt;
+  }
+
+  VLOG(2) << "Induction variable's tuple index: " << *indvar_tuple_idx;
+  return indvar_tuple_idx;
+}
+
+optional<int64> ComputeWhileLoopTripCount(HloInstruction* while_op,
+                                          int64 max_value_returned) {
+  VLOG(2) << "Getting trip count for loop " << while_op->ToString();
+
+  // The loop's induction variable is found at
+  //
+  //   get-tuple-elem(comp->parameter_instruction(0), *indvar_tuple_idx),
+  //
+  // where comp is while_op->while_body() or while_op->while_condition().
+  optional<int64> indvar_tuple_idx = GetLoopInductionVarTupleIdx(while_op);
+  if (!indvar_tuple_idx) {
+    return nullopt;
+  }
+
+  // Now that we know the index of the induction variable, we can we can try to
+  // compute how many times the loop executes.  Start by computing the induction
+  // variable's initial value.
+  HloEvaluator evaluator(/*max_loop_iterations=*/0);
+  auto* while_init = while_op->mutable_operand(0);
+  auto* indvar_init = while_init->mutable_operand(*indvar_tuple_idx);
+  StatusOr<std::unique_ptr<Literal>> indvar_init_result =
+      evaluator.Evaluate(indvar_init);
+  if (!indvar_init_result.ok()) {
+    VLOG(2) << "Couldn't evaluate induction variable init: "
+            << indvar_init_result.status();
+    return nullopt;
+  }
+
+  auto* while_body = while_op->while_body();
+  auto* while_body_indvar_update =
+      while_body->root_instruction()->operand(*indvar_tuple_idx);
+  auto* while_body_indvar = NonConstantOperand(while_body_indvar_update);
+
+  // The initial value of the induction variable.
+  std::unique_ptr<Literal> indvar_iter_val =
+      std::move(indvar_init_result).ValueOrDie();
+  for (int64 trip_count = 0; trip_count != max_value_returned + 1;
+       ++trip_count) {
+    auto* while_cond = while_op->while_condition();
+    auto* while_cond_root = while_cond->root_instruction();
+    auto* while_cond_indvar = NonConstantOperand(while_cond_root);
+    StatusOr<std::unique_ptr<Literal>> result =
+        evaluator.EvaluateWithSubstitutions(
+            while_cond_root, {{while_cond_indvar, indvar_iter_val.get()}});
+    if (!result.ok()) {
+      VLOG(2) << "Couldn't evaluate while cond: " << result.status();
+      return nullopt;
+    }
+    if (result.ValueOrDie()->data<bool>() ==
+        tensorflow::gtl::ArraySlice<bool>{false}) {
+      VLOG(2) << "Loop has static trip count of " << trip_count;
+      return trip_count;
+    }
+
+    // Calculate the value of the induction variable after one iteration of the
+    // loop, and check whether the while condition is true with this new value.
+    StatusOr<std::unique_ptr<Literal>> indvar_next_result =
+        evaluator.EvaluateWithSubstitutions(
+            while_body_indvar_update,
+            {{while_body_indvar, indvar_iter_val.get()}});
+    if (!indvar_next_result.ok()) {
+      VLOG(2) << "Couldn't evaluate induction variable update: "
+              << indvar_next_result.status();
+      return nullopt;
+    }
+    indvar_iter_val = std::move(indvar_next_result).ValueOrDie();
+  }
+
+  VLOG(2) << "Loop has unknown trip count.";
+  return nullopt;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/while_loop_analysis.h b/tensorflow/compiler/xla/service/while_loop_analysis.h
new file mode 100644
index 0000000..bf59813
--- /dev/null
+++ b/tensorflow/compiler/xla/service/while_loop_analysis.h
@@ -0,0 +1,33 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_WHILE_LOOP_ANALYSIS_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_WHILE_LOOP_ANALYSIS_H_
+
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/core/lib/gtl/optional.h"
+
+namespace xla {
+
+// Returns the precise trip count of the loop if it's statically known,
+// nullopt otherwise. max_value_returned limits the number of steps that are
+// evaluated while trying to brute force a loop trip count, trip counts larger
+// than max_value_returned result in nullopt.
+tensorflow::gtl::optional<int64> ComputeWhileLoopTripCount(
+    HloInstruction *while_op, int64 max_value_returned = 128);
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_WHILE_LOOP_ANALYSIS_H_
diff --git a/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc b/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc
index 266039d..0e7667d 100644
--- a/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc
+++ b/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc
@@ -206,7 +206,8 @@
   p_body.0 = f32[2] get-tuple-element((f32[2],f32[2]) p_body), index=0
   p_body.1 = f32[2] get-tuple-element((f32[2],f32[2]) p_body), index=1
 
-  outfeed = token[] outfeed(p_body.0)
+  token = token[] after-all()
+  outfeed = token[] outfeed(p_body.0, token)
   ROOT root = (f32[2],f32[2],f32[2]) tuple(p_body.0, p_body.1, p_body.1)
 }
 
diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier.cc b/tensorflow/compiler/xla/service/while_loop_simplifier.cc
index ec05a74..dd8697e 100644
--- a/tensorflow/compiler/xla/service/while_loop_simplifier.cc
+++ b/tensorflow/compiler/xla/service/while_loop_simplifier.cc
@@ -15,7 +15,7 @@
 
 #include "tensorflow/compiler/xla/service/while_loop_simplifier.h"
 #include "tensorflow/compiler/xla/service/call_inliner.h"
-#include "tensorflow/compiler/xla/service/hlo_evaluator.h"
+#include "tensorflow/compiler/xla/service/while_loop_analysis.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/lib/strings/str_util.h"
@@ -26,23 +26,6 @@
 using tensorflow::gtl::nullopt;
 using tensorflow::gtl::optional;
 
-// Finds and returns the non-constant operand in instr.
-//
-// CHECK-fails if instr doesn't have exactly one unique non-constant operand.
-static const HloInstruction* NonConstantOperand(const HloInstruction* instr) {
-  const HloInstruction* result = nullptr;
-  for (const HloInstruction* operand : instr->operands()) {
-    if (!operand->IsConstant()) {
-      if (result != nullptr) {
-        CHECK_EQ(result, operand);
-      }
-      result = operand;
-    }
-  }
-  CHECK_NE(result, nullptr);
-  return result;
-}
-
 // Determines whether the given instruction is a send/recv node, or has a
 // subcomputation which contains a send/recv node.
 static bool IsOrContainsSendOrRecv(const HloInstruction* instr);
@@ -72,211 +55,6 @@
   return false;
 }
 
-// If all of instr's operands are either constants or have the form
-//   get-tuple-element(gte_operand, N)
-// for the same value N, returns N.  Otherwise, returns nullopt.
-static optional<int64> GetGTEOperandIndex(const HloInstruction* instr,
-                                          const HloInstruction* gte_operand) {
-  VLOG(2) << "GetGTEOperandIndex(" << instr->ToString() << ", "
-          << gte_operand->ToString() << ")";
-  optional<int64> tuple_idx;
-  for (const HloInstruction* operand : instr->operands()) {
-    if (operand->IsConstant()) {
-      continue;
-    }
-    if (operand->opcode() != HloOpcode::kGetTupleElement) {
-      VLOG(2) << "instr uses something other than gte(gte_operand): "
-              << operand->ToString();
-      return nullopt;
-    }
-    if (operand->operand(0) != gte_operand) {
-      VLOG(2) << "instr has gte whose operand is not gte_operand: "
-              << operand->ToString();
-      return nullopt;
-    }
-    if (tuple_idx && tuple_idx != operand->tuple_index()) {
-      VLOG(2) << "instr has operands with conflicting gte indices, "
-              << *tuple_idx << " vs " << operand->tuple_index();
-      return nullopt;
-    }
-
-    tuple_idx = operand->tuple_index();
-  }
-  return tuple_idx;
-}
-
-// Tries to get the tuple index of the induction variable of a while loop.
-//
-// Checks that the loop condition and root both plumb the induction variable
-// through the same tuple index, and that they both apply exactly one op to the
-// induction variable before  deciding whether to do another loop iteration (in
-// the loop condition's case) or packing the induction variable into the result
-// tuple (in the loop body's case).
-//
-// Specifically, checks that the loop condition has structure
-//
-//   root = op(constants, get-tuple-elem(param0, N), constants)
-//
-// and the loop body has the structure
-//
-//   inc = op(constants, get-tuple-elem(param0, N), constants)
-//   root = tuple(..., inc, ...)  // inc is N'th operand of tuple().
-//
-// If so, returns N.  Otherwise, returns nullopt.
-static optional<int64> GetLoopInductionVarTupleIdx(
-    const HloInstruction* while_op) {
-  CHECK_EQ(while_op->opcode(), HloOpcode::kWhile);
-  VLOG(2) << "Finding induction variable for loop "
-          << while_op->ToShortString();
-
-  // The while_cond computation should have the form
-  //
-  //   while_cond_root =
-  //       op(constants, get-tuple-elem(while_cond_param, N), constants).
-  //
-  // If it does, set indvar_tuple_idx to N.
-  auto* while_cond = while_op->while_condition();
-  auto* while_cond_root = while_cond->root_instruction();
-  auto* while_cond_param = while_cond->parameter_instruction(0);
-  optional<int64> indvar_tuple_idx =
-      GetGTEOperandIndex(while_cond_root, while_cond_param);
-  if (!indvar_tuple_idx) {
-    VLOG(2) << "Induction variable not found in loop condition: "
-            << while_cond->root_instruction()->ToString();
-    return nullopt;
-  }
-
-  // The while_body computation should have the form
-  //
-  //   while_body_inc =
-  //       op(constants, get-tuple-elem(while_body_param, N), constants)
-  //   while_body_root = tuple(..., while_body_inc, ...)
-  //
-  // where while_body_inc is operand N of while_body_root.
-  auto* while_body = while_op->while_body();
-  auto* while_body_root = while_body->root_instruction();
-  if (while_body_root->opcode() != HloOpcode::kTuple) {
-    VLOG(2) << "While body's root is not a tuple instruction: "
-            << while_body_root->ToString();
-    return nullopt;
-  }
-
-  auto* while_body_inc = while_body_root->operand(*indvar_tuple_idx);
-  auto* while_body_param = while_body->parameter_instruction(0);
-  optional<int64> while_body_indvar_tuple_idx =
-      GetGTEOperandIndex(while_body_inc, while_body_param);
-  if (!while_body_indvar_tuple_idx) {
-    VLOG(2)
-        << "Induction variable not found in while body increment instruction: "
-        << while_body_inc->ToString();
-    return nullopt;
-  }
-  if (while_body_indvar_tuple_idx != indvar_tuple_idx) {
-    VLOG(2) << "Tuple index of induction variable does not match between loop "
-               "condition ("
-            << *indvar_tuple_idx << ") and while body ("
-            << *while_body_indvar_tuple_idx << ")";
-    return nullopt;
-  }
-
-  // Finally, check that the while loop's initial value is a tuple with enough
-  // elements.
-  auto* while_init = while_op->operand(0);
-  if (while_init->opcode() != HloOpcode::kTuple) {
-    VLOG(2) << "While init expected to be a tuple: " << while_init->ToString();
-    return nullopt;
-  }
-
-  VLOG(2) << "Induction variable's tuple index: " << *indvar_tuple_idx;
-  return indvar_tuple_idx;
-}
-
-// Tries to determine the number of times the given loop executes.  Currently
-// simply returns 0, 1, or "can't tell" (nullopt).
-static optional<int64> GetLoopTripCount(HloInstruction* while_op) {
-  CHECK_EQ(while_op->opcode(), HloOpcode::kWhile);
-  VLOG(2) << "Getting trip count for loop " << while_op->ToString();
-
-  // The loop's induction variable is found at
-  //
-  //   get-tuple-elem(comp->parameter_instruction(0), *indvar_tuple_idx),
-  //
-  // where comp is while_op->while_body() or while_op->while_condition().
-  optional<int64> indvar_tuple_idx = GetLoopInductionVarTupleIdx(while_op);
-  if (!indvar_tuple_idx) {
-    return nullopt;
-  }
-
-  VLOG(2) << "Induction variable is at index " << *indvar_tuple_idx
-          << " in input tuple.";
-
-  // Now that we know the index of the induction variable, we can we can try to
-  // compute how many times the loop executes.  Start by computing the induction
-  // variable's initial value.
-  HloEvaluator evaluator(/*max_loop_iterations=*/0);
-  auto* while_init = while_op->mutable_operand(0);
-  auto* indvar_init = while_init->mutable_operand(*indvar_tuple_idx);
-  StatusOr<std::unique_ptr<Literal>> indvar_init_result =
-      evaluator.Evaluate(indvar_init);
-  if (!indvar_init_result.ok()) {
-    VLOG(2) << "Couldn't evaluate induction variable init: "
-            << indvar_init_result.status();
-    return nullopt;
-  }
-
-  // Evaluates the while loop's condition, returning either "true" (continue
-  // looping), "false" (stop looping), or nullopt (can't evaluate).
-  auto evaluate_while_cond = [&](const Literal& indvar) -> optional<bool> {
-    auto* while_cond = while_op->while_condition();
-    auto* while_cond_root = while_cond->root_instruction();
-    auto* while_cond_indvar = NonConstantOperand(while_cond_root);
-    StatusOr<std::unique_ptr<Literal>> result =
-        evaluator.EvaluateWithSubstitutions(while_cond_root,
-                                            {{while_cond_indvar, &indvar}});
-    if (!result.ok()) {
-      VLOG(2) << "Couldn't evaluate while cond: " << result.status();
-      return nullopt;
-    }
-    return result.ValueOrDie()->data<bool>() ==
-           tensorflow::gtl::ArraySlice<bool>{true};
-  };
-
-  // The initial value of the induction variable.
-  const Literal& indvar_iter0_val = *indvar_init_result.ValueOrDie();
-
-  // Evaluate whether the while condition is true when seeded with
-  // indvar_iter0_val.
-  optional<bool> while_cond_iter0_val = evaluate_while_cond(indvar_iter0_val);
-  if (while_cond_iter0_val == false) {
-    VLOG(2) << "Loop has static trip count of 0.";
-    return 0;
-  }
-
-  // Calculate the value of the induction variable after one iteration of the
-  // loop, and check whether the while condition is true with this new value.
-  auto* while_body = while_op->while_body();
-  auto* while_body_indvar_update =
-      while_body->root_instruction()->operand(*indvar_tuple_idx);
-  auto* while_body_indvar = NonConstantOperand(while_body_indvar_update);
-  StatusOr<std::unique_ptr<Literal>> indvar_iter1_result =
-      evaluator.EvaluateWithSubstitutions(
-          while_body_indvar_update, {{while_body_indvar, &indvar_iter0_val}});
-  if (!indvar_iter1_result.ok()) {
-    VLOG(2) << "Couldn't evaluate induction variable update: "
-            << indvar_iter1_result.status();
-    return nullopt;
-  }
-  const Literal& indvar_iter1_val = *indvar_iter1_result.ValueOrDie();
-  optional<bool> while_cond_iter1_val = evaluate_while_cond(indvar_iter1_val);
-  if (while_cond_iter1_val == false) {
-    VLOG(2) << "Determined that loop has static trip count of 1.";
-    return 1;
-  }
-
-  VLOG(2) << "Loop has unknown trip count >= 1.";
-  return nullopt;
-}
-
 // Tries to remove elements in a while loop's tuple that aren't used within the
 // loop.
 //
@@ -577,7 +355,9 @@
   }
 
   // Remove while loops with static trip count of 0.
-  optional<int64> trip_count = GetLoopTripCount(while_op);
+  optional<int64> trip_count =
+      ComputeWhileLoopTripCount(while_op,
+                                /*max_value_returned=*/1);
   if (trip_count && *trip_count == 0) {
     // The loop never executes, so the value of the loop is the value of its
     // "init" operand.
diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc
index 34869cc..b69c346 100644
--- a/tensorflow/compiler/xla/shape_util.cc
+++ b/tensorflow/compiler/xla/shape_util.cc
@@ -1014,12 +1014,13 @@
 }
 
 /* static */ int64 ShapeUtil::GetLeafCount(const Shape& shape) {
+  if (!IsTuple(shape)) {
+    return 1;
+  }
   int64 count = 0;
-  ForEachSubshape(shape, [&](const Shape&, const ShapeIndex& index) {
-    if (IsLeafIndex(shape, index)) {
-      ++count;
-    }
-  });
+  for (const Shape& subshape : shape.tuple_shapes()) {
+    count += GetLeafCount(subshape);
+  }
   return count;
 }
 
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index 42d52ae..e280492 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -709,6 +709,21 @@
     ],
 )
 
+xla_test(
+    name = "scatter_test",
+    srcs = ["scatter_test.cc"],
+    deps = [
+        ":client_library_test_base",
+        ":hlo_test_base",
+        "//tensorflow/compiler/xla:execution_options_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/service:hlo_parser",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+    ],
+)
+
 # Repeat dot_operation_runtime_test with single-threaded eigen.
 xla_test(
     name = "dot_operation_single_threaded_runtime_test",
@@ -798,6 +813,7 @@
     "//tensorflow/compiler/xla/client:padding",
     "//tensorflow/compiler/xla/client:xla_builder",
     "//tensorflow/compiler/xla/tests:client_library_test_base",
+    "//tensorflow/compiler/xla/tests:hlo_test_base",
     "//tensorflow/compiler/xla/tests:literal_test_util",
     "//tensorflow/compiler/xla/tests:xla_internal_test_main",
     "//tensorflow/core:lib",
@@ -1140,6 +1156,7 @@
     name = "reduce_window_test",
     timeout = "long",
     srcs = [],
+    shard_count = 20,
     tags = [
         "enable_for_xla_interpreter",
         "optonly",
@@ -2061,6 +2078,8 @@
 xla_test(
     name = "test_utils_test",
     srcs = ["test_utils_test.cc"],
+    # There is nothing backend specific in this test, so just pick an arbitrary backend.
+    backends = ["cpu"],
     deps = [
         ":local_client_test_base",
         ":test_utils",
@@ -2069,6 +2088,7 @@
         "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:lib",
         "//tensorflow/core:test",
     ],
 )
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.h b/tensorflow/compiler/xla/tests/client_library_test_base.h
index 4a6e8a3..b04a3b1 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.h
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.h
@@ -74,8 +74,9 @@
   string TestName() const;
 
   void SetFastMathDisabled(bool disabled) {
-    execution_options_.mutable_debug_options()->set_xla_enable_fast_math(
-        !disabled);
+    auto* opts = execution_options_.mutable_debug_options();
+    opts->set_xla_cpu_enable_fast_math(!disabled);
+    opts->set_xla_gpu_enable_fast_math(!disabled);
   }
 
   void SetSeed(uint64 seed) { execution_options_.set_seed(seed); }
diff --git a/tensorflow/compiler/xla/tests/convolution_test.cc b/tensorflow/compiler/xla/tests/convolution_test.cc
index 5ed8122..689928a 100644
--- a/tensorflow/compiler/xla/tests/convolution_test.cc
+++ b/tensorflow/compiler/xla/tests/convolution_test.cc
@@ -31,6 +31,7 @@
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -465,7 +466,7 @@
 }
 
 template <typename T>
-class Convolve2D_1x3x3x5_3x3x5x5_Valid : public ConvolutionTest {
+class Convolve2D_1x3x3x5_3x3x5x3_Valid : public ConvolutionTest {
  public:
   void RunTest() {
     XlaBuilder builder(TestName());
@@ -520,8 +521,139 @@
   }
 };
 
-TYPED_TEST_CASE(Convolve2D_1x3x3x5_3x3x5x5_Valid, TestTypes);
-TYPED_TEST(Convolve2D_1x3x3x5_3x3x5x5_Valid, Types) { this->RunTest(); }
+TYPED_TEST_CASE(Convolve2D_1x3x3x5_3x3x5x3_Valid, TestTypes);
+TYPED_TEST(Convolve2D_1x3x3x5_3x3x5x3_Valid, Types) { this->RunTest(); }
+
+template <typename T>
+class Convolve2D_1x3x3x5_3x3x1x15_Depthwise_Valid : public ConvolutionTest {
+ public:
+  void RunTest() {
+    XlaBuilder builder(TestName());
+    std::vector<int64> input_dims = {1, 3, 3, 5};
+    std::vector<int64> filter_dims = {3, 3, 1, 15};
+    Shape input_shape = ShapeUtil::MakeShapeWithType<T>(input_dims);
+    Shape filter_shape = ShapeUtil::MakeShapeWithType<T>(filter_dims);
+    {
+      auto input = Parameter(&builder, 0, input_shape, "input");
+      auto filter = Parameter(&builder, 1, filter_shape, "filter");
+
+      // Tensorflow dimension numbers for 2D convolution.
+      ConvolutionDimensionNumbers dnums;
+      dnums.set_input_batch_dimension(0);
+      dnums.set_output_batch_dimension(0);
+      dnums.add_input_spatial_dimensions(1);
+      dnums.add_output_spatial_dimensions(1);
+      dnums.add_input_spatial_dimensions(2);
+      dnums.add_output_spatial_dimensions(2);
+      dnums.set_input_feature_dimension(3);
+      dnums.set_output_feature_dimension(3);
+      dnums.add_kernel_spatial_dimensions(0);
+      dnums.add_kernel_spatial_dimensions(1);
+      dnums.set_kernel_input_feature_dimension(2);
+      dnums.set_kernel_output_feature_dimension(3);
+
+      ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums,
+                                /*feature_group_count=*/5);
+    }
+
+    std::vector<T> input_elems(ShapeUtil::ElementsIn(input_shape));
+    iota_int_init_value(input_elems, 1);
+    auto input_r1 = LiteralUtil::CreateR1<T>(input_elems);
+    auto input_r4 = input_r1->Reshape(input_dims).ConsumeValueOrDie();
+
+    std::vector<T> filter_elems(ShapeUtil::ElementsIn(filter_shape));
+    iota_int_init_value(filter_elems, 1);
+    auto filter_r1 = LiteralUtil::CreateR1<T>(filter_elems);
+    auto filter_r4 = filter_r1->Reshape(filter_dims).ConsumeValueOrDie();
+
+    auto expected_r1 = LiteralUtil::CreateR1<T>(
+        {static_cast<T>(16029), static_cast<T>(16218), static_cast<T>(16407),
+         static_cast<T>(17172), static_cast<T>(17370), static_cast<T>(17568),
+         static_cast<T>(18369), static_cast<T>(18576), static_cast<T>(18783),
+         static_cast<T>(19620), static_cast<T>(19836), static_cast<T>(20052),
+         static_cast<T>(20925), static_cast<T>(21150), static_cast<T>(21375)});
+    auto expected_r4 = expected_r1->Reshape({1, 1, 1, 15}).ConsumeValueOrDie();
+
+    auto input_literal =
+        client_->TransferToServer(*input_r4).ConsumeValueOrDie();
+    auto filter_literal =
+        client_->TransferToServer(*filter_r4).ConsumeValueOrDie();
+
+    ComputeAndCompareLiteral(&builder, *expected_r4,
+                             {input_literal.get(), filter_literal.get()},
+                             error_spec_);
+  }
+};
+
+TYPED_TEST_CASE(Convolve2D_1x3x3x5_3x3x1x15_Depthwise_Valid, TestTypes);
+TYPED_TEST(Convolve2D_1x3x3x5_3x3x1x15_Depthwise_Valid, Types) {
+  this->RunTest();
+}
+
+template <typename T>
+class Convolve2D_1x2x2x6_2x2x1x12_Grouped_Valid : public ConvolutionTest {
+ public:
+  void RunTest() {
+    XlaBuilder builder(TestName());
+    std::vector<int64> input_dims = {1, 2, 2, 6};
+    std::vector<int64> filter_dims = {2, 2, 2, 12};
+    Shape input_shape = ShapeUtil::MakeShapeWithType<T>(input_dims);
+    Shape filter_shape = ShapeUtil::MakeShapeWithType<T>(filter_dims);
+    {
+      auto input = Parameter(&builder, 0, input_shape, "input");
+      auto filter = Parameter(&builder, 1, filter_shape, "filter");
+
+      // Tensorflow dimension numbers for 2D convolution.
+      ConvolutionDimensionNumbers dnums;
+      dnums.set_input_batch_dimension(0);
+      dnums.set_output_batch_dimension(0);
+      dnums.add_input_spatial_dimensions(1);
+      dnums.add_output_spatial_dimensions(1);
+      dnums.add_input_spatial_dimensions(2);
+      dnums.add_output_spatial_dimensions(2);
+      dnums.set_input_feature_dimension(3);
+      dnums.set_output_feature_dimension(3);
+      dnums.add_kernel_spatial_dimensions(0);
+      dnums.add_kernel_spatial_dimensions(1);
+      dnums.set_kernel_input_feature_dimension(2);
+      dnums.set_kernel_output_feature_dimension(3);
+
+      ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums,
+                                /*feature_group_count=*/3);
+    }
+
+    std::vector<T> input_elems(ShapeUtil::ElementsIn(input_shape));
+    iota_int_init_value(input_elems, 1);
+    auto input_r1 = LiteralUtil::CreateR1<T>(input_elems);
+    auto input_r4 = input_r1->Reshape(input_dims).ConsumeValueOrDie();
+
+    std::vector<T> filter_elems(ShapeUtil::ElementsIn(filter_shape));
+    iota_int_init_value(filter_elems, 1);
+    auto filter_r1 = LiteralUtil::CreateR1<T>(filter_elems);
+    auto filter_r4 = filter_r1->Reshape(filter_dims).ConsumeValueOrDie();
+
+    auto expected_r1 = LiteralUtil::CreateR1<T>(
+        {static_cast<T>(5076), static_cast<T>(5160), static_cast<T>(5244),
+         static_cast<T>(5328), static_cast<T>(6164), static_cast<T>(6264),
+         static_cast<T>(6364), static_cast<T>(6464), static_cast<T>(7380),
+         static_cast<T>(7496), static_cast<T>(7612), static_cast<T>(7728)});
+    auto expected_r4 = expected_r1->Reshape({1, 1, 1, 12}).ConsumeValueOrDie();
+
+    auto input_literal =
+        client_->TransferToServer(*input_r4).ConsumeValueOrDie();
+    auto filter_literal =
+        client_->TransferToServer(*filter_r4).ConsumeValueOrDie();
+
+    ComputeAndCompareLiteral(&builder, *expected_r4,
+                             {input_literal.get(), filter_literal.get()},
+                             error_spec_);
+  }
+};
+
+TYPED_TEST_CASE(Convolve2D_1x2x2x6_2x2x1x12_Grouped_Valid, TestTypes);
+TYPED_TEST(Convolve2D_1x2x2x6_2x2x1x12_Grouped_Valid, Types) {
+  this->RunTest();
+}
 
 // Test fixture to run convolution tests with and without convolution
 // canonicalization enabled.
@@ -765,5 +897,44 @@
                      std::move(*LiteralUtil::CreateFromArray(filter_data))});
 }
 
+class ConvolutionHloTest : public HloTestBase {};
+
+XLA_TEST_F(ConvolutionHloTest, DISABLED_ON_CPU(ConvolveF64Forward)) {
+  constexpr char kHlo[] = R"(
+HloModule TestModule
+
+ENTRY Test {
+  %arg0 = f64[3,56,56,16] parameter(0)
+  %arg1 = f64[3,3,3,64] parameter(1)
+  ROOT %conv = f64[54,54,16,64] convolution(%arg0, %arg1), window={size=3x3}, dim_labels=f01b_i01o->01bf
+})";
+  EXPECT_TRUE(RunAndCompare(kHlo, ErrorSpec{0.001}));
+}
+
+XLA_TEST_F(ConvolutionHloTest, DISABLED_ON_CPU(ConvolveF64BackwardFilter)) {
+  constexpr char kHlo[] = R"(
+HloModule TestModule
+
+ENTRY Test {
+  %arg0 = f64[2,5,8,1] parameter(0)
+  %arg1 = f64[2,5,8,2] parameter(1)
+  ROOT %conv = f64[4,4,1,2] convolution(%arg0, %arg1), window={size=5x8 pad=1_2x1_2}, dim_labels=f01b_i01o->01bf
+})";
+  EXPECT_TRUE(RunAndCompare(kHlo, ErrorSpec{0.001}));
+}
+
+XLA_TEST_F(ConvolutionHloTest, DISABLED_ON_CPU(ConvolveF64BackwardInput)) {
+  constexpr char kHlo[] = R"(
+HloModule TestModule
+
+ENTRY Test {
+  %output = f64[4,5,16,16] parameter(0)
+  %kernel = f64[5,3,7,7] parameter(1)
+  %reverse = f64[5,3,7,7] reverse(f64[5,3,7,7] %kernel), dimensions={2,3}
+  ROOT %convolution = f64[4,3,16,16] convolution(%output, %reverse), window={size=7x7 pad=3_3x3_3}, dim_labels=bf01_io01->bf01
+})";
+  EXPECT_TRUE(RunAndCompare(kHlo, ErrorSpec{0.001}));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/gather_operation_test.cc b/tensorflow/compiler/xla/tests/gather_operation_test.cc
index b77bece..f866ed6 100644
--- a/tensorflow/compiler/xla/tests/gather_operation_test.cc
+++ b/tensorflow/compiler/xla/tests/gather_operation_test.cc
@@ -30,8 +30,8 @@
 class GatherOperationTest : public HloTestBase {
  protected:
   void RunTest(const string& hlo_text, Literal* operand,
-               Literal* gather_indices) {
-    RunTest(hlo_text, {operand, gather_indices});
+               Literal* start_indices) {
+    RunTest(hlo_text, {operand, start_indices});
   }
 
   void RunTest(const string& hlo_text,
@@ -52,18 +52,17 @@
   operand = s32[3,3] parameter(0)
   indices = s32[2] parameter(1)
   ROOT gather = s32[2,3] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=1,
-      window_bounds={1, 3}
+      slice_sizes={1, 3}
 }
 )";
   std::unique_ptr<Literal> operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
-  std::unique_ptr<Literal> gather_indices =
-      LiteralUtil::CreateR1<int32>({0, 2});
-  RunTest(hlo_text, operand.get(), gather_indices.get());
+  std::unique_ptr<Literal> start_indices = LiteralUtil::CreateR1<int32>({0, 2});
+  RunTest(hlo_text, operand.get(), start_indices.get());
 }
 
 XLA_TEST_F(GatherOperationTest, TensorFlowGatherV2) {
@@ -74,18 +73,17 @@
   operand = s32[3,3] parameter(0)
   indices = s32[2] parameter(1)
   ROOT gather = s32[3,2] gather(operand, indices),
-      output_window_dims={0},
-      elided_window_dims={1},
-      gather_dims_to_operand_dims={1},
+      offset_dims={0},
+      collapsed_slice_dims={1},
+      start_index_map={1},
       index_vector_dim=1,
-      window_bounds={3, 1}
+      slice_sizes={3, 1}
 }
 )";
   std::unique_ptr<Literal> operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
-  std::unique_ptr<Literal> gather_indices =
-      LiteralUtil::CreateR1<int32>({0, 2});
-  RunTest(hlo_text, operand.get(), gather_indices.get());
+  std::unique_ptr<Literal> start_indices = LiteralUtil::CreateR1<int32>({0, 2});
+  RunTest(hlo_text, operand.get(), start_indices.get());
 }
 
 XLA_TEST_F(GatherOperationTest, TensorFlowGatherMultipleBatchDims) {
@@ -96,18 +94,18 @@
   operand = s32[3,3] parameter(0)
   indices = s32[2,2] parameter(1)
   ROOT gather = s32[2,3,2] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={1},
-      gather_dims_to_operand_dims={1},
+      offset_dims={1},
+      collapsed_slice_dims={1},
+      start_index_map={1},
       index_vector_dim=2,
-      window_bounds={3, 1}
+      slice_sizes={3, 1}
 }
 )";
   std::unique_ptr<Literal> operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
-  std::unique_ptr<Literal> gather_indices =
+  std::unique_ptr<Literal> start_indices =
       LiteralUtil::CreateR2<int32>({{0, 2}, {2, 1}});
-  RunTest(hlo_text, operand.get(), gather_indices.get());
+  RunTest(hlo_text, operand.get(), start_indices.get());
 }
 
 XLA_TEST_F(GatherOperationTest, TensorFlowGatherNdMultipleBatchDims_0) {
@@ -118,18 +116,18 @@
   operand = s32[3,3] parameter(0)
   indices = s32[2,2,2] parameter(1)
   ROOT gather = s32[2,2] gather(operand, indices),
-      output_window_dims={},
-      elided_window_dims={0,1},
-      gather_dims_to_operand_dims={0,1},
+      offset_dims={},
+      collapsed_slice_dims={0,1},
+      start_index_map={0,1},
       index_vector_dim=2,
-      window_bounds={1, 1}
+      slice_sizes={1, 1}
 }
 )";
   std::unique_ptr<Literal> operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
-  std::unique_ptr<Literal> gather_indices =
+  std::unique_ptr<Literal> start_indices =
       LiteralUtil::CreateR3<int32>({{{0, 2}, {2, 1}}, {{1, 2}, {2, 0}}});
-  RunTest(hlo_text, operand.get(), gather_indices.get());
+  RunTest(hlo_text, operand.get(), start_indices.get());
 }
 
 XLA_TEST_F(GatherOperationTest, TensorFlowGatherNdMultipleBatchDims_1) {
@@ -140,18 +138,18 @@
   operand = s32[3,3] parameter(0)
   indices = s32[2,2,2] parameter(1)
   ROOT gather = s32[2,1,1,2] gather(operand, indices),
-      output_window_dims={1,2},
-      elided_window_dims={},
-      gather_dims_to_operand_dims={0,1},
+      offset_dims={1,2},
+      collapsed_slice_dims={},
+      start_index_map={0,1},
       index_vector_dim=2,
-      window_bounds={1, 1}
+      slice_sizes={1, 1}
 }
 )";
   std::unique_ptr<Literal> operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
-  std::unique_ptr<Literal> gather_indices =
+  std::unique_ptr<Literal> start_indices =
       LiteralUtil::CreateR3<int32>({{{0, 2}, {2, 1}}, {{1, 2}, {2, 0}}});
-  RunTest(hlo_text, operand.get(), gather_indices.get());
+  RunTest(hlo_text, operand.get(), start_indices.get());
 }
 
 XLA_TEST_F(GatherOperationTest, TensorFlowGatherNd) {
@@ -162,20 +160,20 @@
   operand = s32[3,3,2] parameter(0)
   indices = s32[2,2] parameter(1)
   ROOT gather = s32[2,2] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0,1},
-      gather_dims_to_operand_dims={0,1},
+      offset_dims={1},
+      collapsed_slice_dims={0,1},
+      start_index_map={0,1},
       index_vector_dim=1,
-      window_bounds={1,1,2}
+      slice_sizes={1,1,2}
 }
 )";
   std::unique_ptr<Literal> operand =
       LiteralUtil::CreateR3<int32>({{{-1, 1}, {-2, 2}, {-3, 3}},  //
                                     {{-4, 4}, {-5, 5}, {-6, 6}},  //
                                     {{-7, 7}, {-8, 8}, {-9, 9}}});
-  std::unique_ptr<Literal> gather_indices =
+  std::unique_ptr<Literal> start_indices =
       LiteralUtil::CreateR2<int32>({{0, 0}, {1, 0}});
-  RunTest(hlo_text, operand.get(), gather_indices.get());
+  RunTest(hlo_text, operand.get(), start_indices.get());
 }
 
 XLA_TEST_F(GatherOperationTest, TensorFlowGatherNdNonDefaultIndexVectorDim) {
@@ -186,20 +184,20 @@
   operand = s32[3,3,2] parameter(0)
   indices = s32[2,2] parameter(1)
   ROOT gather = s32[2,2] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0,1},
-      gather_dims_to_operand_dims={0,1},
+      offset_dims={1},
+      collapsed_slice_dims={0,1},
+      start_index_map={0,1},
       index_vector_dim=0,
-      window_bounds={1,1,2}
+      slice_sizes={1,1,2}
 }
 )";
   std::unique_ptr<Literal> operand =
       LiteralUtil::CreateR3<int32>({{{-1, 1}, {-2, 2}, {-3, 3}},  //
                                     {{-4, 4}, {-5, 5}, {-6, 6}},  //
                                     {{-7, 7}, {-8, 8}, {-9, 9}}});
-  std::unique_ptr<Literal> gather_indices =
+  std::unique_ptr<Literal> start_indices =
       LiteralUtil::CreateR2<int32>({{0, 0}, {1, 0}});
-  RunTest(hlo_text, operand.get(), gather_indices.get());
+  RunTest(hlo_text, operand.get(), start_indices.get());
 }
 
 XLA_TEST_F(GatherOperationTest, DynamicSlice) {
@@ -210,18 +208,17 @@
   operand = s32[3,3] parameter(0)
   indices = s32[2] parameter(1)
   ROOT gather = s32[1,1] gather(operand, indices),
-      output_window_dims={0,1},
-      elided_window_dims={},
-      gather_dims_to_operand_dims={0,1},
+      offset_dims={0,1},
+      collapsed_slice_dims={},
+      start_index_map={0,1},
       index_vector_dim=0,
-      window_bounds={1,1}
+      slice_sizes={1,1}
 }
 )";
   std::unique_ptr<Literal> operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
-  std::unique_ptr<Literal> gather_indices =
-      LiteralUtil::CreateR1<int32>({1, 1});
-  RunTest(hlo_text, operand.get(), gather_indices.get());
+  std::unique_ptr<Literal> start_indices = LiteralUtil::CreateR1<int32>({1, 1});
+  RunTest(hlo_text, operand.get(), start_indices.get());
 }
 
 XLA_TEST_F(GatherOperationTest, BatchDynamicSlice) {
@@ -232,18 +229,18 @@
   operand = s32[3,3] parameter(0)
   indices = s32[2,2] parameter(1)
   ROOT gather = s32[2,1,1] gather(operand, indices),
-      output_window_dims={1,2},
-      elided_window_dims={},
-      gather_dims_to_operand_dims={0,1},
+      offset_dims={1,2},
+      collapsed_slice_dims={},
+      start_index_map={0,1},
       index_vector_dim=0,
-      window_bounds={1,1}
+      slice_sizes={1,1}
 }
 )";
   std::unique_ptr<Literal> operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
-  std::unique_ptr<Literal> gather_indices =
+  std::unique_ptr<Literal> start_indices =
       LiteralUtil::CreateR2<int32>({{2, 1}, {1, 1}});
-  RunTest(hlo_text, operand.get(), gather_indices.get());
+  RunTest(hlo_text, operand.get(), start_indices.get());
 }
 
 XLA_TEST_F(GatherOperationTest, ZeroDimBounds) {
@@ -254,17 +251,16 @@
   operand = s32[3,0] parameter(0)
   indices = s32[2] parameter(1)
   ROOT gather = s32[2,0] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=1,
-      window_bounds={1, 0}
+      slice_sizes={1, 0}
 }
 )";
   std::unique_ptr<Literal> operand = LiteralUtil::CreateR2<int32>({{}, {}, {}});
-  std::unique_ptr<Literal> gather_indices =
-      LiteralUtil::CreateR1<int32>({0, 2});
-  RunTest(hlo_text, operand.get(), gather_indices.get());
+  std::unique_ptr<Literal> start_indices = LiteralUtil::CreateR1<int32>({0, 2});
+  RunTest(hlo_text, operand.get(), start_indices.get());
 }
 
 XLA_TEST_F(GatherOperationTest, OutOfBoundsIndex) {
@@ -278,19 +274,19 @@
   operand = s32[3,3]{1,0} parameter(0)
   indices = s32[6,2]{1,0} parameter(1)
   gather = s32[6,1,1]{2,1,0} gather(operand, indices),
-      output_window_dims={1,2},
-      elided_window_dims={},
-      gather_dims_to_operand_dims={0,1},
+      offset_dims={1,2},
+      collapsed_slice_dims={},
+      start_index_map={0,1},
       index_vector_dim=1,
-      window_bounds={1,1}
+      slice_sizes={1,1}
   ROOT result = s32[6]{0} reshape(gather)
 }
 )";
   std::unique_ptr<Literal> operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
-  std::unique_ptr<Literal> gather_indices = LiteralUtil::CreateR2<int32>(
+  std::unique_ptr<Literal> start_indices = LiteralUtil::CreateR2<int32>(
       {{2, 7}, {2, 1}, {1, 1}, {5, 1}, {2147483647, 1}, {1, 2}});
-  RunTest(hlo_text, operand.get(), gather_indices.get());
+  RunTest(hlo_text, operand.get(), start_indices.get());
 }
 
 XLA_TEST_F(GatherOperationTest, OutOfBoundsUnsignedIndex) {
@@ -304,19 +300,19 @@
   operand = s32[3,3]{1,0} parameter(0)
   indices = u32[6,2]{1,0} parameter(1)
   gather = s32[6,1,1]{2,1,0} gather(operand, indices),
-      output_window_dims={1,2},
-      elided_window_dims={},
-      gather_dims_to_operand_dims={0,1},
+      offset_dims={1,2},
+      collapsed_slice_dims={},
+      start_index_map={0,1},
       index_vector_dim=1,
-      window_bounds={1,1}
+      slice_sizes={1,1}
   ROOT result = s32[6]{0} reshape(gather)
 }
 )";
   std::unique_ptr<Literal> operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
-  std::unique_ptr<Literal> gather_indices = LiteralUtil::CreateR2<uint32>(
+  std::unique_ptr<Literal> start_indices = LiteralUtil::CreateR2<uint32>(
       {{2, 7}, {2, 1}, {1, 1}, {5, 1}, {2147483648u, 1}, {1, 2}});
-  RunTest(hlo_text, operand.get(), gather_indices.get());
+  RunTest(hlo_text, operand.get(), start_indices.get());
 }
 
 XLA_TEST_F(GatherOperationTest, NegativeIndex) {
@@ -330,19 +326,19 @@
   operand = s32[3,3]{1,0} parameter(0)
   indices = s32[6,2]{1,0} parameter(1)
   gather = s32[6,1,1]{2,1,0} gather(operand, indices),
-      output_window_dims={1,2},
-      elided_window_dims={},
-      gather_dims_to_operand_dims={0,1},
+      offset_dims={1,2},
+      collapsed_slice_dims={},
+      start_index_map={0,1},
       index_vector_dim=1,
-      window_bounds={1,1}
+      slice_sizes={1,1}
   ROOT result = s32[6]{0} reshape(gather)
 }
 )";
   std::unique_ptr<Literal> operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
-  std::unique_ptr<Literal> gather_indices = LiteralUtil::CreateR2<int32>(
+  std::unique_ptr<Literal> start_indices = LiteralUtil::CreateR2<int32>(
       {{2, -1}, {2, 1}, {1, 1}, {-500, 1}, {-2147483648, 1}, {1, 2}});
-  RunTest(hlo_text, operand.get(), gather_indices.get());
+  RunTest(hlo_text, operand.get(), start_indices.get());
 }
 
 XLA_TEST_F(GatherOperationTest, NegativeIndexIntoUnsignedOperand) {
@@ -356,19 +352,19 @@
   operand = u32[3,3]{1,0} parameter(0)
   indices = s32[6,2]{1,0} parameter(1)
   gather = u32[6,1,1]{2,1,0} gather(operand, indices),
-      output_window_dims={1,2},
-      elided_window_dims={},
-      gather_dims_to_operand_dims={0,1},
+      offset_dims={1,2},
+      collapsed_slice_dims={},
+      start_index_map={0,1},
       index_vector_dim=1,
-      window_bounds={1,1}
+      slice_sizes={1,1}
   ROOT result = u32[6]{0} reshape(gather)
 }
 )";
   std::unique_ptr<Literal> operand =
       LiteralUtil::CreateR2<uint32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
-  std::unique_ptr<Literal> gather_indices = LiteralUtil::CreateR2<int32>(
+  std::unique_ptr<Literal> start_indices = LiteralUtil::CreateR2<int32>(
       {{2, -1}, {2, 1}, {1, 1}, {-500, 1}, {-2147483648, 1}, {1, 2}});
-  RunTest(hlo_text, operand.get(), gather_indices.get());
+  RunTest(hlo_text, operand.get(), start_indices.get());
 }
 
 XLA_TEST_F(GatherOperationTest, OneScalarIndex) {
@@ -379,17 +375,17 @@
   operand = s32[2,3,2]{2,1,0} parameter(0)
   index = s32[] parameter(1)
   ROOT gather = s32[1,3,2]{2,1,0} gather(operand, index),
-      output_window_dims={0,1,2},
-      elided_window_dims={},
-      gather_dims_to_operand_dims={0},
+      offset_dims={0,1,2},
+      collapsed_slice_dims={},
+      start_index_map={0},
       index_vector_dim=0,
-      window_bounds={1,3,2}
+      slice_sizes={1,3,2}
 }
 )";
   std::unique_ptr<Literal> operand = LiteralUtil::CreateR3<int32>(
       {{{1, 2}, {3, 4}, {5, 6}}, {{7, 8}, {9, 10}, {11, 12}}});
-  std::unique_ptr<Literal> gather_indices = LiteralUtil::CreateR0<int32>(1);
-  RunTest(hlo_text, operand.get(), gather_indices.get());
+  std::unique_ptr<Literal> start_indices = LiteralUtil::CreateR0<int32>(1);
+  RunTest(hlo_text, operand.get(), start_indices.get());
 }
 
 XLA_TEST_F(GatherOperationTest, ScalarResult) {
@@ -400,16 +396,16 @@
   operand = s32[4]{0} parameter(0)
   index = s32[] parameter(1)
   ROOT gather = s32[] gather(operand, index),
-      output_window_dims={},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=0,
-      window_bounds={1}
+      slice_sizes={1}
 }
 )";
   std::unique_ptr<Literal> operand = LiteralUtil::CreateR1<int32>({1, 2, 3, 4});
-  std::unique_ptr<Literal> gather_indices = LiteralUtil::CreateR0<int32>(1);
-  RunTest(hlo_text, operand.get(), gather_indices.get());
+  std::unique_ptr<Literal> start_indices = LiteralUtil::CreateR0<int32>(1);
+  RunTest(hlo_text, operand.get(), start_indices.get());
 }
 
 XLA_TEST_F(GatherOperationTest, ZeroSizedResult) {
@@ -420,17 +416,17 @@
   operand = s32[3,3] parameter(0)
   indices = s32[0] parameter(1)
   ROOT gather = s32[0,3] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=1,
-      window_bounds={1, 3}
+      slice_sizes={1, 3}
 }
 )";
   std::unique_ptr<Literal> operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
-  std::unique_ptr<Literal> gather_indices = LiteralUtil::CreateR1<int32>({});
-  RunTest(hlo_text, operand.get(), gather_indices.get());
+  std::unique_ptr<Literal> start_indices = LiteralUtil::CreateR1<int32>({});
+  RunTest(hlo_text, operand.get(), start_indices.get());
 }
 
 XLA_TEST_F(GatherOperationTest, FusedTensorFlowGatherV2) {
@@ -441,11 +437,11 @@
   operand = s32[3,3] parameter(0)
   indices = s32[2] parameter(1)
   gather = s32[3,2] gather(operand, indices),
-      output_window_dims={0},
-      elided_window_dims={1},
-      gather_dims_to_operand_dims={1},
+      offset_dims={0},
+      collapsed_slice_dims={1},
+      start_index_map={1},
       index_vector_dim=1,
-      window_bounds={3, 1}
+      slice_sizes={3, 1}
   one = s32[] constant(1)
   one_broadcasted = s32[3,2] broadcast(one), dimensions={}
   ROOT result = s32[3,2]{1,0} add(gather, one_broadcasted)
@@ -453,9 +449,8 @@
 )";
   std::unique_ptr<Literal> operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
-  std::unique_ptr<Literal> gather_indices =
-      LiteralUtil::CreateR1<int32>({0, 2});
-  RunTest(hlo_text, operand.get(), gather_indices.get());
+  std::unique_ptr<Literal> start_indices = LiteralUtil::CreateR1<int32>({0, 2});
+  RunTest(hlo_text, operand.get(), start_indices.get());
 }
 
 XLA_TEST_F(GatherOperationTest, FusedTensorFlowGatherMultipleBatchDims) {
@@ -466,11 +461,11 @@
   operand = s32[3,3] parameter(0)
   indices = s32[2,2] parameter(1)
   gather = s32[2,3,2] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={1},
-      gather_dims_to_operand_dims={1},
+      offset_dims={1},
+      collapsed_slice_dims={1},
+      start_index_map={1},
       index_vector_dim=2,
-      window_bounds={3, 1}
+      slice_sizes={3, 1}
   one = s32[] constant(1)
   one_broadcasted = s32[2,3,2] broadcast(one), dimensions={}
   ROOT result = s32[2,3,2]{2,1,0} add(gather, one_broadcasted)
@@ -478,9 +473,9 @@
 )";
   std::unique_ptr<Literal> operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
-  std::unique_ptr<Literal> gather_indices =
+  std::unique_ptr<Literal> start_indices =
       LiteralUtil::CreateR2<int32>({{0, 2}, {2, 1}});
-  RunTest(hlo_text, operand.get(), gather_indices.get());
+  RunTest(hlo_text, operand.get(), start_indices.get());
 }
 
 XLA_TEST_F(GatherOperationTest, FusedTensorFlowGatherNdMultipleBatchDims) {
@@ -491,11 +486,11 @@
   operand = s32[3,3] parameter(0)
   indices = s32[2,2,2] parameter(1)
   gather = s32[2,2] gather(operand, indices),
-      output_window_dims={},
-      elided_window_dims={0,1},
-      gather_dims_to_operand_dims={0,1},
+      offset_dims={},
+      collapsed_slice_dims={0,1},
+      start_index_map={0,1},
       index_vector_dim=2,
-      window_bounds={1, 1}
+      slice_sizes={1, 1}
   one = s32[] constant(1)
   one_broadcasted = s32[2,2] broadcast(one), dimensions={}
   ROOT result = s32[2,2]{1,0} add(gather, one_broadcasted)
@@ -503,9 +498,9 @@
 )";
   std::unique_ptr<Literal> operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
-  std::unique_ptr<Literal> gather_indices =
+  std::unique_ptr<Literal> start_indices =
       LiteralUtil::CreateR3<int32>({{{0, 2}, {2, 1}}, {{1, 2}, {2, 0}}});
-  RunTest(hlo_text, operand.get(), gather_indices.get());
+  RunTest(hlo_text, operand.get(), start_indices.get());
 }
 
 XLA_TEST_F(GatherOperationTest, FusedTensorFlowGatherNd) {
@@ -516,11 +511,11 @@
   operand = s32[3,3,2] parameter(0)
   indices = s32[2,2] parameter(1)
   gather = s32[2,2] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0,1},
-      gather_dims_to_operand_dims={0,1},
+      offset_dims={1},
+      collapsed_slice_dims={0,1},
+      start_index_map={0,1},
       index_vector_dim=1,
-      window_bounds={1,1,2}
+      slice_sizes={1,1,2}
   one = s32[] constant(1)
   one_broadcasted = s32[2,2] broadcast(one), dimensions={}
   ROOT result = s32[2,2]{1,0} add(gather, one_broadcasted)
@@ -530,9 +525,9 @@
       LiteralUtil::CreateR3<int32>({{{-1, 1}, {-2, 2}, {-3, 3}},  //
                                     {{-4, 4}, {-5, 5}, {-6, 6}},  //
                                     {{-7, 7}, {-8, 8}, {-9, 9}}});
-  std::unique_ptr<Literal> gather_indices =
+  std::unique_ptr<Literal> start_indices =
       LiteralUtil::CreateR2<int32>({{0, 0}, {1, 0}});
-  RunTest(hlo_text, operand.get(), gather_indices.get());
+  RunTest(hlo_text, operand.get(), start_indices.get());
 }
 
 XLA_TEST_F(GatherOperationTest,
@@ -544,11 +539,11 @@
   operand = s32[3,3,2] parameter(0)
   indices = s32[2,2] parameter(1)
   gather = s32[2,2] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0,1},
-      gather_dims_to_operand_dims={0,1},
+      offset_dims={1},
+      collapsed_slice_dims={0,1},
+      start_index_map={0,1},
       index_vector_dim=0,
-      window_bounds={1,1,2}
+      slice_sizes={1,1,2}
   one = s32[] constant(1)
   one_broadcasted = s32[2,2] broadcast(one), dimensions={}
   ROOT result = s32[2,2]{1,0} add(gather, one_broadcasted)
@@ -558,9 +553,9 @@
       LiteralUtil::CreateR3<int32>({{{-1, 1}, {-2, 2}, {-3, 3}},  //
                                     {{-4, 4}, {-5, 5}, {-6, 6}},  //
                                     {{-7, 7}, {-8, 8}, {-9, 9}}});
-  std::unique_ptr<Literal> gather_indices =
+  std::unique_ptr<Literal> start_indices =
       LiteralUtil::CreateR2<int32>({{0, 0}, {1, 0}});
-  RunTest(hlo_text, operand.get(), gather_indices.get());
+  RunTest(hlo_text, operand.get(), start_indices.get());
 }
 
 XLA_TEST_F(GatherOperationTest, FusedDynamicSlice) {
@@ -571,11 +566,11 @@
   operand = s32[3,3] parameter(0)
   indices = s32[2] parameter(1)
   gather = s32[1,1] gather(operand, indices),
-      output_window_dims={0,1},
-      elided_window_dims={},
-      gather_dims_to_operand_dims={0,1},
+      offset_dims={0,1},
+      collapsed_slice_dims={},
+      start_index_map={0,1},
       index_vector_dim=0,
-      window_bounds={1,1}
+      slice_sizes={1,1}
   one = s32[] constant(1)
   one_broadcasted = s32[1,1] broadcast(one), dimensions={}
   ROOT result = s32[1,1]{1,0} add(gather, one_broadcasted)
@@ -583,9 +578,8 @@
 )";
   std::unique_ptr<Literal> operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
-  std::unique_ptr<Literal> gather_indices =
-      LiteralUtil::CreateR1<int32>({1, 1});
-  RunTest(hlo_text, operand.get(), gather_indices.get());
+  std::unique_ptr<Literal> start_indices = LiteralUtil::CreateR1<int32>({1, 1});
+  RunTest(hlo_text, operand.get(), start_indices.get());
 }
 
 XLA_TEST_F(GatherOperationTest, FusedBatchDynamicSlice) {
@@ -596,11 +590,11 @@
   operand = s32[3,3] parameter(0)
   indices = s32[2,2] parameter(1)
   gather = s32[2,1,1] gather(operand, indices),
-      output_window_dims={1,2},
-      elided_window_dims={},
-      gather_dims_to_operand_dims={0,1},
+      offset_dims={1,2},
+      collapsed_slice_dims={},
+      start_index_map={0,1},
       index_vector_dim=0,
-      window_bounds={1,1}
+      slice_sizes={1,1}
   one = s32[] constant(1)
   one_broadcasted = s32[2,1,1] broadcast(one), dimensions={}
   ROOT result = s32[2,1,1]{2,1,0} add(gather, one_broadcasted)
@@ -608,9 +602,9 @@
 )";
   std::unique_ptr<Literal> operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
-  std::unique_ptr<Literal> gather_indices =
+  std::unique_ptr<Literal> start_indices =
       LiteralUtil::CreateR2<int32>({{2, 1}, {1, 1}});
-  RunTest(hlo_text, operand.get(), gather_indices.get());
+  RunTest(hlo_text, operand.get(), start_indices.get());
 }
 
 class GatherClientLibraryTest : public ClientLibraryTestBase {};
@@ -622,11 +616,11 @@
   //   operand = s32[3,3] parameter(0)
   //   indices = s32[2] parameter(1)
   //   ROOT gather = s32[2,3] gather(operand, indices),
-  //       output_window_dims={1},
-  //       elided_window_dims={0},
-  //       gather_dims_to_operand_dims={0},
+  //       offset_dims={1},
+  //       collapsed_slice_dims={0},
+  //       start_index_map={0},
   //       index_vector_dim=1,
-  //       window_bounds={1, 3}
+  //       slice_sizes={1, 3}
   // }
 
   XlaBuilder builder("gather_basic");
@@ -637,9 +631,9 @@
   auto operand = Parameter(&builder, 0, operand_shape, "operand");
   auto indices = Parameter(&builder, 1, indices_shape, "indices");
   GatherDimensionNumbers dim_numbers;
-  dim_numbers.add_output_window_dims(1);
-  dim_numbers.add_elided_window_dims(0);
-  dim_numbers.add_gather_dims_to_operand_dims(0);
+  dim_numbers.add_offset_dims(1);
+  dim_numbers.add_collapsed_slice_dims(0);
+  dim_numbers.add_start_index_map(0);
   dim_numbers.set_index_vector_dim(1);
   Gather(operand, indices, dim_numbers, {1, 3});
 
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.cc b/tensorflow/compiler/xla/tests/hlo_test_base.cc
index b662e83..64e361f 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.cc
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.cc
@@ -23,9 +23,11 @@
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_utils.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -83,13 +85,16 @@
 
 }  // namespace
 
-HloTestBase::HloTestBase()
-    : HloTestBase(GetTestPlatform(), GetReferencePlatform()) {}
+HloTestBase::HloTestBase(bool allow_mixed_precision_in_hlo_verifier)
+    : HloTestBase(GetTestPlatform(), GetReferencePlatform(),
+                  allow_mixed_precision_in_hlo_verifier) {}
 
 HloTestBase::HloTestBase(se::Platform* test_platform,
-                         se::Platform* reference_platform)
+                         se::Platform* reference_platform,
+                         bool allow_mixed_precision_in_hlo_verifier)
     : test_runner_(test_platform), reference_runner_(reference_platform) {
-  hlo_verifier_ = MakeUnique<HloVerifier>(/*allow_mixed_precision=*/true);
+  hlo_verifier_ =
+      MakeUnique<HloVerifier>(allow_mixed_precision_in_hlo_verifier);
 }
 
 /* static */
@@ -97,7 +102,23 @@
   return MakeUnique<HloModule>(name, GetModuleConfigForTest());
 }
 
-/*static*/ DebugOptions HloTestBase::GetDebugOptionsForTest() {
+/* static */
+StatusOr<bool> HloTestBase::RunHloPass(HloPassInterface* hlo_pass,
+                                       HloModule* module) {
+  const string module_str_before_run = module->ToProto().ShortDebugString();
+  const auto status_or = hlo_pass->Run(module);
+  if (status_or.status().ok()) {
+    const string module_str_after_run = module->ToProto().ShortDebugString();
+    if (!status_or.ValueOrDie()) {
+      // Check that the proto remains same.
+      EXPECT_EQ(module_str_after_run, module_str_before_run);
+    }
+  }
+  return status_or;
+}
+
+/*static*/
+DebugOptions HloTestBase::GetDebugOptionsForTest() {
   auto debug_options = legacy_flags::GetDebugOptionsFromFlags();
   // TODO(b/38354253): Change tests to use Parameters instead of Constants.
   debug_options.add_xla_disable_hlo_passes("constant_folding");
@@ -233,6 +254,29 @@
                        reference_preprocessor);
 }
 
+::testing::AssertionResult HloTestBase::Run(const StringPiece hlo_string) {
+  auto module_or_status =
+      HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest());
+  if (!module_or_status.ok()) {
+    return ::testing::AssertionFailure()
+           << "Error while parsing HLO text format: "
+           << module_or_status.status().ToString();
+  }
+  const auto& fake_arguments =
+      MakeFakeArguments(module_or_status.ValueOrDie().get())
+          .ConsumeValueOrDie();
+  std::vector<Literal*> fake_argument_ptrs;
+  c_transform(
+      fake_arguments, std::back_inserter(fake_argument_ptrs),
+      [](const std::unique_ptr<Literal>& literal) { return literal.get(); });
+  return test_runner_
+                 .Execute(std::move(module_or_status.ValueOrDie()),
+                          fake_argument_ptrs, /*run_hlo_passes=*/true)
+                 .ok()
+             ? ::testing::AssertionSuccess()
+             : ::testing::AssertionFailure();
+}
+
 ::testing::AssertionResult HloTestBase::RunAndCompareFromFile(
     const string& filename, const tensorflow::gtl::optional<ErrorSpec>& error,
     const std::function<void(HloModule*)>& reference_preprocessor) {
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.h b/tensorflow/compiler/xla/tests/hlo_test_base.h
index 66719b1..c860c41 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.h
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.h
@@ -75,17 +75,24 @@
   static std::unique_ptr<HloModule> CreateNewModule(
       const string& name = TestName());
 
+  // Runs the hlo_pass with the provided module and returns the result. This
+  // function also verifies that the module remains unchanged when hlo_pass
+  // returns false as the StatusOr value.
+  static StatusOr<bool> RunHloPass(HloPassInterface* hlo_pass,
+                                   HloModule* module);
+
  protected:
   // This uses the interpreter backend as the reference backend and
   // automatically finds another supported backend as the test backend. If the
   // interpreter is the only supported backend, it will be both the test backend
   // and the reference backend.
-  HloTestBase();
+  HloTestBase(bool allow_mixed_precision_in_hlo_verifier = true);
 
   // If your test doesn't use interpreter as the reference backend, you can use
   // this constructor. Note that your test target is responsible for linking in
   // both needed backends.
-  HloTestBase(se::Platform* test_platform, se::Platform* reference_platform);
+  HloTestBase(se::Platform* test_platform, se::Platform* reference_platform,
+              bool allow_mixed_precision_in_hlo_verifier = true);
 
   ~HloTestBase() override {}
 
@@ -166,6 +173,8 @@
       const tensorflow::gtl::optional<ErrorSpec>& error,
       const std::function<void(HloModule*)>& reference_preprocessor = nullptr)
       TF_MUST_USE_RESULT;
+  ::testing::AssertionResult Run(const tensorflow::StringPiece hlo_string)
+      TF_MUST_USE_RESULT;
   ::testing::AssertionResult RunAndCompareFromFile(
       const string& filename, const tensorflow::gtl::optional<ErrorSpec>& error,
       const std::function<void(HloModule*)>& reference_preprocessor = nullptr)
diff --git a/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc b/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc
index e310966..60eb21a 100644
--- a/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc
+++ b/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc
@@ -92,10 +92,10 @@
   // It's lame to hard-code the buffer assignments, but we need
   // local_client_aot_test.cc to be able to easily invoke the function.
   CHECK_EQ(result->result_buffer_index(), 1);
-  CHECK_EQ(result->buffer_sizes().size(), 3);
-  CHECK_EQ(result->buffer_sizes()[0], -2);             // param buffer
-  CHECK_EQ(result->buffer_sizes()[1], sizeof(float));  // result buffer
-  CHECK_EQ(result->buffer_sizes()[2], -1);             // const buffer
+  CHECK_EQ(result->buffer_infos().size(), 3);
+  CHECK(result->buffer_infos()[0].is_entry_parameter());      // param buffer
+  CHECK_EQ(result->buffer_infos()[1].size(), sizeof(float));  // result buffer
+  CHECK(result->buffer_infos()[2].is_constant());             // const buffer
   if (triple.isOSBinFormatELF()) {
     // Check the ELF magic.
     CHECK_EQ(result->object_file_data()[0], 0x7F);
diff --git a/tensorflow/compiler/xla/tests/reduce_window_test.cc b/tensorflow/compiler/xla/tests/reduce_window_test.cc
index 1bd6fda..cae029f 100644
--- a/tensorflow/compiler/xla/tests/reduce_window_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_window_test.cc
@@ -1261,6 +1261,12 @@
      /*pad_low=*/{5},
      /*pad_high=*/{0},
      /*reducer=*/Reducer::kAdd},
+
+    {/*base_bounds=*/{4096}, /*window_bounds=*/{4096},
+     /*strides=*/{1},
+     /*pad_low=*/{4095},
+     /*pad_high=*/{0},
+     /*reducer=*/Reducer::kMax},
 };
 
 string R1ReduceWindowTestDataToString(
@@ -1341,7 +1347,7 @@
 // results on the interpreter backend.
 class ReduceWindowTextTest : public HloTestBase {};
 
-TEST_F(ReduceWindowTextTest, R2General256x384) {
+XLA_TEST_F(ReduceWindowTextTest, R2General256x384) {
   const string hlo_string = R"(
 HloModule R2Window
 mul {
@@ -1358,7 +1364,7 @@
   EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{0.001}));
 }
 
-TEST_F(ReduceWindowTextTest, R2General256x384Layout01) {
+XLA_TEST_F(ReduceWindowTextTest, R2General256x384Layout01) {
   const string hlo_string = R"(
 HloModule R2Window
 mul {
@@ -1375,7 +1381,7 @@
   EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{0.001}));
 }
 
-TEST_F(ReduceWindowTextTest, R2General2x5) {
+XLA_TEST_F(ReduceWindowTextTest, R2General2x5) {
   const string hlo_string = R"(
 HloModule R2Window
 mul {
@@ -1392,7 +1398,7 @@
   EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{0.001}));
 }
 
-TEST_F(ReduceWindowTextTest, R2EffectiveScalar) {
+XLA_TEST_F(ReduceWindowTextTest, R2EffectiveScalar) {
   const string hlo_string = R"(
 HloModule R2Window
 mul {
@@ -1410,7 +1416,7 @@
   EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{0.001}));
 }
 
-TEST_F(ReduceWindowTextTest, R3EffectiveScalar) {
+XLA_TEST_F(ReduceWindowTextTest, R3EffectiveScalar) {
   const string hlo_string = R"(
 HloModule R3Window
 mul {
@@ -1428,7 +1434,7 @@
   EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{0.001}));
 }
 
-TEST_F(HloTestBase, ReduceWindowIdentity) {
+XLA_TEST_F(HloTestBase, ReduceWindowIdentity) {
   const string hlo_string = R"(
 HloModule ReduceWindowIdentity
 identity.pad_to_reduce_window {
@@ -1445,7 +1451,7 @@
   EXPECT_TRUE(RunAndCompare(hlo_string, tensorflow::gtl::nullopt));
 }
 
-TEST_F(HloTestBase, ReduceWindowS32) {
+XLA_TEST_F(HloTestBase, ReduceWindowS32) {
   const string hlo_string = R"(
 HloModule reduce-window
 
@@ -1464,5 +1470,24 @@
   EXPECT_TRUE(RunAndCompare(hlo_string, tensorflow::gtl::nullopt));
 }
 
+XLA_TEST_F(HloTestBase, ReduceWindowF16) {
+  const string hlo_string = R"(
+HloModule reduce-window
+
+%identity.pad_to_reduce_window (param0: f16[], param1: f16[]) -> f16[] {
+  %param0 = f16[] parameter(0)
+  ROOT %param1 = f16[] parameter(1)
+}
+
+ENTRY %reduce-window (parameter.0: f16[81,8], parameter.1: f16[]) -> f16[82,8] {
+  %parameter.0 = f16[81,8]{1,0} parameter(0)
+  %parameter.1 = f16[] parameter(1)
+  ROOT %reduce-window = f16[82,8]{1,0} reduce-window(f16[81,8]{1,0} %parameter.0, f16[] %parameter.1), window={size=1x1 pad=0_1x0_0}, to_apply=%identity.pad_to_reduce_window
+}
+
+)";
+  EXPECT_TRUE(RunAndCompare(hlo_string, tensorflow::gtl::nullopt));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/scatter_test.cc b/tensorflow/compiler/xla/tests/scatter_test.cc
new file mode 100644
index 0000000..922d70b
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/scatter_test.cc
@@ -0,0 +1,615 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+
+namespace xla {
+namespace {
+
+using tensorflow::gtl::nullopt;
+
+class ScatterTest : public HloTestBase {
+ protected:
+  void RunTest(const string& hlo_text, Literal* operand,
+               Literal* scatter_indices, Literal* updates) {
+    RunTest(hlo_text, {operand, scatter_indices, updates});
+  }
+
+  void RunTest(const string& hlo_text,
+               tensorflow::gtl::ArraySlice<Literal*> args) {
+    HloModuleConfig config;
+    config.set_debug_options(GetDebugOptionsForTest());
+    TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                            ParseHloString(hlo_text, config));
+    EXPECT_TRUE(RunAndCompare(std::move(module), args, nullopt));
+  }
+};
+
+XLA_TEST_F(ScatterTest, TensorFlowScatterV1_Update) {
+  const string hlo_text = R"(
+HloModule TensorFlowScatterV1
+
+update_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  ROOT rhs = s32[] parameter(1)
+}
+
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[2] parameter(1)
+  updates = s32[2,3] parameter(2)
+  ROOT scatter = s32[3,3] scatter(operand, indices, updates),
+      to_apply=update_s32,
+      update_window_dims={1},
+      inserted_window_dims={0},
+      scatter_dims_to_operand_dims={0},
+      index_vector_dim=1
+}
+)";
+  std::unique_ptr<Literal> operand =
+      LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> scatter_indices =
+      LiteralUtil::CreateR1<int32>({0, 2});
+  std::unique_ptr<Literal> updates =
+      LiteralUtil::CreateR2<int32>({{10, 20, 30}, {70, 80, 90}});
+  RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get());
+}
+
+XLA_TEST_F(ScatterTest, TensorFlowScatterV2_Update) {
+  const char* hlo_text = R"(
+HloModule TensorFlowScatterV2
+
+update_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  ROOT rhs = s32[] parameter(1)
+}
+
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[2] parameter(1)
+  updates = s32[3,2] parameter(2)
+  ROOT scatter = s32[3,3] scatter(operand, indices, updates),
+      to_apply=update_s32,
+      update_window_dims={0},
+      inserted_window_dims={1},
+      scatter_dims_to_operand_dims={1},
+      index_vector_dim=1
+}
+)";
+  std::unique_ptr<Literal> operand =
+      LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> scatter_indices =
+      LiteralUtil::CreateR1<int32>({0, 2});
+  std::unique_ptr<Literal> updates =
+      LiteralUtil::CreateR2<int32>({{10, 30}, {40, 60}, {70, 90}});
+  RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get());
+}
+
+XLA_TEST_F(ScatterTest, TensorFlowScatter_Add) {
+  const string hlo_text = R"(
+HloModule TensorFlowScatter_Add
+
+add_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  rhs = s32[] parameter(1)
+  ROOT add = s32[] add(s32[] lhs, s32[] rhs)
+}
+
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[2] parameter(1)
+  updates = s32[2,3] parameter(2)
+  ROOT scatter = s32[3,3] scatter(operand, indices, updates),
+      to_apply=add_s32,
+      update_window_dims={1},
+      inserted_window_dims={0},
+      scatter_dims_to_operand_dims={0},
+      index_vector_dim=1
+}
+)";
+  std::unique_ptr<Literal> operand =
+      LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> scatter_indices =
+      LiteralUtil::CreateR1<int32>({0, 2});
+  std::unique_ptr<Literal> updates =
+      LiteralUtil::CreateR2<int32>({{10, 20, 30}, {70, 80, 90}});
+  RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get());
+}
+
+XLA_TEST_F(ScatterTest, TensorFlowScatter_Mul) {
+  const string hlo_text = R"(
+HloModule TensorFlowScatter_Mul
+
+mul_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  rhs = s32[] parameter(1)
+  ROOT mul = s32[] multiply(s32[] lhs, s32[] rhs)
+}
+
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[2] parameter(1)
+  updates = s32[2,3] parameter(2)
+  ROOT scatter = s32[3,3] scatter(operand, indices, updates),
+      to_apply=mul_s32,
+      update_window_dims={1},
+      inserted_window_dims={0},
+      scatter_dims_to_operand_dims={0},
+      index_vector_dim=1
+}
+)";
+  std::unique_ptr<Literal> operand =
+      LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> scatter_indices =
+      LiteralUtil::CreateR1<int32>({0, 2});
+  std::unique_ptr<Literal> updates =
+      LiteralUtil::CreateR2<int32>({{10, 20, 30}, {70, 80, 90}});
+  RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get());
+}
+
+XLA_TEST_F(ScatterTest, TensorFlowScatter_F32) {
+  const string hlo_text = R"(
+HloModule TensorFlowScatter_F32
+
+add_f32 (lhs: f32[], rhs: f32[]) -> f32[] {
+  lhs = f32[] parameter(0)
+  rhs = f32[] parameter(1)
+  ROOT add = f32[] add(f32[] lhs, f32[] rhs)
+}
+
+ENTRY main {
+  operand = f32[3,3] parameter(0)
+  indices = s32[2] parameter(1)
+  updates = f32[2,3] parameter(2)
+  ROOT scatter = f32[3,3] scatter(operand, indices, updates),
+      to_apply=add_f32,
+      update_window_dims={1},
+      inserted_window_dims={0},
+      scatter_dims_to_operand_dims={0},
+      index_vector_dim=1
+}
+)";
+  std::unique_ptr<Literal> operand = LiteralUtil::CreateR2<float>(
+      {{1.1, 2.2, 3.3}, {4.4, 5.5, 6.6}, {7.7, 8.8, 9.9}});
+  std::unique_ptr<Literal> scatter_indices =
+      LiteralUtil::CreateR1<int32>({2, 1});
+  std::unique_ptr<Literal> updates =
+      LiteralUtil::CreateR2<float>({{0.4, 1.1, 0.7}, {2.3, 3.1, 1.6}});
+  RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get());
+}
+
+XLA_TEST_F(ScatterTest, TensorFlowScatter_RepeatedIndices) {
+  const char* hlo_text = R"(
+HloModule TensorFlowScatter
+
+add_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  rhs = s32[] parameter(1)
+  ROOT add = s32[] add(s32[] lhs, s32[] rhs)
+}
+
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[2] parameter(1)
+  updates = s32[2,3] parameter(2)
+  ROOT scatter = s32[3,3] scatter(operand, indices, updates),
+      to_apply=add_s32,
+      update_window_dims={1},
+      inserted_window_dims={0},
+      scatter_dims_to_operand_dims={0},
+      index_vector_dim=1
+}
+)";
+  std::unique_ptr<Literal> operand =
+      LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> scatter_indices =
+      LiteralUtil::CreateR1<int32>({1, 1});
+  std::unique_ptr<Literal> updates =
+      LiteralUtil::CreateR2<int32>({{10, 20, 30}, {70, 80, 90}});
+  RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get());
+}
+
+XLA_TEST_F(ScatterTest, TensorFlowScatter_MultipleBatchDims) {
+  const char* hlo_text = R"(
+HloModule TensorFlowScatterMultipleBatchDims
+
+add_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  rhs = s32[] parameter(1)
+  ROOT add = s32[] add(s32[] lhs, s32[] rhs)
+}
+
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[2,2] parameter(1)
+  updates = s32[2,3,2] parameter(2)
+  ROOT scatter = s32[3,3] scatter(operand, indices, updates),
+      to_apply=add_s32,
+      update_window_dims={1},
+      inserted_window_dims={1},
+      scatter_dims_to_operand_dims={1},
+      index_vector_dim=2
+}
+)";
+  std::unique_ptr<Literal> operand =
+      LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> scatter_indices =
+      LiteralUtil::CreateR2<int32>({{0, 2}, {2, 1}});
+  std::unique_ptr<Literal> updates = LiteralUtil::CreateR3<int32>(
+      {{{10, 30}, {40, 60}, {70, 90}}, {{5, 5}, {5, 5}, {5, 5}}});
+  RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get());
+}
+
+XLA_TEST_F(ScatterTest, TensorFlowScatterNd) {
+  const char* hlo_text = R"(
+HloModule TensorFlowScatterNd
+
+update_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  ROOT rhs = s32[] parameter(1)
+}
+
+ENTRY main {
+  operand = s32[3,3,2] parameter(0)
+  indices = s32[2,2] parameter(1)
+  updates = s32[2,2] parameter(2)
+  ROOT scatter = s32[3,3,2] scatter(operand, indices, updates),
+      to_apply=update_s32,
+      update_window_dims={1},
+      inserted_window_dims={0,1},
+      scatter_dims_to_operand_dims={0,1},
+      index_vector_dim=1
+}
+)";
+  std::unique_ptr<Literal> operand =
+      LiteralUtil::CreateR3<int32>({{{-1, 1}, {-2, 2}, {-3, 3}},  //
+                                    {{-4, 4}, {-5, 5}, {-6, 6}},  //
+                                    {{-7, 7}, {-8, 8}, {-9, 9}}});
+  std::unique_ptr<Literal> scatter_indices =
+      LiteralUtil::CreateR2<int32>({{0, 0}, {1, 0}});
+  std::unique_ptr<Literal> updates =
+      LiteralUtil::CreateR2<int32>({{-10, 10}, {-40, 40}});
+  RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get());
+}
+
+XLA_TEST_F(ScatterTest, TensorFlowScatterNd_NonDefaultIndexVectorDim) {
+  const char* hlo_text = R"(
+HloModule TensorFlowScatterNdNonDefaultIndexVectorDim
+
+update_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  ROOT rhs = s32[] parameter(1)
+}
+
+ENTRY main {
+  operand = s32[3,3,2] parameter(0)
+  indices = s32[2,2] parameter(1)
+  updates = s32[2,2] parameter(2)
+  ROOT scatter = s32[3,3,2] scatter(operand, indices, updates),
+      to_apply=update_s32,
+      update_window_dims={1},
+      inserted_window_dims={0,1},
+      scatter_dims_to_operand_dims={0,1},
+      index_vector_dim=0
+}
+)";
+  std::unique_ptr<Literal> operand =
+      LiteralUtil::CreateR3<int32>({{{-1, 1}, {-2, 2}, {-3, 3}},  //
+                                    {{-4, 4}, {-5, 5}, {-6, 6}},  //
+                                    {{-7, 7}, {-8, 8}, {-9, 9}}});
+  std::unique_ptr<Literal> scatter_indices =
+      LiteralUtil::CreateR2<int32>({{0, 0}, {1, 0}});
+  std::unique_ptr<Literal> updates =
+      LiteralUtil::CreateR2<int32>({{-10, 10}, {-20, 20}});
+  RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get());
+}
+
+XLA_TEST_F(ScatterTest, DynamicUpdateSlice) {
+  const char* hlo_text = R"(
+HloModule DynamicUpdateSlice
+
+update_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  ROOT rhs = s32[] parameter(1)
+}
+
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[2] parameter(1)
+  updates = s32[1,1] parameter(2)
+  ROOT scatter = s32[3,3] scatter(operand, indices, updates),
+      to_apply=update_s32,
+      update_window_dims={0,1},
+      inserted_window_dims={},
+      scatter_dims_to_operand_dims={0,1},
+      index_vector_dim=0
+}
+)";
+  std::unique_ptr<Literal> operand =
+      LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> scatter_indices =
+      LiteralUtil::CreateR1<int32>({1, 1});
+  std::unique_ptr<Literal> updates = LiteralUtil::CreateR2<int32>({{10}});
+  RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get());
+}
+
+XLA_TEST_F(ScatterTest, BatchDynamicUpdateSlice) {
+  const char* hlo_text = R"(
+HloModule BatchDynamicUpdateSlice
+
+update_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  ROOT rhs = s32[] parameter(1)
+}
+
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[2,2] parameter(1)
+  updates = s32[2,1,1] parameter(2)
+  ROOT scatter = s32[3,3] scatter(operand, indices, updates),
+      to_apply=update_s32,
+      update_window_dims={1,2},
+      inserted_window_dims={},
+      scatter_dims_to_operand_dims={0,1},
+      index_vector_dim=0
+}
+)";
+  std::unique_ptr<Literal> operand =
+      LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> scatter_indices =
+      LiteralUtil::CreateR2<int32>({{2, 1}, {1, 1}});
+  std::unique_ptr<Literal> updates =
+      LiteralUtil::CreateR3<int32>({{{10}}, {{20}}});
+  RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get());
+}
+
+XLA_TEST_F(ScatterTest, ZeroDimBounds) {
+  const char* hlo_text = R"(
+HloModule TensorFlowScatter_ZeroDimBounds
+
+update_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  ROOT rhs = s32[] parameter(1)
+}
+
+ENTRY main {
+  operand = s32[3,0] parameter(0)
+  indices = s32[2] parameter(1)
+  updates = s32[2,0] parameter(2)
+  ROOT scatter = s32[3,0] scatter(operand, indices, updates),
+      to_apply=update_s32,
+      update_window_dims={1},
+      inserted_window_dims={0},
+      scatter_dims_to_operand_dims={0},
+      index_vector_dim=1
+}
+)";
+  std::unique_ptr<Literal> operand = LiteralUtil::CreateR2<int32>({{}, {}, {}});
+  std::unique_ptr<Literal> scatter_indices =
+      LiteralUtil::CreateR1<int32>({0, 2});
+  std::unique_ptr<Literal> updates = LiteralUtil::CreateR2<int32>({{}, {}});
+  RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get());
+}
+
+XLA_TEST_F(ScatterTest, NoUpdateWindowDims) {
+  const string hlo_text = R"(
+HloModule Scatter_NoUpdateWindowDims
+
+add_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  rhs = s32[] parameter(1)
+  ROOT add = s32[] add(s32[] lhs, s32[] rhs)
+}
+
+ENTRY main {
+  operand = s32[3] parameter(0)
+  indices = s32[2,2,1] parameter(1)
+  updates = s32[2,2] parameter(2)
+  ROOT scatter = s32[3] scatter(operand, indices, updates),
+      to_apply=add_s32,
+      update_window_dims={},
+      inserted_window_dims={0},
+      scatter_dims_to_operand_dims={0},
+      index_vector_dim=2
+}
+)";
+  std::unique_ptr<Literal> operand = LiteralUtil::CreateR1<int32>({0, 1, 2});
+  std::unique_ptr<Literal> scatter_indices =
+      LiteralUtil::CreateR3<int32>({{{0}, {1}}, {{2}, {1}}});
+  std::unique_ptr<Literal> updates =
+      LiteralUtil::CreateR2<int32>({{10, 20}, {30, 40}});
+  RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get());
+}
+
+XLA_TEST_F(ScatterTest, OutOfBoundsIndex) {
+  const string hlo_text = R"(
+HloModule BatchDynamicSlice
+
+update_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  ROOT rhs = s32[] parameter(1)
+}
+
+ENTRY main {
+  operand = s32[3,3]{1,0} parameter(0)
+  indices = s32[6,2]{1,0} parameter(1)
+  updates = s32[6,1,1]{2,1,0} parameter(2)
+  ROOT scatter = s32[3,3]{1,0} scatter(operand, indices, updates),
+      to_apply=update_s32,
+      update_window_dims={1,2},
+      inserted_window_dims={},
+      scatter_dims_to_operand_dims={0,1},
+      index_vector_dim=1
+}
+)";
+  std::unique_ptr<Literal> operand =
+      LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> scatter_indices = LiteralUtil::CreateR2<int32>(
+      {{2, 7}, {2, 1}, {1, 1}, {5, 1}, {2147483647, 1}, {1, 2}});
+  std::unique_ptr<Literal> updates = LiteralUtil::CreateR3<int32>(
+      {{{10}}, {{20}}, {{30}}, {{40}}, {{50}}, {{60}}});
+  RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get());
+}
+
+XLA_TEST_F(ScatterTest, OutOfBoundsUnsignedIndex) {
+  const string hlo_text = R"(
+HloModule BatchDynamicSlice
+
+update_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  ROOT rhs = s32[] parameter(1)
+}
+
+ENTRY main {
+  operand = s32[3,3]{1,0} parameter(0)
+  indices = u32[6,2]{1,0} parameter(1)
+  updates = s32[6,1,1]{2,1,0} parameter(2)
+  ROOT scatter = s32[3,3]{1,0} scatter(operand, indices, updates),
+      to_apply=update_s32,
+      update_window_dims={1,2},
+      inserted_window_dims={},
+      scatter_dims_to_operand_dims={0,1},
+      index_vector_dim=1
+}
+)";
+  std::unique_ptr<Literal> operand =
+      LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> scatter_indices = LiteralUtil::CreateR2<uint32>(
+      {{2, 7}, {2, 1}, {1, 1}, {5, 1}, {2147483648u, 1}, {1, 2}});
+  std::unique_ptr<Literal> updates = LiteralUtil::CreateR3<int32>(
+      {{{10}}, {{20}}, {{30}}, {{40}}, {{50}}, {{60}}});
+  RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get());
+}
+
+XLA_TEST_F(ScatterTest, NegativeIndex) {
+  const string hlo_text = R"(
+HloModule BatchDynamicSlice
+
+update_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  ROOT rhs = s32[] parameter(1)
+}
+
+ENTRY main {
+  operand = s32[3,3]{1,0} parameter(0)
+  indices = s32[6,2]{1,0} parameter(1)
+  updates = s32[6,1,1]{2,1,0} parameter(2)
+  ROOT scatter = s32[3,3]{1,0} scatter(operand, indices, updates),
+      to_apply=update_s32,
+      update_window_dims={1,2},
+      inserted_window_dims={},
+      scatter_dims_to_operand_dims={0,1},
+      index_vector_dim=1
+}
+)";
+  std::unique_ptr<Literal> operand =
+      LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> scatter_indices = LiteralUtil::CreateR2<int32>(
+      {{2, 7}, {2, 1}, {1, 1}, {-500, 1}, {-2147483648, 1}, {1, 2}});
+  std::unique_ptr<Literal> updates = LiteralUtil::CreateR3<int32>(
+      {{{10}}, {{20}}, {{30}}, {{40}}, {{50}}, {{60}}});
+  RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get());
+}
+
+XLA_TEST_F(ScatterTest, OneScalarIndex) {
+  const char* hlo_text = R"(
+HloModule OneScalarIndex
+
+update_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  ROOT rhs = s32[] parameter(1)
+}
+
+ENTRY main {
+  operand = s32[2,3,2]{2,1,0} parameter(0)
+  index = s32[] parameter(1)
+  updates = s32[1,3,2]{2,1,0} parameter(2)
+  ROOT scatter = s32[2,3,2]{2,1,0} scatter(operand, index, updates),
+      to_apply=update_s32,
+      update_window_dims={0,1,2},
+      inserted_window_dims={},
+      scatter_dims_to_operand_dims={0},
+      index_vector_dim=0
+}
+)";
+  std::unique_ptr<Literal> operand = LiteralUtil::CreateR3<int32>(
+      {{{1, 2}, {3, 4}, {5, 6}}, {{7, 8}, {9, 10}, {11, 12}}});
+  std::unique_ptr<Literal> scatter_indices = LiteralUtil::CreateR0<int32>(1);
+  std::unique_ptr<Literal> updates =
+      LiteralUtil::CreateR3<int32>({{{10, 20}, {30, 40}, {50, 60}}});
+  RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get());
+}
+
+XLA_TEST_F(ScatterTest, ScalarUpdate) {
+  const char* hlo_text = R"(
+HloModule ScalarUpdate
+
+update_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  ROOT rhs = s32[] parameter(1)
+}
+
+ENTRY main {
+  operand = s32[4]{0} parameter(0)
+  index = s32[] parameter(1)
+  updates = s32[] parameter(2)
+  ROOT scatter = s32[4]{0} scatter(operand, index, updates),
+      to_apply=update_s32,
+      update_window_dims={},
+      inserted_window_dims={0},
+      scatter_dims_to_operand_dims={0},
+      index_vector_dim=0
+}
+)";
+  std::unique_ptr<Literal> operand = LiteralUtil::CreateR1<int32>({1, 2, 3, 4});
+  std::unique_ptr<Literal> scatter_indices = LiteralUtil::CreateR0<int32>(1);
+  std::unique_ptr<Literal> updates = LiteralUtil::CreateR0<int32>(25);
+  RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get());
+}
+
+XLA_TEST_F(ScatterTest, EmptyIndices) {
+  const string hlo_text = R"(
+HloModule EmptyIndices
+
+update_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  ROOT rhs = s32[] parameter(1)
+}
+
+ENTRY main {
+  operand = s32[3] parameter(0)
+  indices = s32[0] parameter(1)
+  updates = s32[0] parameter(2)
+  ROOT scatter = s32[3] scatter(operand, indices, updates),
+      to_apply=update_s32,
+      update_window_dims={},
+      inserted_window_dims={0},
+      scatter_dims_to_operand_dims={0},
+      index_vector_dim=1
+}
+)";
+  std::unique_ptr<Literal> operand = LiteralUtil::CreateR1<int32>({1, 2, 3});
+  std::unique_ptr<Literal> scatter_indices = LiteralUtil::CreateR1<int32>({});
+  std::unique_ptr<Literal> updates = LiteralUtil::CreateR1<int32>({});
+  RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get());
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/test_utils.cc b/tensorflow/compiler/xla/tests/test_utils.cc
index 2647937..f05421f 100644
--- a/tensorflow/compiler/xla/tests/test_utils.cc
+++ b/tensorflow/compiler/xla/tests/test_utils.cc
@@ -13,6 +13,8 @@
 limitations under the License.
 ==============================================================================*/
 
+#include <cmath>
+
 #include "tensorflow/compiler/xla/tests/test_utils.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
@@ -26,89 +28,101 @@
 
 template <typename FloatT, typename GeneratorT>
 void PopulateWithRandomFloatingPointDataImpl(Literal* literal,
-                                             std::minstd_rand0* engine) {
+                                             std::minstd_rand0* engine,
+                                             bool no_duplicates) {
   CHECK(engine != nullptr);
   CHECK_EQ(literal->shape().element_type(),
            primitive_util::NativeToPrimitiveType<FloatT>());
-  // Create uniform numbers between 1 and 1.125 to avoid creating denormal
-  // numbers.
-  std::uniform_real_distribution<GeneratorT> generator(1.0f, 1.125f);
-  const bool should_index_bias = ShapeUtil::ElementsIn(literal->shape()) > 1000;
-  TF_CHECK_OK(literal->Populate<FloatT>(
-      [&](tensorflow::gtl::ArraySlice<int64> indices) {
-        // Generate a random uniform number from -0.0625 and 0.0625 and bias it
-        // with a position dependent number with mean 0.037109375. These number
-        // should allow for long chains of accumulation without being too close
-        // to zero or too large to accumulate all numbers accurately. Only do
-        // this for large literals where the number of elements is much greater
-        // than 47 otherwise only negative values are produced.
-        //
-        // The value is positionally biased using a product of the indices. Add
-        // one to each index value to avoid collapsing to zero if any of the
-        // indices are zero.
-        int64 index_product = 1;
-        for (int64 i : indices) {
-          index_product *= (1 + i);
-        }
-        const int64 negative_bias = should_index_bias ? 47 : 0;
-        FloatT index_bias =
-            static_cast<FloatT>(index_product % 113 - negative_bias) /
-            static_cast<FloatT>(256.0f);
-        return static_cast<FloatT>(generator(*engine) - 1.0625f) + index_bias;
-      }));
+  if (no_duplicates) {
+    // Duplicates may be generated if the number of elements in the literal
+    // exceeds the number of positive values supported by the type.
+    FloatT next_value = std::numeric_limits<FloatT>::min();
+    for (FloatT& value : literal->data<FloatT>()) {
+      value = next_value;
+      next_value =
+          std::nextafter(next_value, std::numeric_limits<FloatT>::max());
+    }
+    std::shuffle(literal->data<FloatT>().begin(), literal->data<FloatT>().end(),
+                 *engine);
+  } else {
+    std::uniform_real_distribution<GeneratorT> generator(-0.1f, 0.2f);
+    for (FloatT& value : literal->data<FloatT>()) {
+      value = static_cast<FloatT>(generator(*engine));
+    }
+  }
 }
 
 template <typename FloatT>
 void PopulateWithRandomFloatingPointData(Literal* literal,
-                                         std::minstd_rand0* engine) {
+                                         std::minstd_rand0* engine,
+                                         bool no_duplicates) {
   CHECK(engine != nullptr);
-  PopulateWithRandomFloatingPointDataImpl<FloatT, FloatT>(literal, engine);
+  PopulateWithRandomFloatingPointDataImpl<FloatT, FloatT>(literal, engine,
+                                                          no_duplicates);
 }
 
 template <>
 void PopulateWithRandomFloatingPointData<half>(Literal* literal,
-                                               std::minstd_rand0* engine) {
+                                               std::minstd_rand0* engine,
+                                               bool no_duplicates) {
+  // no_duplicates is ignored for half types. Unique values can only be
+  // generated for arrays with fewer than ~2**16 elements and no_duplicates is
+  // best-effort anyway.
   CHECK(engine != nullptr);
-  PopulateWithRandomFloatingPointDataImpl<half, float>(literal, engine);
+  std::uniform_real_distribution<float> generator(-0.1f, 0.2f);
+  for (half& value : literal->data<half>()) {
+    value = static_cast<half>(generator(*engine));
+  }
 }
 
-// The standard library does not have a case for bfloat16, unsurprisingly, so we
-// handle that one specially.
 template <>
 void PopulateWithRandomFloatingPointData<bfloat16>(Literal* literal,
-                                                   std::minstd_rand0* engine) {
+                                                   std::minstd_rand0* engine,
+                                                   bool no_duplicates) {
+  // no_duplicates is ignored for bfloat16 types. Unique values can only be
+  // generated for arrays with fewer than ~2**16 elements and no_duplicates is
+  // best-effort anyway.
   CHECK(engine != nullptr);
-  CHECK_EQ(literal->shape().element_type(), BF16);
-  std::uniform_real_distribution<float> generator(-0.9f, 1.0f);
-  TF_CHECK_OK(literal->Populate<bfloat16>(
-      [&](tensorflow::gtl::ArraySlice<int64> /*indices*/) {
-        return static_cast<bfloat16>(generator(*engine));
-      }));
+  std::uniform_real_distribution<float> generator(-0.1f, 0.2f);
+  for (bfloat16& value : literal->data<bfloat16>()) {
+    value = static_cast<bfloat16>(generator(*engine));
+  }
 }
 
 template <typename IntT>
-void PopulateWithRandomIntegralData(Literal* literal,
-                                    std::minstd_rand0* engine) {
+void PopulateWithRandomIntegralData(Literal* literal, std::minstd_rand0* engine,
+                                    bool no_duplicates) {
   CHECK(engine != nullptr);
   CHECK_EQ(literal->shape().element_type(),
            primitive_util::NativeToPrimitiveType<IntT>());
-  std::uniform_int_distribution<IntT> generator(
-      std::numeric_limits<IntT>::lowest(), std::numeric_limits<IntT>::max());
-  TF_CHECK_OK(literal->Populate<IntT>(
-      [&](tensorflow::gtl::ArraySlice<int64> /*indices*/) {
-        return generator(*engine);
-      }));
+  if (no_duplicates && ShapeUtil::ElementsIn(literal->shape()) <
+                           std::numeric_limits<IntT>::max()) {
+    std::iota(literal->data<IntT>().begin(), literal->data<IntT>().end(), 0);
+    std::shuffle(literal->data<IntT>().begin(), literal->data<IntT>().end(),
+                 *engine);
+  } else {
+    std::uniform_int_distribution<IntT> generator(
+        std::numeric_limits<IntT>::lowest(), std::numeric_limits<IntT>::max());
+    for (IntT& value : literal->data<IntT>()) {
+      value = generator(*engine);
+    }
+  }
 }
 
 // Similar to MakeFakeLiteral but takes a random number generator engine to
-// enable reusing the engine across randomly generated literals.
+// enable reusing the engine across randomly generated literals. 'no_duplicates'
+// indicates that there should be no duplicate values in each generated
+// array. This uniqueness is best-effort only. Some types (half and bfloat16)
+// are not supported and uniqueness cannot be guaranteed if the number of
+// elements exceeds the number of different values supported by the type.
 StatusOr<std::unique_ptr<Literal>> MakeFakeLiteralInternal(
-    const Shape& shape, std::minstd_rand0* engine) {
+    const Shape& shape, std::minstd_rand0* engine, bool no_duplicates) {
   if (ShapeUtil::IsTuple(shape)) {
     std::vector<std::unique_ptr<Literal>> elements;
     for (const Shape& element_shape : shape.tuple_shapes()) {
-      TF_ASSIGN_OR_RETURN(std::unique_ptr<Literal> element,
-                          MakeFakeLiteralInternal(element_shape, engine));
+      TF_ASSIGN_OR_RETURN(
+          std::unique_ptr<Literal> element,
+          MakeFakeLiteralInternal(element_shape, engine, no_duplicates));
       elements.push_back(std::move(element));
     }
     return LiteralUtil::MakeTupleOwned(std::move(elements));
@@ -119,40 +133,52 @@
   auto literal = MakeUnique<Literal>(shape);
   switch (shape.element_type()) {
     case BF16:
-      PopulateWithRandomFloatingPointData<bfloat16>(literal.get(), engine);
+      PopulateWithRandomFloatingPointData<bfloat16>(literal.get(), engine,
+                                                    no_duplicates);
       break;
     case F16:
-      PopulateWithRandomFloatingPointData<half>(literal.get(), engine);
+      PopulateWithRandomFloatingPointData<half>(literal.get(), engine,
+                                                no_duplicates);
       break;
     case F32:
-      PopulateWithRandomFloatingPointData<float>(literal.get(), engine);
+      PopulateWithRandomFloatingPointData<float>(literal.get(), engine,
+                                                 no_duplicates);
       break;
     case F64:
-      PopulateWithRandomFloatingPointData<double>(literal.get(), engine);
+      PopulateWithRandomFloatingPointData<double>(literal.get(), engine,
+                                                  no_duplicates);
       break;
     case S8:
-      PopulateWithRandomIntegralData<int8>(literal.get(), engine);
+      PopulateWithRandomIntegralData<int8>(literal.get(), engine,
+                                           no_duplicates);
       break;
     case U8:
-      PopulateWithRandomIntegralData<uint8>(literal.get(), engine);
+      PopulateWithRandomIntegralData<uint8>(literal.get(), engine,
+                                            no_duplicates);
       break;
     case S16:
-      PopulateWithRandomIntegralData<int16>(literal.get(), engine);
+      PopulateWithRandomIntegralData<int16>(literal.get(), engine,
+                                            no_duplicates);
       break;
     case U16:
-      PopulateWithRandomIntegralData<uint16>(literal.get(), engine);
+      PopulateWithRandomIntegralData<uint16>(literal.get(), engine,
+                                             no_duplicates);
       break;
     case S32:
-      PopulateWithRandomIntegralData<int32>(literal.get(), engine);
+      PopulateWithRandomIntegralData<int32>(literal.get(), engine,
+                                            no_duplicates);
       break;
     case U32:
-      PopulateWithRandomIntegralData<uint32>(literal.get(), engine);
+      PopulateWithRandomIntegralData<uint32>(literal.get(), engine,
+                                             no_duplicates);
       break;
     case S64:
-      PopulateWithRandomIntegralData<int64>(literal.get(), engine);
+      PopulateWithRandomIntegralData<int64>(literal.get(), engine,
+                                            no_duplicates);
       break;
     case U64:
-      PopulateWithRandomIntegralData<uint64>(literal.get(), engine);
+      PopulateWithRandomIntegralData<uint64>(literal.get(), engine,
+                                             no_duplicates);
       break;
     case PRED: {
       std::uniform_int_distribution<int> generator(0, 1);
@@ -208,16 +234,12 @@
 
 // Generate random values that are constrained to the input_shape minus the
 // output_shape so as not to produce wrapping slices, for instance.
-std::unique_ptr<Literal> MakeRandomNonwrappingSliceIndex(
-    const Shape& input_shape, const Shape& slice_shape,
-    std::minstd_rand0* engine) {
-  const int64 rank = ShapeUtil::Rank(input_shape);
-  std::vector<int32> start_indices(rank);
+std::unique_ptr<Literal> MakeRandomIndex(
+    tensorflow::gtl::ArraySlice<int64> index_space, std::minstd_rand0* engine) {
+  std::vector<int32> start_indices(index_space.size());
   if (engine != nullptr) {
-    for (int i = 0; i < rank; ++i) {
-      const int32 upper_bound = ShapeUtil::GetDimension(input_shape, i) -
-                                ShapeUtil::GetDimension(slice_shape, i);
-      std::uniform_int_distribution<int32> generator(0, upper_bound);
+    for (int i = 0; i < index_space.size(); ++i) {
+      std::uniform_int_distribution<int32> generator(0, index_space[i]);
       start_indices[i] = generator(*engine);
     }
   }
@@ -254,6 +276,11 @@
         auto converted_uses = FindConstrainedUses(dataflow, *instruction);
         constrained_uses.insert(constrained_uses.end(), converted_uses.begin(),
                                 converted_uses.end());
+      } else if (opcode == HloOpcode::kSort &&
+                 instruction->operand_count() == 2 && op_num == 0) {
+        // Operand 0 of sort is the array of keys used for key/value
+        // (two-operand) kSort instructions.
+        constrained_uses.push_back(instruction);
       }
     }
   }
@@ -267,56 +294,66 @@
 StatusOr<std::unique_ptr<Literal>> CreateLiteralForConstrainedUses(
     const tensorflow::gtl::ArraySlice<HloInstruction*> constrained_uses,
     const HloInstruction& param, std::minstd_rand0* engine) {
-  HloInstruction* needs_index = nullptr;
-  HloInstruction* needs_constant = nullptr;
+  std::vector<int64> index_space;
+  bool no_duplicates = false;
+  bool needs_constant = false;
   ConstantType constant_type = ConstantType::kUnknown;
   for (HloInstruction* use : constrained_uses) {
     switch (use->opcode()) {
       case HloOpcode::kDynamicSlice:
-      case HloOpcode::kDynamicUpdateSlice:
-        if (needs_index != nullptr) {
-          auto needs_index_shape = needs_index->shape();
-          auto use_shape = use->shape();
-          if (needs_index->opcode() == HloOpcode::kDynamicSlice) {
-            needs_index_shape = needs_index->operand(0)->shape();
+      case HloOpcode::kDynamicUpdateSlice: {
+        const Shape& indexed_shape = use->operand(0)->shape();
+        const Shape& slice_shape = use->opcode() == HloOpcode::kDynamicSlice
+                                       ? use->shape()
+                                       : use->operand(1)->shape();
+        const int64 rank = ShapeUtil::Rank(indexed_shape);
+        if (!index_space.empty()) {
+          TF_RET_CHECK(rank == index_space.size());
+          for (int64 i = 0; i < rank; ++i) {
+            index_space[i] = std::min(
+                index_space[i], ShapeUtil::GetDimension(indexed_shape, i) -
+                                    ShapeUtil::GetDimension(slice_shape, i));
           }
-          if (use->opcode() == HloOpcode::kDynamicSlice) {
-            use_shape = use->operand(0)->shape();
-          }
-          if (!ShapeUtil::Equal(needs_index_shape, use_shape)) {
-            return Unimplemented(
-                "Conflicting operand generation slice index constraints\n");
+        } else {
+          index_space.resize(rank);
+          for (int64 i = 0; i < rank; ++i) {
+            index_space[i] = ShapeUtil::GetDimension(indexed_shape, i) -
+                             ShapeUtil::GetDimension(slice_shape, i);
           }
         }
-        needs_index = use;
         break;
+      }
       case HloOpcode::kReduce:
       case HloOpcode::kReduceWindow:
-        needs_constant = use;
+        needs_constant = true;
         constant_type = GetInitValue(*use->to_apply());
         break;
 
       case HloOpcode::kSelectAndScatter:
-        needs_constant = use;
+        needs_constant = true;
         constant_type = GetInitValue(*use->scatter());
         break;
 
+      case HloOpcode::kSort:
+        no_duplicates = true;
+        break;
+
       default:
         return Unimplemented(
             "Constrained operand generation not implemented for %s.",
             use->ToString().c_str());
     }
   }
-  if (needs_index != nullptr && needs_constant != nullptr) {
-    return Unimplemented(
-        "Conflicting operand generation constraints.\nNeeds index: %s\nNeeds "
-        "constant: %s\n",
-        needs_index->ToString().c_str(), needs_constant->ToString().c_str());
+  int constraint_count = 0;
+  constraint_count += no_duplicates ? 1 : 0;
+  constraint_count += !index_space.empty() ? 1 : 0;
+  constraint_count += needs_constant ? 1 : 0;
+  if (constraint_count > 1) {
+    return Unimplemented("Conflicting operand generation constraints.");
   }
-  if (needs_index != nullptr) {
-    return MakeRandomNonwrappingSliceIndex(needs_index->operand(0)->shape(),
-                                           needs_index->shape(), engine);
-  } else if (needs_constant != nullptr) {
+  if (!index_space.empty()) {
+    return MakeRandomIndex(index_space, engine);
+  } else if (needs_constant) {
     switch (constant_type) {
       case ConstantType::kZero:
         return LiteralUtil::Zero(param.shape().element_type()).CloneToUnique();
@@ -325,10 +362,11 @@
       case ConstantType::kUnknown:
         // We want the identity element for the computation, but we don't really
         // know what it is - so any value we generate will be just as wrong.
-        return MakeFakeLiteralInternal(param.shape(), engine);
+        return MakeFakeLiteralInternal(param.shape(), engine,
+                                       /*no_duplicates=*/false);
     }
   } else {
-    return MakeFakeLiteralInternal(param.shape(), engine);
+    return MakeFakeLiteralInternal(param.shape(), engine, no_duplicates);
   }
 }
 
@@ -346,18 +384,23 @@
 StatusOr<std::unique_ptr<Literal>> MakeFakeLiteral(const Shape& shape,
                                                    bool pseudo_random) {
   auto engine = pseudo_random ? MakeUnique<std::minstd_rand0>() : nullptr;
-  return MakeFakeLiteralInternal(shape, engine.get());
+  return MakeFakeLiteralInternal(shape, engine.get(), /*no_duplicates=*/false);
 }
 
 StatusOr<std::vector<std::unique_ptr<Literal>>> MakeFakeArguments(
     HloModule* const module, bool pseudo_random) {
+  auto engine = pseudo_random ? MakeUnique<std::minstd_rand0>() : nullptr;
+  return MakeFakeArguments(module, engine.get());
+}
+
+StatusOr<std::vector<std::unique_ptr<Literal>>> MakeFakeArguments(
+    HloModule* const module, std::minstd_rand0* engine) {
   TF_ASSIGN_OR_RETURN(auto dataflow, HloDataflowAnalysis::Run(*module));
   const auto params = module->entry_computation()->parameter_instructions();
-  auto engine = pseudo_random ? MakeUnique<std::minstd_rand0>() : nullptr;
   std::vector<std::unique_ptr<Literal>> arguments(params.size());
   for (int i = 0; i < params.size(); ++i) {
-    TF_ASSIGN_OR_RETURN(arguments[i], MakeConstrainedArgument(
-                                          *dataflow, *params[i], engine.get()));
+    arguments[i] =
+        MakeConstrainedArgument(*dataflow, *params[i], engine).ValueOrDie();
   }
   return std::move(arguments);
 }
diff --git a/tensorflow/compiler/xla/tests/test_utils.h b/tensorflow/compiler/xla/tests/test_utils.h
index e59f215..3a8ad80 100644
--- a/tensorflow/compiler/xla/tests/test_utils.h
+++ b/tensorflow/compiler/xla/tests/test_utils.h
@@ -63,8 +63,17 @@
 // Generates a vector of arguments containing fake data. The number, shape and
 // layout of the arguments is appropriate for given HLO module.
 //
-// Will handle special cases such as making sure that indices used for dynamic
-// slices are bounded, reduces that call adds use 0 as an init value, etc.
+// A best-effort attempt is made to generate the data in a way which produces
+// stable computation results across platforms. Specifically:
+//
+//  (1) Init values of reductions should be the identity of the reduction
+//  computation.
+//
+//  (2) Indices of dynamic slices and update slices should be in bounds.
+//
+//  (3) Keys of key/value sorts should contain no duplicates.
+//
+// These constraints are best-effort only.
 //
 // If pseudo_random is true, the generated numbers will be generated
 // deterministically in a pseudo random way unless the values are constrated to
@@ -78,6 +87,12 @@
 StatusOr<std::vector<std::unique_ptr<Literal>>> MakeFakeArguments(
     HloModule* const module, bool pseudo_random = true);
 
+// Overload which accepts a random number generator. This enables generation of
+// different random values with sequential calls to MakeFakeArguments by reusing
+// the same generator.
+StatusOr<std::vector<std::unique_ptr<Literal>>> MakeFakeArguments(
+    HloModule* const module, std::minstd_rand0* engine);
+
 // Check that a given module satisfies various constraints before trying to
 // execute it.
 Status VerifyHloModule(HloModule* const module,
diff --git a/tensorflow/compiler/xla/tests/test_utils_test.cc b/tensorflow/compiler/xla/tests/test_utils_test.cc
index a2f0338..322c8ef 100644
--- a/tensorflow/compiler/xla/tests/test_utils_test.cc
+++ b/tensorflow/compiler/xla/tests/test_utils_test.cc
@@ -20,6 +20,7 @@
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/tests/local_client_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 
 namespace xla {
@@ -72,5 +73,106 @@
   TF_ASSERT_OK(MakeFakeArguments(module.get()).status());
 }
 
+XLA_TEST_F(TestUtilsTest, MultipleIndexSpacesForDynamicSlices) {
+  auto module = ParseHloString(
+                    R"(HloModule index_space_module
+
+    ENTRY IndexSpace {
+      index_param = s32[3]{0} parameter(0)
+      array_param.1 = f32[123,4,789]{0,1,2} parameter(1)
+      array_param.2 = f32[3,3000,5]{0,1,2} parameter(2)
+      dynamic-slice.1 = f32[1,2,3] dynamic-slice(array_param.1, index_param), dynamic_slice_sizes={1,2,3}
+      ROOT dynamic-slice.2 = f32[3,2,2] dynamic-slice(array_param.2, index_param), dynamic_slice_sizes={3,2,2}
+    })")
+                    .ValueOrDie();
+  TF_ASSERT_OK_AND_ASSIGN(std::vector<std::unique_ptr<Literal>> args,
+                          MakeFakeArguments(module.get()));
+  ASSERT_EQ(args.size(), 3);
+  const Literal& index_arg = *args[0];
+
+  EXPECT_EQ(index_arg.Get<int32>({0}), 0);
+
+  EXPECT_GE(index_arg.Get<int32>({1}), 0);
+  EXPECT_LE(index_arg.Get<int32>({1}), 2);
+
+  EXPECT_GE(index_arg.Get<int32>({2}), 0);
+  EXPECT_LE(index_arg.Get<int32>({2}), 3);
+}
+
+XLA_TEST_F(TestUtilsTest, MultipleIndexSpacesForDynamicUpdateSlices) {
+  auto module = ParseHloString(
+                    R"(HloModule index_space_module
+
+    ENTRY IndexSpace {
+      index_param = s32[3]{0} parameter(0)
+      array_param.1 = f32[123,4,789]{0,1,2} parameter(1)
+      array_param.2 = f32[3,3000,5]{0,1,2} parameter(2)
+      update_param.1 = f32[1,2,3]{0,1,2} parameter(3)
+      update_param.2 = f32[3,2,2]{0,1,2} parameter(4)
+
+      dynamic-update-slice.1 = f32[123,4,789] dynamic-update-slice(array_param.1, update_param.1, index_param)
+      ROOT dynamic-update-slice.2 = f32[3,3000,5] dynamic-update-slice(array_param.2, update_param.2, index_param)
+    })")
+                    .ValueOrDie();
+  TF_ASSERT_OK_AND_ASSIGN(std::vector<std::unique_ptr<Literal>> args,
+                          MakeFakeArguments(module.get()));
+  ASSERT_EQ(args.size(), 5);
+  const Literal& index_arg = *args[0];
+
+  EXPECT_EQ(index_arg.Get<int32>({0}), 0);
+
+  EXPECT_GE(index_arg.Get<int32>({1}), 0);
+  EXPECT_LE(index_arg.Get<int32>({1}), 2);
+
+  EXPECT_GE(index_arg.Get<int32>({2}), 0);
+  EXPECT_LE(index_arg.Get<int32>({2}), 3);
+}
+
+XLA_TEST_F(TestUtilsTest, NoDuplicatesFloats) {
+  // Inputs which are sort keys in key/value sorts should have no duplicates.
+  auto module = ParseHloString(R"(
+HloModule sort.148.1589
+
+ENTRY %sort.148.1589 (parameter.0: f32[1048576], parameter.1: s32[1048576]) -> (f32[1048576], s32[1048576]) {
+  %parameter.0 = f32[1048576]{0} parameter(0)
+  %parameter.1 = s32[1048576]{0} parameter(1)
+  ROOT %sort.148.1589 = (f32[1048576]{0}, s32[1048576]{0}) sort(f32[1048576]{0} %parameter.0, s32[1048576]{0} %parameter.1), dimensions={0}
+}
+)")
+                    .ValueOrDie();
+  TF_ASSERT_OK_AND_ASSIGN(std::vector<std::unique_ptr<Literal>> args,
+                          MakeFakeArguments(module.get()));
+  ASSERT_EQ(args.size(), 2);
+  const Literal& key_arg = *args[0];
+
+  tensorflow::gtl::FlatSet<uint32> key_set;
+  for (const float& value : key_arg.data<float>()) {
+    EXPECT_TRUE(key_set.insert(tensorflow::bit_cast<uint32>(value)).second);
+  }
+}
+
+XLA_TEST_F(TestUtilsTest, NoDuplicatesInt32) {
+  // Inputs which are sort keys in key/value sorts should have no duplicates.
+  auto module = ParseHloString(R"(
+HloModule sort.148.1589
+
+ENTRY %sort.148.1589 (parameter.0: s32[1048576], parameter.1: s32[1048576]) -> (s32[1048576], s32[1048576]) {
+  %parameter.0 = s32[1048576]{0} parameter(0)
+  %parameter.1 = s32[1048576]{0} parameter(1)
+  ROOT %sort.148.1589 = (s32[1048576]{0}, s32[1048576]{0}) sort(s32[1048576]{0} %parameter.0, s32[1048576]{0} %parameter.1), dimensions={0}
+}
+)")
+                    .ValueOrDie();
+  TF_ASSERT_OK_AND_ASSIGN(std::vector<std::unique_ptr<Literal>> args,
+                          MakeFakeArguments(module.get()));
+  ASSERT_EQ(args.size(), 2);
+  const Literal& key_arg = *args[0];
+
+  tensorflow::gtl::FlatSet<int32> key_set;
+  for (const int32& value : key_arg.data<int32>()) {
+    EXPECT_TRUE(key_set.insert(tensorflow::bit_cast<uint32>(value)).second);
+  }
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/tuple_test.cc b/tensorflow/compiler/xla/tests/tuple_test.cc
index 2fd70b7..97bbf80 100644
--- a/tensorflow/compiler/xla/tests/tuple_test.cc
+++ b/tensorflow/compiler/xla/tests/tuple_test.cc
@@ -586,9 +586,9 @@
           }));
   auto expected =
       LiteralUtil::MakeTupleOwned(LiteralUtil::CreateR1<float>({2, 3}));
-  auto literal = MakeUnique<Literal>();
+  auto literal = Literal::CreateFromShape(expected->shape());
   TF_EXPECT_OK(backend().transfer_manager()->TransferLiteralFromOutfeed(
-      backend().default_stream_executor(), expected->shape(), literal.get()));
+      backend().default_stream_executor(), expected->shape(), *literal));
   EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *literal));
 }
 
diff --git a/tensorflow/compiler/xla/tools/replay_computation.cc b/tensorflow/compiler/xla/tools/replay_computation.cc
index be4cf43..b477423 100644
--- a/tensorflow/compiler/xla/tools/replay_computation.cc
+++ b/tensorflow/compiler/xla/tools/replay_computation.cc
@@ -223,9 +223,13 @@
                                      const Options& opts) {
   tensorflow::Env* env = tensorflow::Env::Default();
   HloSnapshot snapshot;
-  if (tensorflow::ReadBinaryProto(env, filename, &snapshot).ok()) {
+  auto s = tensorflow::ReadBinaryProto(env, filename, &snapshot);
+  if (s.ok()) {
     return snapshot;
   }
+  if (s.code() == tensorflow::error::NOT_FOUND) {
+    return s;
+  }
   CHECK(opts.use_fake_data)
       << "Without --use_fake_data, you must pass an HloSnapshot -- HloProto "
          "and textual HLO don't carry real data.";
@@ -258,6 +262,9 @@
     StatusOr<HloSnapshot> maybe_snapshot = ParseInputFile(arg, opts);
     if (maybe_snapshot.ok()) {
       snapshots.push_back(std::move(maybe_snapshot).ValueOrDie());
+    } else {
+      LOG(ERROR) << "Can't handle file " << arg << ": "
+                 << maybe_snapshot.status();
     }
   }
 
diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto
index 10c0adc..b53f89d 100644
--- a/tensorflow/compiler/xla/xla.proto
+++ b/tensorflow/compiler/xla/xla.proto
@@ -104,15 +104,6 @@
   // interpretation of this value is left to the backends.
   int32 xla_backend_optimization_level = 31;
 
-  // When true, "unsafe" mathematical optimizations are enabled. These
-  // transformations include but are not limited to:
-  //
-  //  - Reducing the precision of operations (e.g. using an approximate sin
-  //    function, or transforming x/y into x * (1/y)).
-  //  - Assuming that operations never produce or consume NaN or +/- Inf.
-  //  - Assuming that +0 and -0 are indistinguishable.
-  bool xla_enable_fast_math = 32;
-
   // Embed the compiler IR as a string in the executable.
   bool xla_embed_ir_in_executable = 33;
 
@@ -194,8 +185,23 @@
   // Maximum kernel unroll factor for the GPU backend.
   int32 xla_gpu_max_kernel_unroll_factor = 98;
 
-  // Extra options to pass to the compilation backend; specific interpretation
-  // of these values is left to the backend.
+  // When true, "unsafe" mathematical optimizations are enabled. These
+  // transformations include but are not limited to:
+  //
+  //  - Reducing the precision of operations (e.g. using an approximate sin
+  //    function, or transforming x/y into x * (1/y)).
+  //  - Assuming that operations never produce or consume NaN or +/- Inf.
+  //  - Assuming that +0 and -0 are indistinguishable.
+  bool xla_cpu_enable_fast_math = 99;
+  bool xla_gpu_enable_fast_math = 100;
+
+  // Crashes the program when any kind of verification fails, instead of just
+  // logging the failures. One example is cross checking of convolution results
+  // among different algorithms.
+  bool xla_gpu_crash_on_verification_failures = 101;
+
+  // Extra options to pass to the compilation backend (e.g. LLVM); specific
+  // interpretation of these values is left to the backend.
   map<string, string> xla_backend_extra_options = 500;
 }
 
diff --git a/tensorflow/compiler/xla/xla_data.proto b/tensorflow/compiler/xla/xla_data.proto
index fd784e9..27aa94c 100644
--- a/tensorflow/compiler/xla/xla_data.proto
+++ b/tensorflow/compiler/xla/xla_data.proto
@@ -424,25 +424,25 @@
   // "Window indices" is a term for a set of indices that index into the
   // interior of a dynamic-slice from the input tensor, the starting indices for
   // which were computed from output_gather_dims (see the operation semantic for
-  // how this is defined) and the gather_indices tensor.
+  // how this is defined) and the start_indices tensor.
   //
   // The window indices for a specific output index Out is computed as:
   //
   //  i = 0
   //  for (k : [0, input_tensor_shape.rank))
   //    window_indices[k] =
-  //      if k in elided_window_dims
+  //      if k in collapsed_slice_dims
   //      then 0
-  //      else Out[output_window_dims[i++]]
-  repeated int64 output_window_dims = 1;
-  repeated int64 elided_window_dims = 2;
+  //      else Out[offset_dims[i++]]
+  repeated int64 offset_dims = 1;
+  repeated int64 collapsed_slice_dims = 2;
 
-  // This is interpreted as a map from i to gather_dims_to_operand_dims[i]. It
-  // transforms the gather index looked up from the gather_indices tensor into
+  // This is interpreted as a map from i to start_index_map[i]. It
+  // transforms the gather index looked up from the start_indices tensor into
   // the starting index in the input space.
-  repeated int64 gather_dims_to_operand_dims = 3;
+  repeated int64 start_index_map = 3;
 
-  // The dimension in the gather_indices input that contains the starting
+  // The dimension in the start_indices input that contains the starting
   // indices.
   int64 index_vector_dim = 4;
 }
@@ -561,3 +561,11 @@
   // to.
   repeated OpSharding tuple_shardings = 5;
 }
+
+// Describes the replica groups in a cross replica op (e.g., all-reduce and
+// all-to-all).
+message ReplicaGroup {
+  // The ids of the replicas that belong to the same group. The ordering of the
+  // ids matters in some ops (e.g., all-to-all).
+  repeated int64 replica_ids = 1;
+}
diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index cc34db9..23bb783 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -46,6 +46,7 @@
         "//tensorflow/contrib/gan",
         "//tensorflow/contrib/graph_editor:graph_editor_py",
         "//tensorflow/contrib/grid_rnn:grid_rnn_py",
+        "//tensorflow/contrib/hadoop",
         "//tensorflow/contrib/hooks",
         "//tensorflow/contrib/image:distort_image_py",
         "//tensorflow/contrib/image:image_py",
@@ -146,6 +147,7 @@
         "//tensorflow/contrib/coder:all_kernels",
         "//tensorflow/contrib/data/kernels:dataset_kernels",
         "//tensorflow/contrib/factorization/kernels:all_kernels",
+        "//tensorflow/contrib/hadoop:dataset_kernels",
         "//tensorflow/contrib/input_pipeline:input_pipeline_ops_kernels",
         "//tensorflow/contrib/layers:sparse_feature_cross_op_kernel",
         "//tensorflow/contrib/nearest_neighbor:nearest_neighbor_ops_kernels",
@@ -181,6 +183,7 @@
         "//tensorflow/contrib/data:dataset_ops_op_lib",
         "//tensorflow/contrib/factorization:all_ops",
         "//tensorflow/contrib/framework:all_ops",
+        "//tensorflow/contrib/hadoop:dataset_ops_op_lib",
         "//tensorflow/contrib/input_pipeline:input_pipeline_ops_op_lib",
         "//tensorflow/contrib/layers:sparse_feature_cross_op_op_lib",
         "//tensorflow/contrib/nccl:nccl_ops_op_lib",
diff --git a/tensorflow/contrib/__init__.py b/tensorflow/contrib/__init__.py
index ded05da..e18ea8d 100644
--- a/tensorflow/contrib/__init__.py
+++ b/tensorflow/contrib/__init__.py
@@ -22,6 +22,7 @@
 import os
 
 # Add projects here, they will show up under tf.contrib.
+from tensorflow.contrib import autograph
 from tensorflow.contrib import batching
 from tensorflow.contrib import bayesflow
 from tensorflow.contrib import checkpoint
diff --git a/tensorflow/contrib/all_reduce/python/all_reduce.py b/tensorflow/contrib/all_reduce/python/all_reduce.py
index 159d985..3b53973 100644
--- a/tensorflow/contrib/all_reduce/python/all_reduce.py
+++ b/tensorflow/contrib/all_reduce/python/all_reduce.py
@@ -32,10 +32,10 @@
   """Check tensors for isomorphism and flatten.
 
   Args:
-    tensors: list of T @{tf.Tensor} which must all have the same shape.
+    tensors: list of T `tf.Tensor` which must all have the same shape.
 
   Returns:
-    tensors: a list of T @{tf.Tensor} which are flattened (1D) views of tensors
+    tensors: a list of T `tf.Tensor` which are flattened (1D) views of tensors
     shape: the original shape of each element of input tensors
 
   Raises:
@@ -61,12 +61,12 @@
   """Reshape tensors flattened by _flatten_tensors.
 
   Args:
-    tensors: list of T @{tf.Tensor} of identical length 1D tensors.
+    tensors: list of T `tf.Tensor` of identical length 1D tensors.
     shape: list of integers describing the desired shape.  Product of
       the elements must equal the length of each tensor.
 
   Returns:
-    list of T @{tf.Tensor} which are the reshaped inputs.
+    list of T `tf.Tensor` which are the reshaped inputs.
   """
   reshaped = []
   for t in tensors:
@@ -79,12 +79,12 @@
   """Like split for 1D tensors but pads-out case where len % pieces != 0.
 
   Args:
-    tensor: T @{tf.Tensor} that must be 1D.
+    tensor: T `tf.Tensor` that must be 1D.
     pieces: a positive integer specifying the number of pieces into which
       tensor should be split.
 
   Returns:
-    list of T @{tf.Tensor} of length pieces, which hold the values of
+    list of T `tf.Tensor` of length pieces, which hold the values of
       thin input tensor, in order.  The final tensor may
       be zero-padded on the end to make its size equal to those of all
       of the other tensors.
@@ -132,11 +132,11 @@
   """Strip the suffix padding added by _padded_split.
 
   Args:
-    tensors: list of T @{tf.Tensor} of identical length 1D tensors.
+    tensors: list of T `tf.Tensor` of identical length 1D tensors.
     pad_len: number of elements to be stripped from the end of each tensor.
 
   Returns:
-    list of T @{tf.Tensor} which are the stripped inputs.
+    list of T `tf.Tensor` which are the stripped inputs.
 
   Raises:
     ValueError: tensors must be a non-empty list of 1D tensors, and
@@ -161,12 +161,12 @@
   """Like split for 1D tensors but allows case where len % pieces != 0.
 
   Args:
-    tensor: T @{tf.Tensor} that must be 1D.
+    tensor: T `tf.Tensor` that must be 1D.
     pieces: a positive integer specifying the number of pieces into which
       tensor should be split.
 
   Returns:
-    list of T @{tf.Tensor} of length pieces, which hold the values of
+    list of T `tf.Tensor` of length pieces, which hold the values of
       the input tensor, in order.  The final tensor may be shorter
       than the others, which will all be of equal length.
 
@@ -256,7 +256,7 @@
   """Construct a subgraph performing a ring-style all-reduce of input_tensors.
 
   Args:
-    input_tensors: a list of T @{tf.Tensor} objects, which must all
+    input_tensors: a list of T `tf.Tensor` objects, which must all
       have the same shape and type.
     num_workers: number of worker tasks spanned by input_tensors.
     num_subchunks: number of subchunks each device should process in one tick.
@@ -272,7 +272,7 @@
     size.
 
   Returns:
-    a list of T @{tf.Tensor} identical sum-reductions of input_tensors.
+    a list of T `tf.Tensor` identical sum-reductions of input_tensors.
   """
   if len(input_tensors) < 2:
     raise ValueError("input_tensors must be length 2 or longer")
@@ -299,7 +299,7 @@
   """Construct a subgraph for the first (reduction) pass of ring all-reduce.
 
   Args:
-    input_tensors: a list of T @{tf.Tensor} 1D input tensors of same
+    input_tensors: a list of T `tf.Tensor` 1D input tensors of same
       shape and type.
     devices: array of device name strings
     num_subchunks: number of subchunks each device should process in one tick.
@@ -311,7 +311,7 @@
     ValueError: tensors must all be one dimensional.
 
   Returns:
-    list of list of T @{tf.Tensor} of (partially) reduced values where
+    list of list of T `tf.Tensor` of (partially) reduced values where
     exactly num_subchunks chunks at each device are fully reduced.
   """
   num_devices = len(input_tensors)
@@ -360,11 +360,11 @@
   """Apply a unary op to each tensor in chunks_by_dev, on same device.
 
   Args:
-    f: a unary function over T @{tf.Tensor}.
-    chunks_by_dev: list of lists of T @{tf.Tensor}.
+    f: a unary function over T `tf.Tensor`.
+    chunks_by_dev: list of lists of T `tf.Tensor`.
 
   Returns:
-    new list of lists of T @{tf.Tensor} with the same structure as
+    new list of lists of T `tf.Tensor` with the same structure as
     chunks_by_dev containing the derived tensors.
   """
   output = []
@@ -381,14 +381,14 @@
   Args:
     pred_by_s_d: as produced by _ring_permutations
     rank_by_s_d: as produced by _ring_permutations
-    chunks_by_dev: list of list of T @{tf.Tensor} indexed by ints
+    chunks_by_dev: list of list of T `tf.Tensor` indexed by ints
       (device, chunk)
 
   Raises:
     ValueError: chunks_by_dev is not well-formed
 
   Returns:
-    list of T @{tf.Tensor} which are the fully reduced tensors, one
+    list of T `tf.Tensor` which are the fully reduced tensors, one
     at each device corresponding to the outer dimension of chunks_by_dev.
   """
   num_devices = len(chunks_by_dev)
@@ -448,12 +448,12 @@
     the future with edge-case specific logic.
 
   Args:
-    input_tensors: list of T @{tf.Tensor} to be elementwise reduced.
+    input_tensors: list of T `tf.Tensor` to be elementwise reduced.
     red_op: a binary elementwise reduction Op.
     un_op: an optional unary elementwise Op to apply to reduced values.
 
   Returns:
-    list of T @{tf.Tensor} which are the fully reduced tensors, one
+    list of T `tf.Tensor` which are the fully reduced tensors, one
     at each device of input_tensors.
 
   Raises:
@@ -475,13 +475,13 @@
   """Construct the gather phase of recursive halving-doubling all-reduce.
 
   Args:
-    input_tensors: list of T @{tf.Tensor} to be elementwise reduced.
+    input_tensors: list of T `tf.Tensor` to be elementwise reduced.
     devices: a list of strings naming the devices hosting input_tensors,
       which will also be used to host the (partial) reduction values.
     red_op: a binary elementwise reduction Op.
 
   Returns:
-    list of T @{tf.Tensor} which are the fully reduced tensor shards.
+    list of T `tf.Tensor` which are the fully reduced tensor shards.
 
   Raises:
     ValueError: num_devices not a power of 2, or tensor len not divisible
@@ -516,12 +516,12 @@
   """Construct the scatter phase of recursive halving-doublng all-reduce.
 
   Args:
-    input_tensors: list of T @{tf.Tensor} that are fully-reduced shards.
+    input_tensors: list of T `tf.Tensor` that are fully-reduced shards.
     devices: a list of strings naming the devices on which the reconstituted
       full tensors should be placed.
 
   Returns:
-    list of T @{tf.Tensor} which are the fully reduced tensors.
+    list of T `tf.Tensor` which are the fully reduced tensors.
   """
   num_devices = len(devices)
   num_hops = int(math.log(num_devices, 2))
@@ -571,7 +571,7 @@
     un_op: optional elementwise unary Op to be applied to fully-reduced values.
 
   Returns:
-    list of T @{tf.Tensor} which are the fully reduced tensors.
+    list of T `tf.Tensor` which are the fully reduced tensors.
   """
   input_tensors, shape = _flatten_tensors(input_tensors)
   dst_devices = [t.device for t in input_tensors]
@@ -594,7 +594,7 @@
     un_op: optional elementwise unary Op to be applied to fully-reduced values.
 
   Returns:
-    list of T @{tf.Tensor} which are the fully reduced shards.
+    list of T `tf.Tensor` which are the fully reduced shards.
 
   Raises:
     ValueError: inputs not well-formed.
@@ -629,7 +629,7 @@
       should be reconstituted.
 
   Returns:
-    list of T @{tf.Tensor} scattered tensors.
+    list of T `tf.Tensor` scattered tensors.
   """
   num_devices = len(dst_devices)
   out_tensors = []
@@ -644,7 +644,7 @@
 
   Args:
     devices: list of device name strings
-    values: list of T @{tf.tensor} of same length as devices.
+    values: list of T `tf.Tensor` of same length as devices.
 
   Returns:
     (per_task_devices, per_task_values) where both values are
@@ -680,14 +680,14 @@
   """Build a subgraph that does one full all-reduce, using NCCL.
 
   Args:
-    input_tensors: list of T @{tf.Tensor} of same-shape and type values to
+    input_tensors: list of T `tf.Tensor` of same-shape and type values to
       be reduced.
     red_op: binary elementwise reduction operator.  Must be one of
       {tf.add}
     un_op: optional unary elementwise Op to apply to fully-reduce values.
 
   Returns:
-    list of T @{tf.Tensor} of reduced values.
+    list of T `tf.Tensor` of reduced values.
 
   Raises:
     ValueError: red_op not supported.
@@ -709,14 +709,14 @@
   """Construct a subgraph for NCCL hybrid all-reduce.
 
   Args:
-    input_tensors: list of T @{tf.Tensor} of same-shape and type values to
+    input_tensors: list of T `tf.Tensor` of same-shape and type values to
       be reduced.
     red_op: binary elementwise reduction operator.
     upper_level_f: function for reducing one value per worker, across
       workers.
 
   Returns:
-    list of T @{tf.Tensor} of reduced values.
+    list of T `tf.Tensor` of reduced values.
 
   Raises:
     ValueError: inputs not well-formed.
@@ -797,7 +797,7 @@
   """Construct a subgraph for Shuffle hybrid all-reduce.
 
   Args:
-    input_tensors: list of T @{tf.Tensor} of same-shape and type values to
+    input_tensors: list of T `tf.Tensor` of same-shape and type values to
       be reduced.
     gather_devices: list of device names on which to host gather shards.
     red_op: binary elementwise reduction operator.
@@ -805,7 +805,7 @@
       workers.
 
   Returns:
-    list of T @{tf.Tensor} of reduced values.
+    list of T `tf.Tensor` of reduced values.
 
   Raises:
     ValueError: inputs not well-formed.
diff --git a/tensorflow/contrib/autograph/converters/BUILD b/tensorflow/contrib/autograph/converters/BUILD
index 7cbba71..2d2ab70 100644
--- a/tensorflow/contrib/autograph/converters/BUILD
+++ b/tensorflow/contrib/autograph/converters/BUILD
@@ -204,6 +204,7 @@
     name = "side_effect_guards_test",
     srcs = ["side_effect_guards_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["notsan"],
     deps = [
         ":converters",
         "//tensorflow/contrib/autograph/core:test_lib",
diff --git a/tensorflow/contrib/autograph/docs/pyfunc_dtypes.md b/tensorflow/contrib/autograph/docs/pyfunc_dtypes.md
new file mode 100644
index 0000000..bcbb920
--- /dev/null
+++ b/tensorflow/contrib/autograph/docs/pyfunc_dtypes.md
@@ -0,0 +1,33 @@
+# Specifying return data type for `py_func` calls
+
+The `py_func` op requires specifying a
+[data type](https://www.tensorflow.org/guide/tensors#data_types).
+
+When wrapping a function with `py_func`, for instance using
+`@autograph.do_not_convert(run_as=autograph.RunMode.PY_FUNC)`, you have two
+options to specify the returned data type:
+
+ * explicitly, with a specified `tf.DType` value
+ * by matching the data type of an input argument, which is then assumed to be
+     a `Tensor`
+
+Examples:
+
+Specify an explicit data type:
+
+```
+  def foo(a):
+    return a + 1
+
+  autograph.util.wrap_py_func(foo, return_dtypes=[tf.float32])
+```
+
+Match the data type of the first argument:
+
+```
+  def foo(a):
+    return a + 1
+
+  autograph.util.wrap_py_func(
+      foo, return_dtypes=[autograph.utils.py_func.MatchDType(0)])
+```
diff --git a/tensorflow/contrib/autograph/impl/api.py b/tensorflow/contrib/autograph/impl/api.py
index 4729c73..276a387 100644
--- a/tensorflow/contrib/autograph/impl/api.py
+++ b/tensorflow/contrib/autograph/impl/api.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Public API."""
+"""This module contains the user-facing API for AutoGraph."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -42,34 +42,30 @@
 # (currently we require (module + class name, type))
 
 
-def convert(recursive=False, verbose=False, arg_types=None):
-  """Decorator that compiles a function to graph mode.
+# TODO(mdan): This should behave like to_graph (e.g. convert statically).
+def convert(recursive=False, verbose=False):
+  """Decorator that compiles a function to use TensorFlow ops.
 
-  The decorator is dynamic - invoking compilation whenever the decorated
-  function is called. This means the parameter values are known at compilation.
+  The decorator is dynamic - it recompiles the target whenever the decorated
+  function is called. This means the parameter values are known at conversion.
+  It also means that repeated calls with different types of parameters will be
+  correctly processed.
 
   Args:
-    recursive: Whether to recursively convert any functions that the decorator
-        function may call.
-    verbose: Whether to output the compiled code in the logs.
-    arg_types: See to_graph.
+    recursive: bool, whether to recursively convert any functions or classes
+        that the converted function may use.
+    verbose: bool, whether to output the compiled code in the logs.
 
   Returns:
-    A decorator that compiles the given function to graph mode.
-
-  Raises:
-    ValueError: If any of the arguments are illegal.
+    Callable, a decorator that converts the given function into an equivalent
+    function that uses TensorFlow ops.
   """
-  if arg_types is None:
-    arg_types = {}
-
   def decorator(f):
     """Decorator implementation."""
 
     @wraps(f)
     def wrapper(*args, **kwargs):
-      return converted_call(f, recursive, verbose, True, arg_types, *args,
-                            **kwargs)
+      return converted_call(f, recursive, verbose, True, {}, *args, **kwargs)
 
     wrapper = tf_decorator.make_decorator(f, wrapper)
 
@@ -82,22 +78,34 @@
 
 
 class RunMode(Enum):
+  """Specifies the way a converted function or method should be executed in TF.
+
+  The enum values have the following semantics:
+
+   * GRAPH: Call this function directly, as-is. This is suitable for functions
+       that were already designed for TF graphs and contain ops.
+   * PY_FUNC: Wrap this function into a py_func op. This is suitable for code
+       that will only run correctly in Python, for example code that renders
+       to the display, reads keyboard input, etc.
+  """
   GRAPH = 1
   PY_FUNC = 2
 
 
 def do_not_convert(run_as=RunMode.GRAPH, return_dtypes=None):
-  """Decorator that suppresses compilation of a function.
+  """Decorator that suppresses the conversion of a function.
+
+  See also: docs/pyfunc_dtypes.md
 
   Args:
-    run_as: RunMode value. Whether to run the function as-is, or wrap it into
-        a py_func.
-    return_dtypes: See autograph.utils.py_func.wrap_py_func. Setting to None or
-        empty list or tuple will create a dummy return value that can be used
-        to set control dependencies.
+    run_as: RunMode, specifies how to use the function in TensorFlow.
+    return_dtypes: Optional[Iterable[
+        Union[tf.DType, utils.py_func.MatchDType]]], the return data types of
+        the converted function, if run_as is RunMode.PY_FUNC. Ignored otherwise.
+        May be set to None if the function has no return values.
 
   Returns:
-    A decorator that wraps the original function.
+    Callable, a decorator that wraps the original function.
   """
 
   def decorator(f):
@@ -130,9 +138,10 @@
   return decorator
 
 
+# TODO(mdan): Move to a private, undocumented module.
 def converted_call(f, recursive, verbose, force_conversion, arg_types, *args,
                    **kwargs):
-  """Compiles a function call inline."""
+  """Compiles a function call inline. For internal use only."""
   # TODO(mdan): This needs cleanup.
   # In particular, we may want to avoid renaming functions altogether.
   if not force_conversion and conversion.is_whitelisted_for_graph(f):
@@ -202,39 +211,41 @@
   return converted_f(*effective_args, **kwargs)
 
 
+# TODO(mdan): Rename: to_ops?
+# TODO(mdan): Look into overloading as function and decorator, like tfe.defun.
+# TODO(mdan): Remove partial_types.
 def to_graph(e,
              recursive=True,
              verbose=False,
              arg_values=None,
              arg_types=None,
              partial_types=None):
-  """Compile a Python entity into equivalent TensorFlow code.
+  """Converts a Python entity into equivalent code that uses TensorFlow ops.
 
-  Currently supported entities:
+  Supported Python entities include:
     * functions
     * classes
 
-  Classes are handled by converting all their methods into a new class.
+  Classes are converted by converting all their methods into a new class.
 
   Args:
-    e: A Python entity.
-    recursive: Whether to recursively convert any functions that the decorator
-        function may call.
-    verbose: Whether to output the compiled code in the logs.
-    arg_values: A dict containing value hints for symbols like function
-        parameters.
-    arg_types: A dict containing type hints for symbols like function
-        parameters.
-    partial_types: A set of types (e.g. classes) that will not be converted
-        entirely. Calls to member functions for these types will be renamed
-        independently.
+    e: Union[Callable, Type], the Python entity to convert.
+    recursive: bool, whether to recursively convert any functions that the
+        converted function may call.
+    verbose: bool, whether to output the compiled code in the logs.
+    arg_values: Optional[Dict[Text, Any]], value hints for symbols including
+        function arguments.
+    arg_types: Optional[Dict[Text, Type]], type hints for symbols including
+        function arguments.
+    partial_types: Set[Type], reserved for internal use.
 
   Returns:
-    A function with a signature identical to `o`, but which when executed it
-    creates TF a graph that has the same functionality as the original entity.
+    Union[Callable, Type], the converted entity, which is the same kind as e
+    (that is, a function if e is a function, a class if e is a class, etc.) but
+    its code has been converted to use TF ops.
+
   Raises:
-    ValueError: If the converted function defines or refers to symbol names that
-    are reserved for AutoGraph.
+    ValueError: If the entity could not be converted.
   """
   program_ctx = converter.ProgramContext(
       recursive=recursive,
@@ -288,20 +299,23 @@
             arg_types=None,
             partial_types=None,
             indentation='  '):
-  """Return the equivalent of an entity in TensorFlow code.
+  """Returns the equivalent code that uses TensorFlow ops.
 
-  See `to_graph` for more details.
+  Also see: `to_graph`, `convert`
 
   Args:
-    e: A Python entity.
-    recursive: See to_graph.
-    arg_values: See to_graph.
-    arg_types: See to_graph.
-    partial_types: See to_graph.
-    indentation: String, when to use for each level of indentation.
+    e: Union[Callable, Type], the Python entity to convert.
+    recursive: bool, whether to recursively convert any functions that the
+        converted function may call.
+    arg_values: Optional[Dict[Text, Any]], value hints for symbols including
+        function arguments.
+    arg_types: Optional[Dict[Text, Type]], type hints for symbols including
+        function arguments.
+    partial_types: Set[Type], reserved for internal use.
+    indentation: Text, what to use for each level of indentation.
 
   Returns:
-    String.
+    Text, the converted code.
   """
   program_ctx = converter.ProgramContext(
       recursive=recursive,
diff --git a/tensorflow/contrib/autograph/operators/control_flow.py b/tensorflow/contrib/autograph/operators/control_flow.py
index be38d3f..9909e52 100644
--- a/tensorflow/contrib/autograph/operators/control_flow.py
+++ b/tensorflow/contrib/autograph/operators/control_flow.py
@@ -141,7 +141,7 @@
         while_body,
         init_state=(epoch_number, iterate) + init_state,
         extra_deps=())
-  # Dropping the epoch number and iterate because they are not not syntactically
+  # Dropping the epoch number and iterate because they are not syntactically
   # visible.
   results = results[2:]
 
diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/reaching_definitions.py b/tensorflow/contrib/autograph/pyct/static_analysis/reaching_definitions.py
index 9a84f12..7f2b379 100644
--- a/tensorflow/contrib/autograph/pyct/static_analysis/reaching_definitions.py
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/reaching_definitions.py
@@ -39,7 +39,7 @@
 class Definition(object):
   """Definition objects describe a unique definition of a variable.
 
-  Subclasses of this may be used by passing an appropriate factory fuction to
+  Subclasses of this may be used by passing an appropriate factory function to
   resolve.
 
   Attributes:
diff --git a/tensorflow/contrib/autograph/pyct/testing/BUILD b/tensorflow/contrib/autograph/pyct/testing/BUILD
index 957db35..9ef1ac9 100644
--- a/tensorflow/contrib/autograph/pyct/testing/BUILD
+++ b/tensorflow/contrib/autograph/pyct/testing/BUILD
@@ -33,7 +33,10 @@
     size = "large",
     srcs = ["codegen_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],
+    tags = [
+        "no_windows",
+        "nomsan",
+    ],
     deps = [
         ":testing",
         "//tensorflow/contrib/autograph/pyct",
diff --git a/tensorflow/contrib/autograph/utils/builtins.py b/tensorflow/contrib/autograph/utils/builtins.py
index ccbe5fc..4dd440e 100644
--- a/tensorflow/contrib/autograph/utils/builtins.py
+++ b/tensorflow/contrib/autograph/utils/builtins.py
@@ -44,6 +44,8 @@
     return dynamic_int(*args, **kwargs)
   if f is float:
     return dynamic_float(*args, **kwargs)
+  if f is abs:
+    return dynamic_abs(*args, **kwargs)
 
   raise NotImplementedError(
       'The "%s" builtin is not yet supported.' % f.__name__)
@@ -81,6 +83,13 @@
   return float(num_or_tensor)
 
 
+def dynamic_abs(num_or_tensor, **kwargs):
+  if tensor_util.is_tensor(num_or_tensor):
+    return math_ops.abs(num_or_tensor, **kwargs)
+  else:
+    return abs(num_or_tensor, **kwargs)
+
+
 def dynamic_range(start_or_stop, stop=None, step=None):
   """Implementation of range using dynamic dispatch."""
   if type_check.is_tensor(start_or_stop, stop, step):
diff --git a/tensorflow/contrib/autograph/utils/builtins_test.py b/tensorflow/contrib/autograph/utils/builtins_test.py
index b4821f3..b1cd525 100644
--- a/tensorflow/contrib/autograph/utils/builtins_test.py
+++ b/tensorflow/contrib/autograph/utils/builtins_test.py
@@ -44,6 +44,23 @@
     with self.test_session() as sess:
       self.assertEqual(3, sess.run(builtins.dynamic_builtin(len, a)))
 
+  def test_dynamic_abs_tf_scalar(self):
+    a = constant_op.constant(-1)
+
+    with self.test_session() as sess:
+      self.assertEqual(1, sess.run(builtins.dynamic_builtin(abs, a)))
+
+  def test_dynamic_abs_tf_array(self):
+    a = constant_op.constant([-1, 2, -3])
+
+    with self.test_session() as sess:
+      self.assertListEqual([1, 2, 3],
+                           list(sess.run(builtins.dynamic_builtin(abs, a))))
+
+  def test_dynamic_abs_py_scalar(self):
+    a = -1
+    self.assertEqual(1, builtins.dynamic_builtin(abs, a))
+
   def test_dynamic_len_tf_matrix(self):
     a = constant_op.constant([[1, 2], [3, 4]])
 
diff --git a/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py b/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py
index 68ead2f..9afe3df 100644
--- a/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py
+++ b/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py
@@ -14,8 +14,6 @@
 # ==============================================================================
 """Monte Carlo integration and helpers.
 
-See the @{$python/contrib.bayesflow.monte_carlo} guide.
-
 @@expectation
 @@expectation_importance_sampler
 @@expectation_importance_sampler_logspace
diff --git a/tensorflow/contrib/bigtable/README.md b/tensorflow/contrib/bigtable/README.md
index 88a3909..b9abfa8 100644
--- a/tensorflow/contrib/bigtable/README.md
+++ b/tensorflow/contrib/bigtable/README.md
@@ -1,4 +1,4 @@
-# Bigtable #
+# Google Cloud Bigtable
 
 [Cloud Bigtable](https://cloud.google.com/bigtable/) is a high
 performance storage system that can store and serve training data. This contrib
@@ -13,7 +13,7 @@
 general-purpose Cloud Bigtable
 APIs, see the [official Cloud Bigtable client library documentation][clientdoc].
 
-[clientdoc]:  https://cloud.google.com/bigtable/docs/reference/libraries
+[clientdoc]: https://cloud.google.com/bigtable/docs/reference/libraries
 
 ## Sample Use
 
diff --git a/tensorflow/contrib/bigtable/kernels/bigtable_kernels.cc b/tensorflow/contrib/bigtable/kernels/bigtable_kernels.cc
index a6755a3..a25a641 100644
--- a/tensorflow/contrib/bigtable/kernels/bigtable_kernels.cc
+++ b/tensorflow/contrib/bigtable/kernels/bigtable_kernels.cc
@@ -84,6 +84,8 @@
                 channel_args.SetMaxReceiveMessageSize(
                     max_receive_message_size_);
                 channel_args.SetUserAgentPrefix("tensorflow");
+                channel_args.SetInt(GRPC_ARG_KEEPALIVE_PERMIT_WITHOUT_CALLS, 0);
+                channel_args.SetInt(GRPC_ARG_KEEPALIVE_TIMEOUT_MS, 60 * 1000);
                 client_options.set_channel_arguments(channel_args);
                 std::shared_ptr<google::cloud::bigtable::DataClient> client =
                     google::cloud::bigtable::CreateDefaultDataClient(
@@ -216,11 +218,11 @@
       OP_REQUIRES_OK_ASYNC(
           ctx, GetDatasetFromVariantTensor(ctx->input(1), &dataset), done);
 
-      IteratorContext iter_ctx = dataset::MakeIteratorContext(ctx);
       std::unique_ptr<IteratorBase> iterator;
       OP_REQUIRES_OK_ASYNC(
           ctx,
-          dataset->MakeIterator(&iter_ctx, "ToBigtableOpIterator", &iterator),
+          dataset->MakeIterator(IteratorContext(ctx), "ToBigtableOpIterator",
+                                &iterator),
           done);
 
       int64 timestamp_int;
@@ -243,9 +245,10 @@
         ::google::cloud::bigtable::BulkMutation mutation;
         // TODO(saeta): Make # of mutations configurable.
         for (uint64 i = 0; i < 100 && !end_of_sequence; ++i) {
-          OP_REQUIRES_OK_ASYNC(
-              ctx, iterator->GetNext(&iter_ctx, &components, &end_of_sequence),
-              done);
+          OP_REQUIRES_OK_ASYNC(ctx,
+                               iterator->GetNext(IteratorContext(ctx),
+                                                 &components, &end_of_sequence),
+                               done);
           if (!end_of_sequence) {
             OP_REQUIRES_OK_ASYNC(
                 ctx,
diff --git a/tensorflow/contrib/bigtable/kernels/bigtable_lookup_dataset_op.cc b/tensorflow/contrib/bigtable/kernels/bigtable_lookup_dataset_op.cc
index 9e49fa3..bd32672 100644
--- a/tensorflow/contrib/bigtable/kernels/bigtable_lookup_dataset_op.cc
+++ b/tensorflow/contrib/bigtable/kernels/bigtable_lookup_dataset_op.cc
@@ -53,7 +53,7 @@
   }
 
  private:
-  class Dataset : public GraphDatasetBase {
+  class Dataset : public DatasetBase {
    public:
     explicit Dataset(OpKernelContext* ctx, const DatasetBase* input,
                      BigtableTableResource* table,
@@ -61,7 +61,7 @@
                      std::vector<string> columns,
                      const DataTypeVector& output_types,
                      std::vector<PartialTensorShape> output_shapes)
-        : GraphDatasetBase(ctx),
+        : DatasetBase(DatasetContext(ctx)),
           input_(input),
           table_(table),
           column_families_(std::move(column_families)),
@@ -80,8 +80,8 @@
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(new Iterator(
-          {this, strings::StrCat(prefix, "::BigtableLookupDataset")}));
+      return std::unique_ptr<IteratorBase>(
+          new Iterator({this, strings::StrCat(prefix, "::BigtableLookup")}));
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -96,6 +96,14 @@
       return "BigtableLookupDatasetOp::Dataset";
     }
 
+   protected:
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      return errors::Unimplemented("%s does not support serialization",
+                                   DebugString());
+    }
+
    private:
     static ::google::cloud::bigtable::Filter MakeFilter(
         const std::vector<string>& column_families,
diff --git a/tensorflow/contrib/bigtable/kernels/bigtable_prefix_key_dataset_op.cc b/tensorflow/contrib/bigtable/kernels/bigtable_prefix_key_dataset_op.cc
index e960719..a803fdc 100644
--- a/tensorflow/contrib/bigtable/kernels/bigtable_prefix_key_dataset_op.cc
+++ b/tensorflow/contrib/bigtable/kernels/bigtable_prefix_key_dataset_op.cc
@@ -35,11 +35,13 @@
   }
 
  private:
-  class Dataset : public GraphDatasetBase {
+  class Dataset : public DatasetBase {
    public:
     explicit Dataset(OpKernelContext* ctx, BigtableTableResource* table,
                      string prefix)
-        : GraphDatasetBase(ctx), table_(table), prefix_(std::move(prefix)) {
+        : DatasetBase(DatasetContext(ctx)),
+          table_(table),
+          prefix_(std::move(prefix)) {
       table_->Ref();
     }
 
@@ -47,8 +49,8 @@
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(new Iterator(
-          {this, strings::StrCat(prefix, "::BigtablePrefixKeyDataset")}));
+      return std::unique_ptr<IteratorBase>(
+          new Iterator({this, strings::StrCat(prefix, "::BigtablePrefixKey")}));
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -68,6 +70,14 @@
 
     BigtableTableResource* table() const { return table_; }
 
+   protected:
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      return errors::Unimplemented("%s does not support serialization",
+                                   DebugString());
+    }
+
    private:
     class Iterator : public BigtableReaderDatasetIterator<Dataset> {
      public:
diff --git a/tensorflow/contrib/bigtable/kernels/bigtable_range_key_dataset_op.cc b/tensorflow/contrib/bigtable/kernels/bigtable_range_key_dataset_op.cc
index 96d3565..5cd0371 100644
--- a/tensorflow/contrib/bigtable/kernels/bigtable_range_key_dataset_op.cc
+++ b/tensorflow/contrib/bigtable/kernels/bigtable_range_key_dataset_op.cc
@@ -39,11 +39,11 @@
   }
 
  private:
-  class Dataset : public GraphDatasetBase {
+  class Dataset : public DatasetBase {
    public:
     explicit Dataset(OpKernelContext* ctx, BigtableTableResource* table,
                      string start_key, string end_key)
-        : GraphDatasetBase(ctx),
+        : DatasetBase(DatasetContext(ctx)),
           table_(table),
           start_key_(std::move(start_key)),
           end_key_(std::move(end_key)) {
@@ -54,8 +54,8 @@
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(new Iterator(
-          {this, strings::StrCat(prefix, "::BigtableRangeKeyDataset")}));
+      return std::unique_ptr<IteratorBase>(
+          new Iterator({this, strings::StrCat(prefix, "::BigtableRangeKey")}));
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -75,6 +75,14 @@
 
     BigtableTableResource* table() const { return table_; }
 
+   protected:
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      return errors::Unimplemented("%s does not support serialization",
+                                   DebugString());
+    }
+
    private:
     class Iterator : public BigtableReaderDatasetIterator<Dataset> {
      public:
diff --git a/tensorflow/contrib/bigtable/kernels/bigtable_sample_key_pairs_dataset_op.cc b/tensorflow/contrib/bigtable/kernels/bigtable_sample_key_pairs_dataset_op.cc
index a1a63a9..6928d94 100644
--- a/tensorflow/contrib/bigtable/kernels/bigtable_sample_key_pairs_dataset_op.cc
+++ b/tensorflow/contrib/bigtable/kernels/bigtable_sample_key_pairs_dataset_op.cc
@@ -52,11 +52,11 @@
   }
 
  private:
-  class Dataset : public GraphDatasetBase {
+  class Dataset : public DatasetBase {
    public:
     explicit Dataset(OpKernelContext* ctx, BigtableTableResource* table,
                      string prefix, string start_key, string end_key)
-        : GraphDatasetBase(ctx),
+        : DatasetBase(DatasetContext(ctx)),
           table_(table),
           key_range_(MakeMultiModeKeyRange(
               std::move(prefix), std::move(start_key), std::move(end_key))) {
@@ -68,7 +68,7 @@
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       return std::unique_ptr<IteratorBase>(new Iterator(
-          {this, strings::StrCat(prefix, "::BigtableSampleKeyPairsDataset")}));
+          {this, strings::StrCat(prefix, "::BigtableSampleKeyPairs")}));
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -87,6 +87,14 @@
       return "BigtableSampleKeyPairsDatasetOp::Dataset";
     }
 
+   protected:
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      return errors::Unimplemented("%s does not support serialization",
+                                   DebugString());
+    }
+
    private:
     static MultiModeKeyRange MakeMultiModeKeyRange(string prefix,
                                                    string start_key,
diff --git a/tensorflow/contrib/bigtable/kernels/bigtable_sample_keys_dataset_op.cc b/tensorflow/contrib/bigtable/kernels/bigtable_sample_keys_dataset_op.cc
index a5a47cf..a759fb5 100644
--- a/tensorflow/contrib/bigtable/kernels/bigtable_sample_keys_dataset_op.cc
+++ b/tensorflow/contrib/bigtable/kernels/bigtable_sample_keys_dataset_op.cc
@@ -31,10 +31,10 @@
   }
 
  private:
-  class Dataset : public GraphDatasetBase {
+  class Dataset : public DatasetBase {
    public:
     explicit Dataset(OpKernelContext* ctx, BigtableTableResource* table)
-        : GraphDatasetBase(ctx), table_(table) {
+        : DatasetBase(DatasetContext(ctx)), table_(table) {
       table_->Ref();
     }
 
@@ -43,7 +43,7 @@
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       return std::unique_ptr<IteratorBase>(new Iterator(
-          {this, strings::StrCat(prefix, "::BigtableSampleKeysDataset")}));
+          {this, strings::StrCat(prefix, "::BigtableSampleKeys")}));
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -63,6 +63,14 @@
 
     BigtableTableResource* table() const { return table_; }
 
+   protected:
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      return errors::Unimplemented("%s does not support serialization",
+                                   DebugString());
+    }
+
    private:
     class Iterator : public DatasetIterator<Dataset> {
      public:
diff --git a/tensorflow/contrib/bigtable/kernels/bigtable_scan_dataset_op.cc b/tensorflow/contrib/bigtable/kernels/bigtable_scan_dataset_op.cc
index 13cb868..78a920b 100644
--- a/tensorflow/contrib/bigtable/kernels/bigtable_scan_dataset_op.cc
+++ b/tensorflow/contrib/bigtable/kernels/bigtable_scan_dataset_op.cc
@@ -84,7 +84,7 @@
   }
 
  private:
-  class Dataset : public GraphDatasetBase {
+  class Dataset : public DatasetBase {
    public:
     explicit Dataset(OpKernelContext* ctx, BigtableTableResource* table,
                      string prefix, string start_key, string end_key,
@@ -92,7 +92,7 @@
                      std::vector<string> columns, float probability,
                      const DataTypeVector& output_types,
                      std::vector<PartialTensorShape> output_shapes)
-        : GraphDatasetBase(ctx),
+        : DatasetBase(DatasetContext(ctx)),
           table_(table),
           prefix_(std::move(prefix)),
           start_key_(std::move(start_key)),
@@ -111,8 +111,8 @@
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(new Iterator(
-          {this, strings::StrCat(prefix, "::BigtableScanDataset")}));
+      return std::unique_ptr<IteratorBase>(
+          new Iterator({this, strings::StrCat(prefix, "::BigtableScan")}));
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -129,6 +129,14 @@
 
     BigtableTableResource* table() const { return table_; }
 
+   protected:
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      return errors::Unimplemented("%s does not support serialization",
+                                   DebugString());
+    }
+
    private:
     class Iterator : public BigtableReaderDatasetIterator<Dataset> {
      public:
diff --git a/tensorflow/contrib/bigtable/python/ops/bigtable_api.py b/tensorflow/contrib/bigtable/python/ops/bigtable_api.py
index e6ef513..3e1b622 100644
--- a/tensorflow/contrib/bigtable/python/ops/bigtable_api.py
+++ b/tensorflow/contrib/bigtable/python/ops/bigtable_api.py
@@ -17,8 +17,8 @@
 TensorFlow has support for reading from and writing to Cloud Bigtable. To use
 TensorFlow + Cloud Bigtable integration, first create a BigtableClient to
 configure your connection to Cloud Bigtable, and then create a BigtableTable
-object to allow you to create numerous @{tf.data.Dataset}s to read data, or
-write a @{tf.data.Dataset} object to the underlying Cloud Bigtable table.
+object to allow you to create numerous `tf.data.Dataset`s to read data, or
+write a `tf.data.Dataset` object to the underlying Cloud Bigtable table.
 
 For background on Cloud Bigtable, see: https://cloud.google.com/bigtable .
 """
@@ -203,7 +203,7 @@
         be retrieved. If end is None, all subsequent row keys will be retrieved.
 
     Returns:
-      A @{tf.data.Dataset} containing `tf.string` Tensors corresponding to all
+      A `tf.data.Dataset` containing `tf.string` Tensors corresponding to all
       of the row keys between `start` and `end`.
     """
     # TODO(saeta): Make inclusive / exclusive configurable?
@@ -219,7 +219,7 @@
         retrieved.
 
     Returns:
-      A @{tf.data.Dataset}. containing `tf.string` Tensors corresponding to all
+      A `tf.data.Dataset` containing `tf.string` Tensors corresponding to all
       of the row keys matching that prefix.
     """
     return _BigtablePrefixKeyDataset(self, prefix)
@@ -228,11 +228,11 @@
     """Retrieves a sampling of row keys from the Bigtable table.
 
     This dataset is most often used in conjunction with
-    @{tf.contrib.data.parallel_interleave} to construct a set of ranges for
+    `tf.contrib.data.parallel_interleave` to construct a set of ranges for
     scanning in parallel.
 
     Returns:
-      A @{tf.data.Dataset} returning string row keys.
+      A `tf.data.Dataset` returning string row keys.
     """
     return _BigtableSampleKeysDataset(self)
 
@@ -272,7 +272,7 @@
         that are treated as the column qualifier (column name).
 
     Returns:
-      A @{tf.data.Dataset} returning the row keys and the cell contents.
+      A `tf.data.Dataset` returning the row keys and the cell contents.
 
     Raises:
       ValueError: If the configured probability is unexpected.
@@ -317,7 +317,7 @@
         that are treated as the column qualifier (column name).
 
     Returns:
-      A @{tf.data.Dataset} returning the row keys and the cell contents.
+      A `tf.data.Dataset` returning the row keys and the cell contents.
 
     Raises:
       ValueError: If the configured probability is unexpected.
@@ -335,7 +335,7 @@
     """Retrieves row (including values) from the Bigtable service at high speed.
 
     Rows with row-key prefixed by `prefix` will be retrieved. This method is
-    similar to `scan_prefix`, but by constrast performs multiple sub-scans in
+    similar to `scan_prefix`, but by contrast performs multiple sub-scans in
     parallel in order to achieve higher performance.
 
     Note: The dataset produced by this method is not deterministic!
@@ -373,7 +373,7 @@
         that are treated as the column qualifier (column name).
 
     Returns:
-      A @{tf.data.Dataset} returning the row keys and the cell contents.
+      A `tf.data.Dataset` returning the row keys and the cell contents.
 
     Raises:
       ValueError: If the configured probability is unexpected.
@@ -394,7 +394,7 @@
     """Retrieves rows (including values) from the Bigtable service.
 
     Rows with row-keys between `start` and `end` will be retrieved. This method
-    is similar to `scan_range`, but by constrast performs multiple sub-scans in
+    is similar to `scan_range`, but by contrast performs multiple sub-scans in
     parallel in order to achieve higher performance.
 
     Note: The dataset produced by this method is not deterministic!
@@ -435,7 +435,7 @@
         that are treated as the column qualifier (column name).
 
     Returns:
-      A @{tf.data.Dataset} returning the row keys and the cell contents.
+      A `tf.data.Dataset` returning the row keys and the cell contents.
 
     Raises:
       ValueError: If the configured probability is unexpected.
@@ -450,12 +450,12 @@
     """Writes a dataset to the table.
 
     Args:
-      dataset: A @{tf.data.Dataset} to be written to this table. It must produce
+      dataset: A `tf.data.Dataset` to be written to this table. It must produce
         a list of number-of-columns+1 elements, all of which must be strings.
         The first value will be used as the row key, and subsequent values will
         be used as cell values for the corresponding columns from the
         corresponding column_families and columns entries.
-      column_families: A @{tf.Tensor} of `tf.string`s corresponding to the
+      column_families: A `tf.Tensor` of `tf.string`s corresponding to the
         column names to store the dataset's elements into.
       columns: A `tf.Tensor` of `tf.string`s corresponding to the column names
         to store the dataset's elements into.
@@ -463,7 +463,7 @@
         Leave as None to use server-provided timestamps.
 
     Returns:
-      A @{tf.Operation} that can be run to perform the write.
+      A `tf.Operation` that can be run to perform the write.
 
     Raises:
       ValueError: If there are unexpected or incompatible types, or if the
@@ -502,7 +502,7 @@
       normalized_columns: The column families and column qualifiers to retrieve.
 
     Returns:
-      A @{tf.data.Dataset} representing the result of the parallel scan.
+      A `tf.data.Dataset` representing the result of the parallel scan.
     """
     if num_parallel_scans is None:
       num_parallel_scans = 50
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py b/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py
index 68d710d..c155128 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py
@@ -16,7 +16,10 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
+
 import tempfile
+import numpy as np
+
 from tensorflow.contrib.boosted_trees.estimator_batch import estimator
 from tensorflow.contrib.boosted_trees.proto import learner_pb2
 from tensorflow.contrib.layers.python.layers import feature_column as contrib_feature_column
@@ -26,6 +29,7 @@
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops.losses import losses
 from tensorflow.python.platform import gfile
@@ -473,6 +477,63 @@
     classifier.evaluate(input_fn=_multiclass_train_input_fn, steps=1)
     classifier.predict(input_fn=_eval_input_fn)
 
+  def testWeightedCategoricalColumn(self):
+    head_fn = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+        loss_reduction=losses.Reduction.SUM_OVER_NONZERO_WEIGHTS)
+
+    learner_config = learner_pb2.LearnerConfig()
+    learner_config.num_classes = 2
+    learner_config.constraints.max_tree_depth = 1
+    model_dir = tempfile.mkdtemp()
+    config = run_config.RunConfig()
+
+    feature_columns = [
+        core_feature_column.weighted_categorical_column(
+            categorical_column=core_feature_column.
+            categorical_column_with_vocabulary_list(
+                key="word", vocabulary_list=["the", "cat", "dog"]),
+            weight_feature_key="weight")
+    ]
+
+    labels = np.array([[1], [1], [0], [0.]], dtype=np.float32)
+
+    def _make_input_fn():
+
+      def _input_fn():
+        features_dict = {}
+        # Sparse tensor representing
+        # example 0: "cat","the"
+        # example 1: "dog"
+        # example 2: -
+        # example 3: "the"
+        # Weights for the words are: "cat" - 5, "dog" - 6 and "the" - 1.
+        features_dict["word"] = sparse_tensor.SparseTensor(
+            indices=[[0, 0], [0, 1], [1, 0], [3, 0]],
+            values=constant_op.constant(
+                ["the", "cat", "dog", "the"], dtype=dtypes.string),
+            dense_shape=[4, 3])
+        features_dict["weight"] = sparse_tensor.SparseTensor(
+            indices=[[0, 0], [0, 1], [1, 0], [3, 0]],
+            values=[1., 5., 6., 1.],
+            dense_shape=[4, 3])
+        return features_dict, labels
+
+      return _input_fn
+
+    est = estimator.CoreGradientBoostedDecisionTreeEstimator(
+        head=head_fn,
+        learner_config=learner_config,
+        num_trees=1,
+        examples_per_layer=3,
+        model_dir=model_dir,
+        config=config,
+        feature_columns=feature_columns)
+
+    input_fn = _make_input_fn()
+    est.train(input_fn=input_fn, steps=100)
+    est.evaluate(input_fn=input_fn, steps=1)
+    est.predict(input_fn=input_fn)
+
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc b/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc
index 5b4be2f..1375fdd 100644
--- a/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc
+++ b/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc
@@ -125,6 +125,8 @@
     auto flat_values = values_tensor.flat<float>();
     for (int64 instance = 0; instance < num_values; ++instance) {
       const float value = flat_values(instance);
+      CHECK(!buckets_vector.empty())
+          << "Got empty buckets for feature " << feature_index;
       auto bucket_iter =
           std::lower_bound(buckets_vector.begin(), buckets_vector.end(), value);
       if (bucket_iter == buckets_vector.end()) {
diff --git a/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc b/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc
index 401bec8..d9e7a0f 100644
--- a/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc
+++ b/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc
@@ -34,7 +34,9 @@
 
 namespace tensorflow {
 
+using boosted_trees::learner::LearnerConfig;
 using boosted_trees::learner::LearnerConfig_MultiClassStrategy;
+using boosted_trees::learner::ObliviousSplitInfo;
 using boosted_trees::learner::SplitInfo;
 using boosted_trees::learner::stochastic::GradientStats;
 using boosted_trees::learner::stochastic::NodeStats;
@@ -158,6 +160,11 @@
     const Tensor* hessians_t;
     OP_REQUIRES_OK(context, context->input("hessians", &hessians_t));
 
+    const Tensor* weak_learner_type_t;
+    OP_REQUIRES_OK(context,
+                   context->input("weak_learner_type", &weak_learner_type_t));
+    const int32 weak_learner_type = weak_learner_type_t->scalar<int32>()();
+
     // Find the number of unique partitions before we allocate the output.
     std::vector<int32> partition_boundaries;
     partition_boundaries.push_back(0);
@@ -188,20 +195,59 @@
     tensorflow::TTypes<int32>::Vec output_partition_ids =
         output_partition_ids_t->vec<int32>();
 
-    Tensor* gains_t = nullptr;
-    OP_REQUIRES_OK(
-        context, context->allocate_output("gains", TensorShape({num_elements}),
-                                          &gains_t));
+    // For a normal tree, we output a split per partition. For an oblivious
+    // tree, we output one split for all partitions of the layer
+    int32 size_output = num_elements;
+    if (weak_learner_type == LearnerConfig::OBLIVIOUS_DECISION_TREE &&
+        num_elements > 0) {
+      size_output = 1;
+    }
 
+    Tensor* gains_t = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(
+                                "gains", TensorShape({size_output}), &gains_t));
     tensorflow::TTypes<float>::Vec gains = gains_t->vec<float>();
 
     Tensor* output_splits_t = nullptr;
-    OP_REQUIRES_OK(context, context->allocate_output(
-                                "split_infos", TensorShape({num_elements}),
-                                &output_splits_t));
+    OP_REQUIRES_OK(context, context->allocate_output("split_infos",
+                                                     TensorShape({size_output}),
+                                                     &output_splits_t));
     tensorflow::TTypes<string>::Vec output_splits =
         output_splits_t->vec<string>();
+
+    if (num_elements == 0) {
+      return;
+    }
     SplitBuilderState state(context);
+    switch (weak_learner_type) {
+      case LearnerConfig::NORMAL_DECISION_TREE: {
+        ComputeNormalDecisionTree(
+            &state, normalizer_ratio, num_elements, partition_boundaries,
+            bucket_boundaries, partition_ids, bucket_ids, gradients_t,
+            hessians_t, &output_partition_ids, &gains, &output_splits);
+        break;
+      }
+      case LearnerConfig::OBLIVIOUS_DECISION_TREE: {
+        ComputeObliviousDecisionTree(
+            &state, normalizer_ratio, num_elements, partition_boundaries,
+            bucket_boundaries, partition_ids, bucket_ids, gradients_t,
+            hessians_t, &output_partition_ids, &gains, &output_splits);
+        break;
+      }
+    }
+  }
+
+ private:
+  void ComputeNormalDecisionTree(
+      SplitBuilderState* state, const float normalizer_ratio,
+      const int num_elements, const std::vector<int32>& partition_boundaries,
+      const tensorflow::TTypes<float>::ConstVec& bucket_boundaries,
+      const tensorflow::TTypes<int32>::ConstVec& partition_ids,
+      const tensorflow::TTypes<int64>::ConstMatrix& bucket_ids,
+      const Tensor* gradients_t, const Tensor* hessians_t,
+      tensorflow::TTypes<int32>::Vec* output_partition_ids,
+      tensorflow::TTypes<float>::Vec* gains,
+      tensorflow::TTypes<string>::Vec* output_splits) {
     for (int root_idx = 0; root_idx < num_elements; ++root_idx) {
       float best_gain = std::numeric_limits<float>::lowest();
       int start_index = partition_boundaries[root_idx];
@@ -213,7 +259,7 @@
             GradientStats(*gradients_t, *hessians_t, bucket_idx);
       }
       root_gradient_stats *= normalizer_ratio;
-      NodeStats root_stats = state.ComputeNodeStats(root_gradient_stats);
+      NodeStats root_stats = state->ComputeNodeStats(root_gradient_stats);
       int32 best_bucket_idx = 0;
       NodeStats best_right_node_stats(0);
       NodeStats best_left_node_stats(0);
@@ -223,10 +269,10 @@
         GradientStats g(*gradients_t, *hessians_t, bucket_idx);
         g *= normalizer_ratio;
         left_gradient_stats += g;
-        NodeStats left_stats = state.ComputeNodeStats(left_gradient_stats);
+        NodeStats left_stats = state->ComputeNodeStats(left_gradient_stats);
         GradientStats right_gradient_stats =
             root_gradient_stats - left_gradient_stats;
-        NodeStats right_stats = state.ComputeNodeStats(right_gradient_stats);
+        NodeStats right_stats = state->ComputeNodeStats(right_gradient_stats);
         if (left_stats.gain + right_stats.gain > best_gain) {
           best_gain = left_stats.gain + right_stats.gain;
           best_left_node_stats = left_stats;
@@ -237,21 +283,125 @@
       SplitInfo split_info;
       auto* dense_split =
           split_info.mutable_split_node()->mutable_dense_float_binary_split();
-      dense_split->set_feature_column(state.feature_column_group_id());
+      dense_split->set_feature_column(state->feature_column_group_id());
       dense_split->set_threshold(
           bucket_boundaries(bucket_ids(best_bucket_idx, 0)));
 
       auto* left_child = split_info.mutable_left_child();
       auto* right_child = split_info.mutable_right_child();
 
-      state.FillLeaf(best_left_node_stats, left_child);
-      state.FillLeaf(best_right_node_stats, right_child);
-      split_info.SerializeToString(&output_splits(root_idx));
-      gains(root_idx) =
-          best_gain - root_stats.gain - state.tree_complexity_regularization();
-      output_partition_ids(root_idx) = partition_ids(start_index);
+      state->FillLeaf(best_left_node_stats, left_child);
+      state->FillLeaf(best_right_node_stats, right_child);
+      split_info.SerializeToString(&(*output_splits)(root_idx));
+      (*gains)(root_idx) =
+          best_gain - root_stats.gain - state->tree_complexity_regularization();
+      (*output_partition_ids)(root_idx) = partition_ids(start_index);
     }
   }
+  void ComputeObliviousDecisionTree(
+      SplitBuilderState* state, const float normalizer_ratio,
+      const int num_elements, const std::vector<int32>& partition_boundaries,
+      const tensorflow::TTypes<float>::ConstVec& bucket_boundaries,
+      const tensorflow::TTypes<int32>::ConstVec& partition_ids,
+      const tensorflow::TTypes<int64>::ConstMatrix& bucket_ids,
+      const Tensor* gradients_t, const Tensor* hessians_t,
+      tensorflow::TTypes<int32>::Vec* output_partition_ids,
+      tensorflow::TTypes<float>::Vec* gains,
+      tensorflow::TTypes<string>::Vec* output_splits) {
+    // Holds the root stats per each node to be split.
+    std::vector<GradientStats> current_layer_stats;
+    current_layer_stats.reserve(num_elements);
+    for (int root_idx = 0; root_idx < num_elements; root_idx++) {
+      const int start_index = partition_boundaries[root_idx];
+      const int end_index = partition_boundaries[root_idx + 1];
+      GradientStats root_gradient_stats;
+      for (int64 bucket_idx = start_index; bucket_idx < end_index;
+           ++bucket_idx) {
+        root_gradient_stats +=
+            GradientStats(*gradients_t, *hessians_t, bucket_idx);
+      }
+      root_gradient_stats *= normalizer_ratio;
+      current_layer_stats.push_back(root_gradient_stats);
+    }
+
+    float best_gain = std::numeric_limits<float>::lowest();
+    int64 best_bucket_idx = 0;
+    std::vector<NodeStats> best_right_node_stats(num_elements, NodeStats(0));
+    std::vector<NodeStats> best_left_node_stats(num_elements, NodeStats(0));
+    std::vector<NodeStats> current_left_node_stats(num_elements, NodeStats(0));
+    std::vector<NodeStats> current_right_node_stats(num_elements, NodeStats(0));
+    int64 current_bucket_id = 0;
+    int64 last_bucket_id = -1;
+    // Indexes offsets for each of the partitions that can be used to access
+    // gradients of a partition for a current bucket we consider.
+    std::vector<int> current_layer_offsets(num_elements, 0);
+    std::vector<GradientStats> left_gradient_stats(num_elements);
+    // The idea is to try every bucket id in increasing order. In each iteration
+    // we calculate the gain of the layer using the current bucket id as split
+    // value, and we also obtain the following bucket id to try.
+    while (current_bucket_id > last_bucket_id) {
+      last_bucket_id = current_bucket_id;
+      int64 next_bucket_id = -1;
+      for (int root_idx = 0; root_idx < num_elements; root_idx++) {
+        int idx =
+            current_layer_offsets[root_idx] + partition_boundaries[root_idx];
+        const int end_index = partition_boundaries[root_idx + 1];
+        if (idx < end_index && bucket_ids(idx, 0) == current_bucket_id) {
+          GradientStats g(*gradients_t, *hessians_t, idx);
+          g *= normalizer_ratio;
+          left_gradient_stats[root_idx] += g;
+          current_layer_offsets[root_idx]++;
+          idx++;
+        }
+        if (idx < end_index &&
+            (bucket_ids(idx, 0) < next_bucket_id || next_bucket_id == -1)) {
+          next_bucket_id = bucket_ids(idx, 0);
+        }
+      }
+      float gain_of_split = 0.0;
+      for (int root_idx = 0; root_idx < num_elements; root_idx++) {
+        GradientStats right_gradient_stats =
+            current_layer_stats[root_idx] - left_gradient_stats[root_idx];
+        NodeStats left_stat =
+            state->ComputeNodeStats(left_gradient_stats[root_idx]);
+        NodeStats right_stat = state->ComputeNodeStats(right_gradient_stats);
+        gain_of_split += left_stat.gain + right_stat.gain;
+        current_left_node_stats[root_idx] = left_stat;
+        current_right_node_stats[root_idx] = right_stat;
+      }
+      if (gain_of_split > best_gain) {
+        best_gain = gain_of_split;
+        best_left_node_stats = current_left_node_stats;
+        best_right_node_stats = current_right_node_stats;
+      }
+      current_bucket_id = next_bucket_id;
+    }
+
+    for (int root_idx = 0; root_idx < num_elements; root_idx++) {
+      best_gain -= state->ComputeNodeStats(current_layer_stats[root_idx]).gain;
+    }
+    best_gain -= num_elements * state->tree_complexity_regularization();
+
+    ObliviousSplitInfo oblivious_split_info;
+    auto* oblivious_dense_split = oblivious_split_info.mutable_split_node()
+                                      ->mutable_dense_float_binary_split();
+    oblivious_dense_split->set_feature_column(state->feature_column_group_id());
+    oblivious_dense_split->set_threshold(
+        bucket_boundaries(bucket_ids(best_bucket_idx, 0)));
+    (*gains)(0) = best_gain;
+
+    for (int root_idx = 0; root_idx < num_elements; root_idx++) {
+      auto* left_children = oblivious_split_info.add_children_leaves();
+      auto* right_children = oblivious_split_info.add_children_leaves();
+
+      state->FillLeaf(best_left_node_stats[root_idx], left_children);
+      state->FillLeaf(best_right_node_stats[root_idx], right_children);
+
+      const int start_index = partition_boundaries[root_idx];
+      (*output_partition_ids)(root_idx) = partition_ids(start_index);
+    }
+    oblivious_split_info.SerializeToString(&(*output_splits)(0));
+  }
 };
 REGISTER_KERNEL_BUILDER(Name("BuildDenseInequalitySplits").Device(DEVICE_CPU),
                         BuildDenseInequalitySplitsOp);
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/base_split_handler.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/base_split_handler.py
index 1b7f59e..5d4819b 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/batch/base_split_handler.py
+++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/base_split_handler.py
@@ -132,6 +132,10 @@
     return control_flow_ops.group(update_1, *update_2[self])
 
   @abc.abstractmethod
+  def reset(self, stamp_token, next_stamp_token):
+    """Resets the state maintained by the handler."""
+
+  @abc.abstractmethod
   def make_splits(self, stamp_token, next_stamp_token, class_id):
     """Create the best split using the accumulated stats and flush the state.
 
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler.py
index bf68623..efe2921 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler.py
+++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler.py
@@ -202,3 +202,7 @@
     # always return ready.
     are_splits_ready = constant_op.constant(True)
     return (are_splits_ready, partition_ids, gains, split_infos)
+
+  def reset(self, stamp_token, next_stamp_token):
+    reset = self._stats_accumulator.flush(stamp_token, next_stamp_token)
+    return reset
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py
index df0bec1..f45010e 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py
+++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py
@@ -64,6 +64,7 @@
 import re
 
 from tensorflow.contrib.boosted_trees.lib.learner.batch import base_split_handler
+from tensorflow.contrib.boosted_trees.proto import learner_pb2
 from tensorflow.contrib.boosted_trees.python.ops import gen_quantile_ops
 from tensorflow.contrib.boosted_trees.python.ops import gen_stats_accumulator_ops
 from tensorflow.contrib.boosted_trees.python.ops import quantile_ops
@@ -79,6 +80,7 @@
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 
+
 _BIAS_FEATURE_ID = -1
 # Pattern to remove all non alpha numeric from a string.
 _PATTERN = re.compile(r"[\W_]+")
@@ -147,6 +149,11 @@
           num_quantiles=num_quantiles,
           name="QuantileAccumulator/{}".format(self._name))
 
+  def reset(self, stamp_token, next_stamp_token):
+    reset_1 = self._stats_accumulator.flush(stamp_token, next_stamp_token)
+    reset_2 = self._quantile_accumulator.flush(stamp_token, next_stamp_token)
+    return control_flow_ops.group([reset_1, reset_2])
+
 
 class DenseSplitHandler(InequalitySplitHandler):
   """Computes stats and finds the best inequality splits on dense columns."""
@@ -165,6 +172,7 @@
                multiclass_strategy,
                init_stamp_token=0,
                loss_uses_sum_reduction=False,
+               weak_learner_type=learner_pb2.LearnerConfig.NORMAL_DECISION_TREE,
                name=None):
     """Initialize the internal state for this split handler.
 
@@ -186,6 +194,7 @@
          stamped objects.
       loss_uses_sum_reduction: A scalar boolean tensor that specifies whether
           SUM or MEAN reduction was used for the loss.
+      weak_learner_type: Specifies the type of weak learner to use.
       name: An optional handler name.
     """
     super(DenseSplitHandler, self).__init__(
@@ -203,6 +212,7 @@
         multiclass_strategy=multiclass_strategy,
         loss_uses_sum_reduction=loss_uses_sum_reduction)
     self._dense_float_column = dense_float_column
+    self._weak_learner_type = weak_learner_type
     # Register dense_make_stats_update function as an Op to the graph.
     g = ops.get_default_graph()
     dense_make_stats_update.add_to_graph(g)
@@ -263,15 +273,17 @@
                 next_stamp_token, self._multiclass_strategy, class_id,
                 self._feature_column_group_id, self._l1_regularization,
                 self._l2_regularization, self._tree_complexity_regularization,
-                self._min_node_weight, self._loss_uses_sum_reduction))
+                self._min_node_weight, self._loss_uses_sum_reduction,
+                self._weak_learner_type))
     return are_splits_ready, partition_ids, gains, split_infos
 
 
-def _make_dense_split(
-    quantile_accumulator_handle, stats_accumulator_handle, stamp_token,
-    next_stamp_token, multiclass_strategy, class_id, feature_column_id,
-    l1_regularization, l2_regularization, tree_complexity_regularization,
-    min_node_weight, is_multi_dimentional, loss_uses_sum_reduction):
+def _make_dense_split(quantile_accumulator_handle, stats_accumulator_handle,
+                      stamp_token, next_stamp_token, multiclass_strategy,
+                      class_id, feature_column_id, l1_regularization,
+                      l2_regularization, tree_complexity_regularization,
+                      min_node_weight, is_multi_dimentional,
+                      loss_uses_sum_reduction, weak_learner_type):
   """Function that builds splits for a dense feature column."""
   # Get the bucket boundaries
   are_splits_ready, buckets = (
@@ -320,7 +332,8 @@
           l2_regularization=l2_regularization,
           tree_complexity_regularization=tree_complexity_regularization,
           min_node_weight=min_node_weight,
-          multiclass_strategy=multiclass_strategy))
+          multiclass_strategy=multiclass_strategy,
+          weak_learner_type=weak_learner_type))
   return are_splits_ready, partition_ids, gains, split_infos
 
 
@@ -500,7 +513,40 @@
   return are_splits_ready, partition_ids, gains, split_infos
 
 
-def _specialize_make_split(func, is_multi_dimentional):
+def _specialize_make_split_dense(func, is_multi_dimentional):
+  """Builds a specialized version of the function."""
+
+  @function.Defun(
+      dtypes.resource,
+      dtypes.resource,
+      dtypes.int64,
+      dtypes.int64,
+      dtypes.int32,
+      dtypes.int32,
+      dtypes.int32,
+      dtypes.float32,
+      dtypes.float32,
+      dtypes.float32,
+      dtypes.float32,
+      dtypes.bool,
+      dtypes.int32,
+      noinline=True)
+  def f(quantile_accumulator_handle, stats_accumulator_handle, stamp_token,
+        next_stamp_token, multiclass_strategy, class_id, feature_column_id,
+        l1_regularization, l2_regularization, tree_complexity_regularization,
+        min_node_weight, loss_uses_sum_reduction, weak_learner_type):
+    """Function that builds splits for a dense feature column."""
+    return func(quantile_accumulator_handle, stats_accumulator_handle,
+                stamp_token, next_stamp_token, multiclass_strategy, class_id,
+                feature_column_id, l1_regularization, l2_regularization,
+                tree_complexity_regularization, min_node_weight,
+                is_multi_dimentional, loss_uses_sum_reduction,
+                weak_learner_type)
+
+  return f
+
+
+def _specialize_make_split_sparse(func, is_multi_dimentional):
   """Builds a specialized version of the function."""
 
   @function.Defun(
@@ -530,15 +576,17 @@
 
   return f
 
-make_dense_split_scalar = _specialize_make_split(_make_dense_split,
-                                                 is_multi_dimentional=False)
-make_dense_split_tensor = _specialize_make_split(_make_dense_split,
-                                                 is_multi_dimentional=True)
 
-make_sparse_split_scalar = _specialize_make_split(_make_sparse_split,
-                                                  is_multi_dimentional=False)
-make_sparse_split_tensor = _specialize_make_split(_make_sparse_split,
-                                                  is_multi_dimentional=True)
+make_dense_split_scalar = _specialize_make_split_dense(
+    _make_dense_split, is_multi_dimentional=False)
+
+make_dense_split_tensor = _specialize_make_split_dense(
+    _make_dense_split, is_multi_dimentional=True)
+
+make_sparse_split_scalar = _specialize_make_split_sparse(
+    _make_sparse_split, is_multi_dimentional=False)
+make_sparse_split_tensor = _specialize_make_split_sparse(
+    _make_sparse_split, is_multi_dimentional=True)
 
 
 @function.Defun(
@@ -579,8 +627,10 @@
 
   example_partition_ids, feature_ids, gradients, hessians = (
       control_flow_ops.cond(
-          math_ops.logical_and(are_buckets_ready, is_active[0]),
-          ready_inputs_fn, not_ready_inputs_fn))
+          math_ops.logical_and(
+              math_ops.logical_and(are_buckets_ready,
+                                   array_ops.size(quantile_buckets) > 0),
+              is_active[0]), ready_inputs_fn, not_ready_inputs_fn))
   return (quantile_values, quantile_weights, example_partition_ids, feature_ids,
           gradients, hessians)
 
@@ -674,8 +724,10 @@
                             lambda: handler_not_active))
 
   example_partition_ids, feature_ids, gradients, hessians = (
-      control_flow_ops.cond(are_buckets_ready, quantiles_ready,
-                            quantiles_not_ready))
+      control_flow_ops.cond(
+          math_ops.logical_and(are_buckets_ready,
+                               array_ops.size(quantile_buckets) > 0),
+          quantiles_ready, quantiles_not_ready))
 
   return (quantile_indices, quantile_values, quantile_shape, quantile_weights,
           example_partition_ids, feature_ids, gradients, hessians)
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler_test.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler_test.py
index d59732c..6572f2f 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler_test.py
+++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler_test.py
@@ -182,6 +182,133 @@
 
     self.assertAllClose(0.52, split_node.threshold, 0.00001)
 
+  def testObliviousFeatureSplitGeneration(self):
+    with self.test_session() as sess:
+      # The data looks like the following:
+      # Example |  Gradients    | Partition | Dense Quantile |
+      # i0      |  (0.2, 0.12)  | 0         | 2              |
+      # i1      |  (-0.5, 0.07) | 0         | 2              |
+      # i2      |  (1.2, 0.2)   | 0         | 0              |
+      # i3      |  (4.0, 0.13)  | 1         | 1              |
+      dense_column = array_ops.constant([0.62, 0.62, 0.3, 0.52])
+      gradients = array_ops.constant([0.2, -0.5, 1.2, 4.0])
+      hessians = array_ops.constant([0.12, 0.07, 0.2, 0.13])
+      partition_ids = array_ops.constant([0, 0, 0, 1], dtype=dtypes.int32)
+      class_id = -1
+
+      gradient_shape = tensor_shape.scalar()
+      hessian_shape = tensor_shape.scalar()
+      split_handler = ordinal_split_handler.DenseSplitHandler(
+          l1_regularization=0.1,
+          l2_regularization=1.,
+          tree_complexity_regularization=0.,
+          min_node_weight=0.,
+          epsilon=0.001,
+          num_quantiles=10,
+          feature_column_group_id=0,
+          dense_float_column=dense_column,
+          init_stamp_token=0,
+          gradient_shape=gradient_shape,
+          hessian_shape=hessian_shape,
+          multiclass_strategy=learner_pb2.LearnerConfig.TREE_PER_CLASS,
+          weak_learner_type=learner_pb2.LearnerConfig.OBLIVIOUS_DECISION_TREE)
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      empty_gradients, empty_hessians = get_empty_tensors(
+          gradient_shape, hessian_shape)
+      example_weights = array_ops.ones([4, 1], dtypes.float32)
+
+      update_1 = split_handler.update_stats_sync(
+          0,
+          partition_ids,
+          gradients,
+          hessians,
+          empty_gradients,
+          empty_hessians,
+          example_weights,
+          is_active=array_ops.constant([True, True]))
+      with ops.control_dependencies([update_1]):
+        are_splits_ready = split_handler.make_splits(
+            np.int64(0), np.int64(1), class_id)[0]
+
+      with ops.control_dependencies([are_splits_ready]):
+        update_2 = split_handler.update_stats_sync(
+            1,
+            partition_ids,
+            gradients,
+            hessians,
+            empty_gradients,
+            empty_hessians,
+            example_weights,
+            is_active=array_ops.constant([True, True]))
+      with ops.control_dependencies([update_2]):
+        are_splits_ready2, partitions, gains, splits = (
+            split_handler.make_splits(np.int64(1), np.int64(2), class_id))
+        are_splits_ready, are_splits_ready2, partitions, gains, splits = (
+            sess.run([
+                are_splits_ready, are_splits_ready2, partitions, gains, splits
+            ]))
+
+    # During the first iteration, inequality split handlers are not going to
+    # have any splits. Make sure that we return not_ready in that case.
+    self.assertFalse(are_splits_ready)
+    self.assertTrue(are_splits_ready2)
+
+    self.assertAllEqual([0, 1], partitions)
+
+    oblivious_split_info = split_info_pb2.ObliviousSplitInfo()
+    oblivious_split_info.ParseFromString(splits[0])
+    split_node = oblivious_split_info.split_node.dense_float_binary_split
+
+    self.assertAllClose(0.3, split_node.threshold, 0.00001)
+    self.assertEqual(0, split_node.feature_column)
+
+    # Check the split on partition 0.
+    # -(1.2 - 0.1) / (0.2 + 1)
+    expected_left_weight_0 = -0.9166666666666666
+
+    # expected_left_weight_0 * -(1.2 - 0.1)
+    expected_left_gain_0 = 1.008333333333333
+
+    # (-0.5 + 0.2 + 0.1) / (0.19 + 1)
+    expected_right_weight_0 = 0.1680672
+
+    # expected_right_weight_0 * -(-0.5 + 0.2 + 0.1)
+    expected_right_gain_0 = 0.033613445378151252
+
+    # (0.2 + -0.5 + 1.2 - 0.1) ** 2 / (0.12 + 0.07 + 0.2 + 1)
+    expected_bias_gain_0 = 0.46043165467625896
+
+    left_child = oblivious_split_info.children_leaves[0].vector
+    right_child = oblivious_split_info.children_leaves[1].vector
+
+    self.assertAllClose([expected_left_weight_0], left_child.value, 0.00001)
+
+    self.assertAllClose([expected_right_weight_0], right_child.value, 0.00001)
+
+    # Check the split on partition 1.
+    expected_left_weight_1 = 0
+    expected_left_gain_1 = 0
+    # -(4 - 0.1) / (0.13 + 1)
+    expected_right_weight_1 = -3.4513274336283186
+    # expected_right_weight_1 * -(4 - 0.1)
+    expected_right_gain_1 = 13.460176991150442
+    # (-4 + 0.1) ** 2 / (0.13 + 1)
+    expected_bias_gain_1 = 13.460176991150442
+
+    left_child = oblivious_split_info.children_leaves[2].vector
+    right_child = oblivious_split_info.children_leaves[3].vector
+
+    self.assertAllClose([expected_left_weight_1], left_child.value, 0.00001)
+
+    self.assertAllClose([expected_right_weight_1], right_child.value, 0.00001)
+
+    # The layer gain is the sum of the gains of each partition
+    layer_gain = (
+        expected_left_gain_0 + expected_right_gain_0 - expected_bias_gain_0) + (
+            expected_left_gain_1 + expected_right_gain_1 - expected_bias_gain_1)
+    self.assertAllClose(layer_gain, gains[0], 0.00001)
+
   def testGenerateFeatureSplitCandidatesLossUsesSumReduction(self):
     with self.test_session() as sess:
       # The data looks like the following:
@@ -1072,8 +1199,8 @@
   def testGenerateFeatureSplitCandidatesMulticlassFullHessian(self):
     with self.test_session() as sess:
       # Batch is 4, 2 classes
-      gradients = array_ops.constant(
-          [[0.2, 1.4], [-0.5, 0.1], [1.2, 3], [4.0, -3]])
+      gradients = array_ops.constant([[0.2, 1.4], [-0.5, 0.1], [1.2, 3],
+                                      [4.0, -3]])
       # 2x2 matrix for each instance
       hessian_0 = [[0.12, 0.02], [0.3, 0.11]]
       hessian_1 = [[0.07, -0.2], [-0.5, 0.2]]
@@ -1167,8 +1294,8 @@
   def testGenerateFeatureSplitCandidatesMulticlassDiagonalHessian(self):
     with self.test_session() as sess:
       # Batch is 4, 2 classes
-      gradients = array_ops.constant(
-          [[0.2, 1.4], [-0.5, 0.1], [1.2, 3], [4.0, -3]])
+      gradients = array_ops.constant([[0.2, 1.4], [-0.5, 0.1], [1.2, 3],
+                                      [4.0, -3]])
       # Each hessian is a diagonal from a full hessian matrix.
       hessian_0 = [0.12, 0.11]
       hessian_1 = [0.07, 0.2]
@@ -1406,6 +1533,100 @@
     self.assertEqual(len(gains), 0)
     self.assertEqual(len(splits), 0)
 
+  def testEmptyBuckets(self):
+    """Test that reproduces the case when quantile buckets were empty."""
+    with self.test_session() as sess:
+      sparse_column = array_ops.sparse_placeholder(dtypes.float32)
+
+      # We have two batches - at first, a sparse feature is empty.
+      empty_indices = array_ops.constant([], dtype=dtypes.int64, shape=[0, 2])
+      empty_values = array_ops.constant([], dtype=dtypes.float32)
+      empty_sparse_column = sparse_tensor.SparseTensor(empty_indices,
+                                                       empty_values, [4, 2])
+      empty_sparse_column = empty_sparse_column.eval(session=sess)
+
+      # For the second batch, the sparse feature is not empty.
+      non_empty_indices = array_ops.constant(
+          [[0, 0], [2, 1], [3, 2]], dtype=dtypes.int64, shape=[3, 2])
+      non_empty_values = array_ops.constant(
+          [0.52, 0.3, 0.52], dtype=dtypes.float32)
+      non_empty_sparse_column = sparse_tensor.SparseTensor(
+          non_empty_indices, non_empty_values, [4, 2])
+      non_empty_sparse_column = non_empty_sparse_column.eval(session=sess)
+
+      gradient_shape = tensor_shape.scalar()
+      hessian_shape = tensor_shape.scalar()
+      class_id = -1
+
+      split_handler = ordinal_split_handler.SparseSplitHandler(
+          l1_regularization=0.0,
+          l2_regularization=2.0,
+          tree_complexity_regularization=0.0,
+          min_node_weight=0.0,
+          epsilon=0.01,
+          num_quantiles=2,
+          feature_column_group_id=0,
+          sparse_float_column=sparse_column,
+          init_stamp_token=0,
+          gradient_shape=gradient_shape,
+          hessian_shape=hessian_shape,
+          multiclass_strategy=learner_pb2.LearnerConfig.TREE_PER_CLASS)
+      resources.initialize_resources(resources.shared_resources()).run()
+      gradients = array_ops.constant([0.2, -0.5, 1.2, 4.0])
+      hessians = array_ops.constant([0.12, 0.07, 0.2, 0.13])
+      partition_ids = array_ops.constant([0, 0, 0, 1], dtype=dtypes.int32)
+
+      empty_gradients, empty_hessians = get_empty_tensors(
+          gradient_shape, hessian_shape)
+      example_weights = array_ops.ones([4, 1], dtypes.float32)
+
+      update_1 = split_handler.update_stats_sync(
+          0,
+          partition_ids,
+          gradients,
+          hessians,
+          empty_gradients,
+          empty_hessians,
+          example_weights,
+          is_active=array_ops.constant([True, True]))
+      with ops.control_dependencies([update_1]):
+        are_splits_ready = split_handler.make_splits(
+            np.int64(0), np.int64(1), class_id)[0]
+
+        # First, calculate quantiles and try to update on an empty data for a
+        # feature.
+        are_splits_ready = (
+            sess.run(
+                are_splits_ready,
+                feed_dict={sparse_column: empty_sparse_column}))
+        self.assertFalse(are_splits_ready)
+
+      update_2 = split_handler.update_stats_sync(
+          1,
+          partition_ids,
+          gradients,
+          hessians,
+          empty_gradients,
+          empty_hessians,
+          example_weights,
+          is_active=array_ops.constant([True, True]))
+      with ops.control_dependencies([update_2]):
+        are_splits_ready2, partitions, gains, splits = (
+            split_handler.make_splits(np.int64(1), np.int64(2), class_id))
+
+        # Now the feature in the second batch is not empty, but buckets
+        # calculated on the first batch are empty.
+        are_splits_ready2, partitions, gains, splits = (
+            sess.run(
+                [are_splits_ready2, partitions, gains, splits],
+                feed_dict={sparse_column: non_empty_sparse_column}))
+    self.assertFalse(are_splits_ready)
+    self.assertTrue(are_splits_ready2)
+    # Since the buckets were empty, we can't calculate the splits.
+    self.assertEqual(len(partitions), 0)
+    self.assertEqual(len(gains), 0)
+    self.assertEqual(len(splits), 0)
+
   def testDegenerativeCase(self):
     with self.test_session() as sess:
       # One data example only, one leaf and thus one quantile bucket.The same
diff --git a/tensorflow/contrib/boosted_trees/ops/split_handler_ops.cc b/tensorflow/contrib/boosted_trees/ops/split_handler_ops.cc
index ca5c7f3..9b68a9d 100644
--- a/tensorflow/contrib/boosted_trees/ops/split_handler_ops.cc
+++ b/tensorflow/contrib/boosted_trees/ops/split_handler_ops.cc
@@ -36,6 +36,7 @@
     .Input("tree_complexity_regularization: float")
     .Input("min_node_weight: float")
     .Input("multiclass_strategy: int32")
+    .Input("weak_learner_type: int32")
     .Output("output_partition_ids: int32")
     .Output("gains: float32")
     .Output("split_infos: string")
@@ -84,6 +85,8 @@
     be considered.
 multiclass_strategy: A scalar, specifying the multiclass handling strategy.
     See LearnerConfig.MultiClassStrategy for valid values.
+weak_learner_type: A scalar, specifying the weak learner type to use.
+    See LearnerConfig.WeakLearnerType for valid values.
 output_partition_ids: A rank 1 tensor, the partition IDs that we created splits
     for.
 gains: A rank 1 tensor, for the computed gain for the created splits.
diff --git a/tensorflow/contrib/boosted_trees/proto/learner.proto b/tensorflow/contrib/boosted_trees/proto/learner.proto
index d84ba74..c49cb48 100644
--- a/tensorflow/contrib/boosted_trees/proto/learner.proto
+++ b/tensorflow/contrib/boosted_trees/proto/learner.proto
@@ -108,6 +108,11 @@
     DIAGONAL_HESSIAN = 3;
   }
 
+  enum WeakLearnerType {
+    NORMAL_DECISION_TREE = 0;
+    OBLIVIOUS_DECISION_TREE = 1;
+  }
+
   // Number of classes.
   uint32 num_classes = 1;
 
@@ -141,4 +146,7 @@
   // If you want to average the ensembles (for regularization), provide the
   // config below.
   AveragingConfig averaging_config = 11;
+
+  // By default we use NORMAL_DECISION_TREE as weak learner.
+  WeakLearnerType weak_learner_type = 12;
 }
diff --git a/tensorflow/contrib/boosted_trees/proto/split_info.proto b/tensorflow/contrib/boosted_trees/proto/split_info.proto
index a300c24..850340f 100644
--- a/tensorflow/contrib/boosted_trees/proto/split_info.proto
+++ b/tensorflow/contrib/boosted_trees/proto/split_info.proto
@@ -17,3 +17,10 @@
   // Right Leaf node.
   tensorflow.boosted_trees.trees.Leaf right_child = 3;
 }
+
+message ObliviousSplitInfo {
+  // The split node with the feature_column and threshold defined.
+  tensorflow.boosted_trees.trees.TreeNode split_node = 1;
+  // The new leaves of the tree.
+  repeated tensorflow.boosted_trees.trees.Leaf children_leaves = 2;
+}
diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/split_handler_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/split_handler_ops_test.py
index 5cd37ec..2589504 100644
--- a/tensorflow/contrib/boosted_trees/python/kernel_tests/split_handler_ops_test.py
+++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/split_handler_ops_test.py
@@ -59,7 +59,8 @@
               min_node_weight=0,
               class_id=-1,
               feature_column_group_id=0,
-              multiclass_strategy=learner_pb2.LearnerConfig.TREE_PER_CLASS))
+              multiclass_strategy=learner_pb2.LearnerConfig.TREE_PER_CLASS,
+              weak_learner_type=learner_pb2.LearnerConfig.NORMAL_DECISION_TREE))
       partitions, gains, splits = sess.run([partitions, gains, splits])
     self.assertAllEqual([0, 1], partitions)
 
@@ -132,7 +133,8 @@
               min_node_weight=0,
               class_id=-1,
               feature_column_group_id=0,
-              multiclass_strategy=learner_pb2.LearnerConfig.FULL_HESSIAN))
+              multiclass_strategy=learner_pb2.LearnerConfig.FULL_HESSIAN,
+              weak_learner_type=learner_pb2.LearnerConfig.NORMAL_DECISION_TREE))
       partitions, gains, splits = sess.run([partitions, gains, splits])
     self.assertAllEqual([0, 1], partitions)
 
@@ -171,7 +173,8 @@
               min_node_weight=0,
               class_id=-1,
               feature_column_group_id=0,
-              multiclass_strategy=learner_pb2.LearnerConfig.TREE_PER_CLASS))
+              multiclass_strategy=learner_pb2.LearnerConfig.TREE_PER_CLASS,
+              weak_learner_type=learner_pb2.LearnerConfig.NORMAL_DECISION_TREE))
       partitions, gains, splits = sess.run([partitions, gains, splits])
     # .assertEmpty doesn't exist on ubuntu-contrib
     self.assertEqual(0, len(partitions))
diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
index ba5ef70..2f75d8a 100644
--- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
+++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
@@ -51,6 +51,7 @@
 from tensorflow.python.summary import summary
 from tensorflow.python.training import device_setter
 
+
 # Key names for prediction dict.
 ENSEMBLE_STAMP = "ensemble_stamp"
 PREDICTIONS = "predictions"
@@ -217,6 +218,21 @@
   sparse_int_shapes = []
   for key in sorted(features.keys()):
     tensor = features[key]
+    # TODO(nponomareva): consider iterating over feature columns instead.
+    if isinstance(tensor, tuple):
+      # Weighted categorical feature.
+      categorical_tensor = tensor[0]
+      weight_tensor = tensor[1]
+
+      shape = categorical_tensor.dense_shape
+      indices = array_ops.concat([
+          array_ops.slice(categorical_tensor.indices, [0, 0], [-1, 1]),
+          array_ops.expand_dims(
+              math_ops.to_int64(categorical_tensor.values), -1)
+      ], 1)
+      tensor = sparse_tensor.SparseTensor(
+          indices=indices, values=weight_tensor.values, dense_shape=shape)
+
     if isinstance(tensor, sparse_tensor.SparseTensor):
       if tensor.values.dtype == dtypes.float32:
         sparse_float_names.append(key)
@@ -671,6 +687,8 @@
         self._learner_config.constraints.min_node_weight, dtypes.float32)
     loss_uses_sum_reduction = self._loss_reduction == losses.Reduction.SUM
     loss_uses_sum_reduction = constant_op.constant(loss_uses_sum_reduction)
+    weak_learner_type = constant_op.constant(
+        self._learner_config.weak_learner_type)
     epsilon = 0.01
     num_quantiles = 100
     strategy_tensor = constant_op.constant(strategy)
@@ -695,6 +713,7 @@
                 multiclass_strategy=strategy_tensor,
                 init_stamp_token=init_stamp_token,
                 loss_uses_sum_reduction=loss_uses_sum_reduction,
+                weak_learner_type=weak_learner_type,
             ))
         fc_name_idx += 1
 
@@ -898,7 +917,7 @@
 
       reset_ops = []
       for handler in handlers:
-        reset_ops.append(handler.make_splits(stamp_token, next_stamp_token, 0))
+        reset_ops.append(handler.reset(stamp_token, next_stamp_token))
       if self._center_bias:
         reset_ops.append(
             bias_stats_accumulator.flush(stamp_token, next_stamp_token))
diff --git a/tensorflow/contrib/checkpoint/__init__.py b/tensorflow/contrib/checkpoint/__init__.py
index 2fbaa31..150d734 100644
--- a/tensorflow/contrib/checkpoint/__init__.py
+++ b/tensorflow/contrib/checkpoint/__init__.py
@@ -31,6 +31,12 @@
 @@List
 @@Mapping
 @@UniqueNameTracker
+
+Checkpoint management:
+@@CheckpointManager
+
+Saving and restoring Python state:
+@@NumpyState
 """
 
 from __future__ import absolute_import
@@ -38,9 +44,11 @@
 from __future__ import print_function
 
 from tensorflow.contrib.checkpoint.python.containers import UniqueNameTracker
+from tensorflow.contrib.checkpoint.python.python_state import NumpyState
 from tensorflow.contrib.checkpoint.python.split_dependency import split_dependency
 from tensorflow.contrib.checkpoint.python.visualize import dot_graph_from_checkpoint
 from tensorflow.core.protobuf.checkpointable_object_graph_pb2 import CheckpointableObjectGraph
+from tensorflow.python.training.checkpoint_management import CheckpointManager
 from tensorflow.python.training.checkpointable.base import CheckpointableBase
 from tensorflow.python.training.checkpointable.data_structures import List
 from tensorflow.python.training.checkpointable.data_structures import Mapping
diff --git a/tensorflow/contrib/checkpoint/python/BUILD b/tensorflow/contrib/checkpoint/python/BUILD
index 7b200a2..ada4168 100644
--- a/tensorflow/contrib/checkpoint/python/BUILD
+++ b/tensorflow/contrib/checkpoint/python/BUILD
@@ -9,6 +9,7 @@
     srcs_version = "PY2AND3",
     deps = [
         ":containers",
+        ":python_state",
         ":split_dependency",
         ":visualize",
         "//tensorflow/python/training/checkpointable:data_structures",
@@ -41,6 +42,33 @@
 )
 
 py_library(
+    name = "python_state",
+    srcs = ["python_state.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        "//tensorflow/python/training/checkpointable:base",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
+    ],
+)
+
+py_test(
+    name = "python_state_test",
+    srcs = ["python_state_test.py"],
+    deps = [
+        ":python_state",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:session",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:test",
+        "//tensorflow/python/training/checkpointable:util",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_library(
     name = "split_dependency",
     srcs = ["split_dependency.py"],
     srcs_version = "PY2AND3",
diff --git a/tensorflow/contrib/checkpoint/python/python_state.py b/tensorflow/contrib/checkpoint/python/python_state.py
new file mode 100644
index 0000000..9b11035
--- /dev/null
+++ b/tensorflow/contrib/checkpoint/python/python_state.py
@@ -0,0 +1,166 @@
+"""Utilities for including Python state in TensorFlow checkpoints."""
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+
+import numpy
+
+from tensorflow.python.training.checkpointable import base
+
+# pylint: disable=g-import-not-at-top
+try:
+  # In Python 2.x, use the faster string buffering option.
+  from cStringIO import StringIO as BytesIO
+except ImportError:
+  from io import BytesIO
+# pylint: enable=g-import-not-at-top
+
+
+class NumpyState(base.CheckpointableBase):
+  """A checkpointable object whose NumPy array attributes are saved/restored.
+
+  Example usage:
+
+  ```python
+  arrays = tf.contrib.checkpoint.NumpyState()
+  checkpoint = tf.train.Checkpoint(numpy_arrays=arrays)
+  arrays.x = numpy.zeros([3, 4])
+  save_path = checkpoint.save("/tmp/ckpt")
+  arrays.x[1, 1] = 4.
+  checkpoint.restore(save_path)
+  assert (arrays.x == numpy.zeros([3, 4])).all()
+
+  second_checkpoint = tf.train.Checkpoint(
+      numpy_arrays=tf.contrib.checkpoint.NumpyState())
+  # Attributes of NumpyState objects are created automatically by restore()
+  second_checkpoint.restore(save_path)
+  assert (second_checkpoint.numpy_arrays.x == numpy.zeros([3, 4])).all()
+  ```
+
+  Note that `NumpyState` objects re-create the attributes of the previously
+  saved object on `restore()`. This is in contrast to TensorFlow variables, for
+  which a `Variable` object must be created and assigned to an attribute.
+
+  This snippet works both when graph building and when executing eagerly. On
+  save, the NumPy array(s) are fed as strings to be saved in the checkpoint (via
+  a placeholder when graph building, or as a string constant when executing
+  eagerly). When restoring they skip the TensorFlow graph entirely, and so no
+  restore ops need be run. This means that restoration always happens eagerly,
+  rather than waiting for `checkpoint.restore(...).run_restore_ops()` like
+  TensorFlow variables when graph building.
+  """
+
+  def _lookup_dependency(self, name):
+    """Create placeholder NumPy arrays for to-be-restored attributes.
+
+    Typically `_lookup_dependency` is used to check by name whether a dependency
+    exists. We cheat slightly by creating a checkpointable object for `name` if
+    we don't already have one, giving us attribute re-creation behavior when
+    loading a checkpoint.
+
+    Args:
+      name: The name of the dependency being checked.
+    Returns:
+      An existing dependency if one exists, or a new `_NumpyWrapper` placeholder
+      dependency (which will generally be restored immediately).
+    """
+    value = super(NumpyState, self)._lookup_dependency(name)
+    if value is None:
+      value = _NumpyWrapper(numpy.array([]))
+      new_reference = base.CheckpointableReference(name=name, ref=value)
+      self._unconditional_checkpoint_dependencies.append(new_reference)
+      self._unconditional_dependency_names[name] = value
+      super(NumpyState, self).__setattr__(name, value)
+    return value
+
+  def __getattribute__(self, name):
+    """Un-wrap `_NumpyWrapper` objects when accessing attributes."""
+    value = super(NumpyState, self).__getattribute__(name)
+    if isinstance(value, _NumpyWrapper):
+      return value.array
+    return value
+
+  def __setattr__(self, name, value):
+    """Automatically wrap NumPy arrays assigned to attributes."""
+    # TODO(allenl): Consider supporting lists/tuples, either ad-hoc or by making
+    # ndarrays checkpointable natively and using standard checkpointable list
+    # tracking.
+    if isinstance(value, numpy.ndarray):
+      try:
+        existing = super(NumpyState, self).__getattribute__(name)
+        existing.array = value
+        return
+      except AttributeError:
+        value = _NumpyWrapper(value)
+        self._track_checkpointable(value, name=name, overwrite=True)
+    elif (name not in ("_setattr_tracking", "_update_uid")
+          and getattr(self, "_setattr_tracking", True)):
+      # Mixing restore()-created attributes with user-added checkpointable
+      # objects is tricky, since we can't use the `_lookup_dependency` trick to
+      # re-create attributes (we might accidentally steal the restoration for
+      # another checkpointable object). For now `NumpyState` objects must be
+      # leaf nodes. Theoretically we could add some extra arguments to
+      # `_lookup_dependency` to figure out whether we should create a NumPy
+      # array for the attribute or not.
+      raise NotImplementedError(
+          ("Assigned %s to the %s property of %s, which is not a NumPy array. "
+           "Currently mixing NumPy arrays and other checkpointable objects is "
+           "not supported. File a feature request if this limitation bothers "
+           "you.")
+          % (value, name, self))
+    super(NumpyState, self).__setattr__(name, value)
+
+
+class _NumpyWrapper(base.CheckpointableBase):
+  """Wraps a NumPy array for storage in an object-based checkpoint."""
+
+  def __init__(self, array):
+    """Specify a NumPy array to wrap.
+
+    Args:
+      array: The NumPy array to save and restore (may be overwritten).
+    """
+    self.array = array
+
+  def _serialize(self):
+    """Callback for `PythonStringStateSaveable` to serialize the array."""
+    string_file = BytesIO()
+    try:
+      numpy.save(string_file, self.array, allow_pickle=False)
+      serialized = string_file.getvalue()
+    finally:
+      string_file.close()
+    return serialized
+
+  def _deserialize(self, string_value):
+    """Callback for `PythonStringStateSaveable` to deserialize the array."""
+    string_file = BytesIO(string_value)
+    try:
+      self.array = numpy.load(string_file, allow_pickle=False)
+    finally:
+      string_file.close()
+
+  def _gather_saveables_for_checkpoint(self):
+    """Specify callbacks for saving and restoring `array`."""
+    return {
+        "array": functools.partial(
+            base.PythonStringStateSaveable,
+            state_callback=self._serialize,
+            restore_callback=self._deserialize)
+        }
diff --git a/tensorflow/contrib/checkpoint/python/python_state_test.py b/tensorflow/contrib/checkpoint/python/python_state_test.py
new file mode 100644
index 0000000..0439a47
--- /dev/null
+++ b/tensorflow/contrib/checkpoint/python/python_state_test.py
@@ -0,0 +1,101 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+import numpy
+
+from tensorflow.contrib.checkpoint.python import python_state
+from tensorflow.python.client import session
+from tensorflow.python.eager import test
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import variables
+from tensorflow.python.training.checkpointable import util
+
+
+class NumpyStateTests(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes
+  def testSaveRestoreNumpyState(self):
+    directory = self.get_temp_dir()
+    prefix = os.path.join(directory, "ckpt")
+    save_state = python_state.NumpyState()
+    saver = util.Checkpoint(numpy=save_state)
+    save_state.a = numpy.ones([2, 2])
+    save_state.b = numpy.ones([2, 2])
+    save_state.b = numpy.zeros([2, 2])
+    self.assertAllEqual(numpy.ones([2, 2]), save_state.a)
+    self.assertAllEqual(numpy.zeros([2, 2]), save_state.b)
+    first_save_path = saver.save(prefix)
+    save_state.a[1, 1] = 2.
+    second_save_path = saver.save(prefix)
+
+    load_state = python_state.NumpyState()
+    loader = util.Checkpoint(numpy=load_state)
+    loader.restore(first_save_path).initialize_or_restore()
+    self.assertAllEqual(numpy.ones([2, 2]), load_state.a)
+    self.assertAllEqual(numpy.zeros([2, 2]), load_state.b)
+    load_state.a[0, 0] = 42.
+    self.assertAllEqual([[42., 1.], [1., 1.]], load_state.a)
+    loader.restore(first_save_path).run_restore_ops()
+    self.assertAllEqual(numpy.ones([2, 2]), load_state.a)
+    loader.restore(second_save_path).run_restore_ops()
+    self.assertAllEqual([[1., 1.], [1., 2.]], load_state.a)
+    self.assertAllEqual(numpy.zeros([2, 2]), load_state.b)
+
+  def testNoGraphPollution(self):
+    graph = ops.Graph()
+    with graph.as_default(), session.Session():
+      directory = self.get_temp_dir()
+      prefix = os.path.join(directory, "ckpt")
+      save_state = python_state.NumpyState()
+      saver = util.Checkpoint(numpy=save_state)
+      save_state.a = numpy.ones([2, 2])
+      save_path = saver.save(prefix)
+      saver.restore(save_path)
+      graph.finalize()
+      saver.save(prefix)
+      save_state.a = numpy.zeros([2, 2])
+      saver.save(prefix)
+      saver.restore(save_path)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testNoMixedNumpyStateTF(self):
+    save_state = python_state.NumpyState()
+    save_state.a = numpy.ones([2, 2])
+    with self.assertRaises(NotImplementedError):
+      save_state.v = variables.Variable(1.)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDocstringExample(self):
+    arrays = python_state.NumpyState()
+    checkpoint = util.Checkpoint(numpy_arrays=arrays)
+    arrays.x = numpy.zeros([3, 4])
+    save_path = checkpoint.save(os.path.join(self.get_temp_dir(), "ckpt"))
+    arrays.x[1, 1] = 4.
+    checkpoint.restore(save_path)
+    self.assertAllEqual(numpy.zeros([3, 4]), arrays.x)
+
+    second_checkpoint = util.Checkpoint(numpy_arrays=python_state.NumpyState())
+    second_checkpoint.restore(save_path)
+    self.assertAllEqual(numpy.zeros([3, 4]), second_checkpoint.numpy_arrays.x)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/cloud/python/ops/gcs_config_ops.py b/tensorflow/contrib/cloud/python/ops/gcs_config_ops.py
index 95e7e74..cb45e42 100644
--- a/tensorflow/contrib/cloud/python/ops/gcs_config_ops.py
+++ b/tensorflow/contrib/cloud/python/ops/gcs_config_ops.py
@@ -19,6 +19,7 @@
 from __future__ import print_function
 
 import json
+import os
 
 from tensorflow.contrib.cloud.python.ops import gen_gcs_config_ops
 from tensorflow.python.framework import dtypes
@@ -188,6 +189,8 @@
     session: A `tf.Session` session.
   """
   # Read from the application default credentials (adc).
-  with open('/content/datalab/adc.json') as f:
+  adc_filename = os.environ.get(
+      'GOOGLE_APPLICATION_CREDENTIALS', '/content/adc.json')
+  with open(adc_filename) as f:
     data = json.load(f)
   configure_gcs(session, credentials=data)
diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt
index 9045290..a5a947f 100644
--- a/tensorflow/contrib/cmake/python_modules.txt
+++ b/tensorflow/contrib/cmake/python_modules.txt
@@ -186,6 +186,8 @@
 tensorflow/contrib/grid_rnn
 tensorflow/contrib/grid_rnn/python
 tensorflow/contrib/grid_rnn/python/ops
+tensorflow/contrib/hadoop/python
+tensorflow/contrib/hadoop/python/ops
 tensorflow/contrib/hooks
 tensorflow/contrib/hooks/python
 tensorflow/contrib/image
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index 5cb0db6..6d86daf 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -198,7 +198,7 @@
             # so we currently add explicit commands to include those files
             # later on in this script.
             if (NOT "${script}" MATCHES "_test\.py$")
-	        add_custom_command(TARGET tf_python_copy_scripts_to_destination PRE_BUILD
+            add_custom_command(TARGET tf_python_copy_scripts_to_destination PRE_BUILD
                   COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/${script} ${CMAKE_CURRENT_BINARY_DIR}/tf_python/${script})
             endif()
         endforeach()
@@ -297,7 +297,7 @@
     )
     target_link_libraries(${tf_python_op_lib_name}_gen_python PRIVATE
         tf_protos_cc
-				tf_python_protos_cc
+                tf_python_protos_cc
         ${tensorflow_EXTERNAL_LIBRARIES}
     )
 
@@ -549,15 +549,15 @@
         ${NUMPY_INCLUDE_DIR}
     )
     #target_link_libraries(pywrap_tensorflow_internal_static
-    #	tf_protos_cc
-    #	tf_python_protos_cc
+    #   tf_protos_cc
+    #   tf_python_protos_cc
     #)
     add_dependencies(pywrap_tensorflow_internal_static tf_protos_cc tf_python_protos_cc)
     set(pywrap_tensorflow_internal_static_dependencies
         $<TARGET_FILE:pywrap_tensorflow_internal_static>
         $<TARGET_FILE:tf_protos_cc>
         $<TARGET_FILE:tf_python_protos_cc>
-	${nsync_STATIC_LIBRARIES}
+    ${nsync_STATIC_LIBRARIES}
     )
 
     if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
@@ -763,57 +763,40 @@
 # recongnize paths. As CUDA isn't built with MKL, the MKL built directory is the only path to this command to work around that issue.
 # To not override the CUDA and system path in other circumstances, `if-else` branch used here to handle this problem,
 # and should be removed if the path issue can be resolved.
+# UPDATE: Below block appears to handle multiple items in PATH correctly, but risks command line limits if PATH is large.
+# If you have issues, try `set(PY_RUNTIME_ENV "PATH=${mkl_BIN_DIRS}")` instead.
 ###
 
-if (tensorflow_ENABLE_MKL_SUPPORT)
+set(PY_RUNTIME_ENV "")
+if(tensorflow_ENABLE_MKL_SUPPORT)
     # add mkl dist dlls to system path for python
-    # TODO: In current cmake version, PY_RUNTIME_ENV behaves strange with multiple paths,
-    # so we have to specify only one path in it to work around the issue. We need this if/else
-    # to protect overwriting CUDA environments
-    set(PY_RUNTIME_ENV ${mkl_BIN_DIRS})
-    add_custom_command(
-          OUTPUT ${api_init_files}
-          DEPENDS tf_python_ops tf_python_copy_scripts_to_destination pywrap_tensorflow_internal tf_python_touchup_modules tf_extension_ops
+    file(TO_CMAKE_PATH "$ENV{PATH}" PY_RUNTIME_ENV)
+    set(PY_RUNTIME_ENV ${mkl_BIN_DIRS} ${PY_RUNTIME_ENV})
+    file(TO_NATIVE_PATH "${PY_RUNTIME_ENV}" PY_RUNTIME_ENV)
+    set(PY_RUNTIME_ENV "PATH=${PY_RUNTIME_ENV}")
+endif(tensorflow_ENABLE_MKL_SUPPORT)
 
-          # tensorflow/__init__.py depends on files generated in this step. So, remove it while
-          # this step is running since the files aren't there yet.
-          COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py
+add_custom_command(
+      OUTPUT ${api_init_files}
+      DEPENDS tf_python_ops tf_python_copy_scripts_to_destination pywrap_tensorflow_internal tf_python_touchup_modules tf_extension_ops
 
-          # Run create_python_api.py to generate API init files.
-          COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${CMAKE_CURRENT_BINARY_DIR}/tf_python PATH=${PY_RUNTIME_ENV} ${PYTHON_EXECUTABLE}
-                  "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/tools/api/generator/create_python_api.py"
-                  "--root_init_template=${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/api_template.__init__.py"
-                  "--apidir=${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow"
-                  "--package=tensorflow.python"
-                  "--apiname=tensorflow"
-                  "${api_init_list_file}"
+      # tensorflow/__init__.py depends on files generated in this step. So, remove it while
+      # this step is running since the files aren't there yet.
+      COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py
 
-          COMMENT "Generating __init__.py files for Python API."
-          WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/tf_python"
-          VERBATIM
-    )
-else (tensorflow_ENABLE_MKL_SUPPORT)
-    add_custom_command(
-          OUTPUT ${api_init_files}
-          DEPENDS tf_python_ops tf_python_copy_scripts_to_destination pywrap_tensorflow_internal tf_python_touchup_modules tf_extension_ops
+      # Run create_python_api.py to generate API init files.
+      COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${CMAKE_CURRENT_BINARY_DIR}/tf_python "${PY_RUNTIME_ENV}" ${PYTHON_EXECUTABLE}
+              "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/tools/api/generator/create_python_api.py"
+              "--root_init_template=${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/api_template.__init__.py"
+              "--apidir=${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow"
+              "--package=tensorflow.python"
+              "--apiname=tensorflow"
+              "${api_init_list_file}"
 
-          # tensorflow/__init__.py depends on files generated in this step. So, remove it while
-          # this step is running since the files aren't there yet.
-          COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py
-
-          # Run create_python_api.py to generate API init files.
-          COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${CMAKE_CURRENT_BINARY_DIR}/tf_python ${PYTHON_EXECUTABLE}
-                  "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/tools/api/generator/create_python_api.py"
-                  "--root_init_template=${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/api_template.__init__.py"
-                  "--apidir=${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow"
-                  "--package=tensorflow.python"
-                  "--apiname=tensorflow"
-                  "${api_init_list_file}"
-
-          COMMENT "Generating __init__.py files for Python API."
-          WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/tf_python"
-    )
-endif (tensorflow_ENABLE_MKL_SUPPORT)
+      COMMENT "Generating __init__.py files for Python API."
+      WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/tf_python"
+      VERBATIM
+)
 
 add_custom_target(tf_python_api SOURCES ${api_init_files})
 add_dependencies(tf_python_api tf_python_ops)
@@ -848,12 +831,12 @@
       DEPENDS tf_python_ops tf_python_copy_scripts_to_destination pywrap_tensorflow_internal tf_python_touchup_modules tf_extension_ops
 
       # Run create_python_api.py to generate API init files.
-      COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${CMAKE_CURRENT_BINARY_DIR}/tf_python ${PYTHON_EXECUTABLE}
+      COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${CMAKE_CURRENT_BINARY_DIR}/tf_python "${PY_RUNTIME_ENV}" ${PYTHON_EXECUTABLE}
               "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/tools/api/generator/create_python_api.py"
               "--apidir=${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/estimator/api"
               "--package=tensorflow.python.estimator"
               "--apiname=estimator"
-	      "--output_package=tensorflow.python.estimator.api"
+          "--output_package=tensorflow.python.estimator.api"
               "${estimator_api_init_list_file}"
 
       COMMENT "Generating __init__.py files for Python API."
diff --git a/tensorflow/contrib/cmake/tf_tests.cmake b/tensorflow/contrib/cmake/tf_tests.cmake
index b2330c4e..2c878c1 100644
--- a/tensorflow/contrib/cmake/tf_tests.cmake
+++ b/tensorflow/contrib/cmake/tf_tests.cmake
@@ -122,6 +122,17 @@
   endforeach()
 endfunction(AddPythonTests)
 
+#
+# Check that every listed element is an existing file (SEND_ERROR otherwise).
+#
+function(CheckExists SOURCES)
+  foreach(source ${SOURCES} ${ARGN})
+    if(NOT EXISTS ${source})
+      message(SEND_ERROR "file not found: ${source}")
+    endif()
+  endforeach(source)
+endfunction(CheckExists)
+
 if (tensorflow_BUILD_PYTHON_TESTS)
   #
   # python tests. This assumes that the tensorflow wheel is
@@ -145,7 +156,6 @@
     "${tensorflow_source_dir}/tensorflow/python/debug/wrappers/*_test.py"
     "${tensorflow_source_dir}/tensorflow/contrib/estimator/python/estimator/*_test.py"
     "${tensorflow_source_dir}/tensorflow/python/kernel_tests/*.py"
-    "${tensorflow_source_dir}/tensorflow/python/meta_graph_transform/*_test.py"
     "${tensorflow_source_dir}/tensorflow/python/ops/quantized_conv_ops_test.py"
     "${tensorflow_source_dir}/tensorflow/python/ops/quantized_ops_test.py"
     "${tensorflow_source_dir}/tensorflow/python/platform/build_info_test.py"
@@ -198,7 +208,6 @@
     "${tensorflow_source_dir}/tensorflow/python/saved_model/saved_model_test.py"
     "${tensorflow_source_dir}/tensorflow/contrib/image/python/kernel_tests/sparse_image_warp_test.py"
     # requires scipy
-    "${tensorflow_source_dir}/tensorflow/contrib/keras/python/keras/preprocessing/*_test.py"
     "${tensorflow_source_dir}/tensorflow/contrib/tfprof/python/tools/tfprof/pprof_profiler_test.py"
     "${tensorflow_source_dir}/tensorflow/contrib/image/python/kernel_tests/interpolate_spline_test.py"
     # Takes very long to run without sharding (defined in bazel build file).
@@ -256,10 +265,9 @@
       # Flaky because of local cluster creation.
       "${tensorflow_source_dir}/tensorflow/python/training/sync_replicas_optimizer_test.py"
       "${tensorflow_source_dir}/tensorflow/python/debug/lib/session_debug_grpc_test.py"
-      "${tensorflow_source_dir}tensorflow/python/training/localhost_cluster_performance_test.py"
+      "${tensorflow_source_dir}/tensorflow/python/training/localhost_cluster_performance_test.py"
       "${tensorflow_source_dir}/tensorflow/python/data/kernel_tests/iterator_ops_cluster_test.py"
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/functional_ops_test.py"
-      "${tensorflow_source_dir}/tensorflow/contrib/data/python/kernel_tests/iterator_ops_cluster_test.py"
       # Type error in testRemoteIteratorUsingRemoteCallOpDirectSessionGPUCPU.
       "${tensorflow_source_dir}/tensorflow/python/data/kernel_tests/iterator_ops_test.py"
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/self_adjoint_eig_op_test.py"
@@ -329,6 +337,7 @@
       "${tensorflow_source_dir}/tensorflow/python/keras/_impl/keras/utils/io_utils_test.py"  # b/72894325
   )
   endif()
+  CheckExists(${tf_test_src_py_exclude})
   list(REMOVE_ITEM tf_test_src_py ${tf_test_src_py_exclude})
 
   AddPythonTests(
@@ -480,6 +489,7 @@
     "${tensorflow_source_dir}/tensorflow/cc/saved_model/*_test.cc"
   )
 
+  CheckExists(${tf_test_src_simple_exclude})
   list(REMOVE_ITEM tf_test_src_simple
     ${tf_test_src_simple_exclude}
     ${tf_cc_saved_model_test_srcs}
@@ -494,6 +504,7 @@
     ${tf_core_profiler_test_srcs}
   )
 
+  CheckExists(${tf_src_testlib})
   set(tf_test_lib tf_test_lib)
   add_library(${tf_test_lib} STATIC ${tf_src_testlib})
 
diff --git a/tensorflow/contrib/constrained_optimization/python/candidates.py b/tensorflow/contrib/constrained_optimization/python/candidates.py
index ac86a67..66d7ebe 100644
--- a/tensorflow/contrib/constrained_optimization/python/candidates.py
+++ b/tensorflow/contrib/constrained_optimization/python/candidates.py
@@ -204,7 +204,7 @@
   assert best_pp is not None
 
   # Throughout this loop, a maximum_violation of "lower" is not achievable,
-  # but a maximum_violation of "upper" is achiveable.
+  # but a maximum_violation of "upper" is achievable.
   while True:
     middle = 0.5 * (lower + upper)
     if (middle - lower <= epsilon) or (upper - middle <= epsilon):
diff --git a/tensorflow/contrib/constrained_optimization/python/constrained_minimization_problem.py b/tensorflow/contrib/constrained_optimization/python/constrained_minimization_problem.py
index 70813fb..41258ed 100644
--- a/tensorflow/contrib/constrained_optimization/python/constrained_minimization_problem.py
+++ b/tensorflow/contrib/constrained_optimization/python/constrained_minimization_problem.py
@@ -72,7 +72,8 @@
     else:
       proxy_constraints_shape = self.proxy_constraints.get_shape()
 
-    if (constraints_shape is None or proxy_constraints_shape is None or
+    if (constraints_shape.ndims is None or
+        proxy_constraints_shape.ndims is None or
         any([ii is None for ii in constraints_shape.as_list()]) or
         any([ii is None for ii in proxy_constraints_shape.as_list()])):
       raise ValueError(
@@ -121,3 +122,19 @@
       A tensor of proxy constraint functions.
     """
     return None
+
+  # This is a property, instead of an abstract property, since it doesn't need
+  # to be overridden: if pre_train_ops returns None, then there are no ops to
+  # run before train_op.
+  @property
+  def pre_train_ops(self):
+    """Returns a list of `Operation`s to run before the train_op.
+
+    When a `ConstrainedOptimizer` creates a train_op (in `minimize`,
+    `minimize_unconstrained`, or `minimize_constrained`), it will include these
+    ops before the main training step.
+
+    Returns:
+      A list of `Operation`s, or `None` if there are no pre-train ops.
+    """
+    return None
diff --git a/tensorflow/contrib/constrained_optimization/python/constrained_optimizer.py b/tensorflow/contrib/constrained_optimization/python/constrained_optimizer.py
index 8055545..0b79bdf 100644
--- a/tensorflow/contrib/constrained_optimization/python/constrained_optimizer.py
+++ b/tensorflow/contrib/constrained_optimization/python/constrained_optimizer.py
@@ -55,20 +55,21 @@
     """Returns the `tf.train.Optimizer` used for optimization."""
     return self._optimizer
 
-  def minimize_unconstrained(self,
-                             minimization_problem,
-                             global_step=None,
-                             var_list=None,
-                             gate_gradients=train_optimizer.Optimizer.GATE_OP,
-                             aggregation_method=None,
-                             colocate_gradients_with_ops=False,
-                             name=None,
-                             grad_loss=None):
-    """Returns an `Op` for minimizing the unconstrained problem.
+  @abc.abstractmethod
+  def _minimize_constrained(self,
+                            minimization_problem,
+                            global_step=None,
+                            var_list=None,
+                            gate_gradients=train_optimizer.Optimizer.GATE_OP,
+                            aggregation_method=None,
+                            colocate_gradients_with_ops=False,
+                            name=None,
+                            grad_loss=None):
+    """Version of `minimize_constrained` to be overridden by subclasses.
 
-    Unlike `minimize_constrained`, this function ignores the `constraints` (and
-    `proxy_constraints`) portion of the minimization problem entirely, and only
-    minimizes `objective`.
+    Implementations of this method should ignore the `pre_train_ops` property of
+    the `minimization_problem`. The public `minimize_constrained` method will
+    take care of executing these before the returned train_op.
 
     Args:
       minimization_problem: ConstrainedMinimizationProblem, the problem to
@@ -83,19 +84,10 @@
       grad_loss: as in `tf.train.Optimizer`'s `minimize` method.
 
     Returns:
-      TensorFlow Op.
+      `Operation`, the train_op.
     """
-    return self.optimizer.minimize(
-        minimization_problem.objective,
-        global_step=global_step,
-        var_list=var_list,
-        gate_gradients=gate_gradients,
-        aggregation_method=aggregation_method,
-        colocate_gradients_with_ops=colocate_gradients_with_ops,
-        name=name,
-        grad_loss=grad_loss)
+    pass
 
-  @abc.abstractmethod
   def minimize_constrained(self,
                            minimization_problem,
                            global_step=None,
@@ -105,7 +97,7 @@
                            colocate_gradients_with_ops=False,
                            name=None,
                            grad_loss=None):
-    """Returns an `Op` for minimizing the constrained problem.
+    """Returns an `Operation` for minimizing the constrained problem.
 
     Unlike `minimize_unconstrained`, this function attempts to find a solution
     that minimizes the `objective` portion of the minimization problem while
@@ -124,9 +116,83 @@
       grad_loss: as in `tf.train.Optimizer`'s `minimize` method.
 
     Returns:
-      TensorFlow Op.
+      `Operation`, the train_op.
     """
-    pass
+
+    def train_op_callback():
+      return self._minimize_constrained(
+          minimization_problem,
+          global_step=global_step,
+          var_list=var_list,
+          gate_gradients=gate_gradients,
+          aggregation_method=aggregation_method,
+          colocate_gradients_with_ops=colocate_gradients_with_ops,
+          name=name,
+          grad_loss=grad_loss)
+
+    # If we have pre_train_ops, use tf.control_dependencies() to ensure that
+    # they execute before the train_op.
+    pre_train_ops = minimization_problem.pre_train_ops
+    if pre_train_ops:
+      with ops.control_dependencies(pre_train_ops):
+        train_op = train_op_callback()
+    else:
+      train_op = train_op_callback()
+
+    return train_op
+
+  def minimize_unconstrained(self,
+                             minimization_problem,
+                             global_step=None,
+                             var_list=None,
+                             gate_gradients=train_optimizer.Optimizer.GATE_OP,
+                             aggregation_method=None,
+                             colocate_gradients_with_ops=False,
+                             name=None,
+                             grad_loss=None):
+    """Returns an `Operation` for minimizing the unconstrained problem.
+
+    Unlike `minimize_constrained`, this function ignores the `constraints` (and
+    `proxy_constraints`) portion of the minimization problem entirely, and only
+    minimizes `objective`.
+
+    Args:
+      minimization_problem: ConstrainedMinimizationProblem, the problem to
+        optimize.
+      global_step: as in `tf.train.Optimizer`'s `minimize` method.
+      var_list: as in `tf.train.Optimizer`'s `minimize` method.
+      gate_gradients: as in `tf.train.Optimizer`'s `minimize` method.
+      aggregation_method: as in `tf.train.Optimizer`'s `minimize` method.
+      colocate_gradients_with_ops: as in `tf.train.Optimizer`'s `minimize`
+        method.
+      name: as in `tf.train.Optimizer`'s `minimize` method.
+      grad_loss: as in `tf.train.Optimizer`'s `minimize` method.
+
+    Returns:
+      `Operation`, the train_op.
+    """
+
+    def train_op_callback():
+      return self.optimizer.minimize(
+          minimization_problem.objective,
+          global_step=global_step,
+          var_list=var_list,
+          gate_gradients=gate_gradients,
+          aggregation_method=aggregation_method,
+          colocate_gradients_with_ops=colocate_gradients_with_ops,
+          name=name,
+          grad_loss=grad_loss)
+
+    # If we have pre_train_ops, use tf.control_dependencies() to ensure that
+    # they execute before the train_op.
+    pre_train_ops = minimization_problem.pre_train_ops
+    if pre_train_ops:
+      with ops.control_dependencies(pre_train_ops):
+        train_op = train_op_callback()
+    else:
+      train_op = train_op_callback()
+
+    return train_op
 
   def minimize(self,
                minimization_problem,
@@ -138,7 +204,7 @@
                colocate_gradients_with_ops=False,
                name=None,
                grad_loss=None):
-    """Returns an `Op` for minimizing the constrained problem.
+    """Returns an `Operation` for minimizing the constrained problem.
 
     This method combines the functionality of `minimize_unconstrained` and
     `minimize_constrained`. If global_step < unconstrained_steps, it will
@@ -164,14 +230,14 @@
       grad_loss: as in `tf.train.Optimizer`'s `minimize` method.
 
     Returns:
-      TensorFlow Op.
+      `Operation`, the train_op.
 
     Raises:
       ValueError: If unconstrained_steps is provided, but global_step is not.
     """
 
     def unconstrained_fn():
-      """Returns an `Op` for minimizing the unconstrained problem."""
+      """Returns an `Operation` for minimizing the unconstrained problem."""
       return self.minimize_unconstrained(
           minimization_problem=minimization_problem,
           global_step=global_step,
@@ -183,7 +249,7 @@
           grad_loss=grad_loss)
 
     def constrained_fn():
-      """Returns an `Op` for minimizing the constrained problem."""
+      """Returns an `Operation` for minimizing the constrained problem."""
       return self.minimize_constrained(
           minimization_problem=minimization_problem,
           global_step=global_step,
diff --git a/tensorflow/contrib/constrained_optimization/python/external_regret_optimizer.py b/tensorflow/contrib/constrained_optimization/python/external_regret_optimizer.py
index 01c6e4f..d1af15f 100644
--- a/tensorflow/contrib/constrained_optimization/python/external_regret_optimizer.py
+++ b/tensorflow/contrib/constrained_optimization/python/external_regret_optimizer.py
@@ -70,11 +70,13 @@
       region w.r.t. the Euclidean norm.
 
   Raises:
-    ValueError: if the `multipliers` tensor does not have a fully-known shape,
-      or is not one-dimensional.
+    ValueError: if the `multipliers` tensor is not floating-point, does not have
+      a fully-known shape, or is not one-dimensional.
   """
+  if not multipliers.dtype.is_floating:
+    raise ValueError("multipliers must have a floating-point dtype")
   multipliers_shape = multipliers.get_shape()
-  if multipliers_shape is None:
+  if multipliers_shape.ndims is None:
     raise ValueError("multipliers must have known shape")
   if multipliers_shape.ndims != 1:
     raise ValueError(
@@ -101,12 +103,12 @@
         (radius - standard_ops.reduce_sum(multipliers)) / standard_ops.maximum(
             1.0, standard_ops.reduce_sum(inactive)))
     multipliers += scale * inactive
-    new_inactive = standard_ops.to_float(multipliers > 0)
+    new_inactive = standard_ops.cast(multipliers > 0, multipliers.dtype)
     multipliers *= new_inactive
     return (iteration, multipliers, new_inactive, inactive)
 
   iteration = standard_ops.constant(0)
-  inactive = standard_ops.ones_like(multipliers)
+  inactive = standard_ops.ones_like(multipliers, dtype=multipliers.dtype)
 
   # We actually want a do-while loop, so we explicitly call while_loop_body()
   # once before tf.while_loop().
@@ -189,16 +191,16 @@
   def _projection_op(self, state, name=None):
     pass
 
-  def minimize_constrained(self,
-                           minimization_problem,
-                           global_step=None,
-                           var_list=None,
-                           gate_gradients=train_optimizer.Optimizer.GATE_OP,
-                           aggregation_method=None,
-                           colocate_gradients_with_ops=False,
-                           name=None,
-                           grad_loss=None):
-    """Returns an `Op` for minimizing the constrained problem.
+  def _minimize_constrained(self,
+                            minimization_problem,
+                            global_step=None,
+                            var_list=None,
+                            gate_gradients=train_optimizer.Optimizer.GATE_OP,
+                            aggregation_method=None,
+                            colocate_gradients_with_ops=False,
+                            name=None,
+                            grad_loss=None):
+    """Returns an `Operation` for minimizing the constrained problem.
 
     The `optimizer` constructor parameter will be used to update the model
     parameters, while the Lagrange multipliers will be updated using
@@ -216,8 +218,11 @@
       name: as in `tf.train.Optimizer`'s `minimize` method.
       grad_loss: as in `tf.train.Optimizer`'s `minimize` method.
 
+    Raises:
+      ValueError: If the minimization_problem tensors have different dtypes.
+
     Returns:
-      TensorFlow Op.
+      `Operation`, the train_op.
     """
     objective = minimization_problem.objective
 
@@ -225,6 +230,14 @@
     proxy_constraints = minimization_problem.proxy_constraints
     if proxy_constraints is None:
       proxy_constraints = constraints
+
+    # Make sure that the objective, constraints and proxy constraints all have
+    # the same dtype.
+    if (objective.dtype.base_dtype != constraints.dtype.base_dtype or
+        objective.dtype.base_dtype != proxy_constraints.dtype.base_dtype):
+      raise ValueError("objective, constraints and proxy_constraints must "
+                       "have the same dtype")
+
     # Flatten both constraints tensors to 1d.
     num_constraints = minimization_problem.num_constraints
     constraints = standard_ops.reshape(constraints, shape=(num_constraints,))
@@ -241,8 +254,10 @@
 
     multipliers = self._lagrange_multipliers(state)
     loss = (
-        objective + standard_ops.tensordot(multipliers, proxy_constraints, 1))
-    multipliers_gradient = constraints
+        objective + standard_ops.tensordot(
+            standard_ops.cast(multipliers, proxy_constraints.dtype),
+            proxy_constraints, 1))
+    multipliers_gradient = standard_ops.cast(constraints, multipliers.dtype)
 
     update_ops = []
     if self.constraint_optimizer is None:
@@ -356,6 +371,8 @@
     # For an AdditiveExternalRegretOptimizer, the internal state is simply a
     # tensor of Lagrange multipliers with shape (m,), where m is the number of
     # constraints.
+    #
+    # FUTURE WORK: make the dtype a parameter.
     return standard_ops.zeros((num_constraints,), dtype=dtypes.float32)
 
   def _lagrange_multipliers(self, state):
diff --git a/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py b/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py
index 3791dae..2c673d9 100644
--- a/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py
+++ b/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py
@@ -79,9 +79,11 @@
     The maximal right-eigenvector of `matrix`.
 
   Raises:
-    ValueError: If the epsilon or maximum_iterations parameters violate their
-      bounds.
+    ValueError: If the `matrix` tensor is not floating-point, or if the
+      `epsilon` or `maximum_iterations` parameters violate their bounds.
   """
+  if not matrix.dtype.is_floating:
+    raise ValueError("matrix must have a floating-point dtype")
   if epsilon <= 0.0:
     raise ValueError("epsilon must be strictly positive")
   if maximum_iterations <= 0:
@@ -139,18 +141,20 @@
       (i.e. the Frobenius norm).
 
   Raises:
-    ValueError: if the `matrix` tensor does not have a fully-known shape, or is
-      not two-dimensional and square.
+    ValueError: if the `matrix` tensor is not floating-point, does not have a
+      fully-known shape, or is not two-dimensional and square.
   """
+  if not matrix.dtype.is_floating:
+    raise ValueError("matrix must have a floating-point dtype")
   matrix_shape = matrix.get_shape()
-  if matrix_shape is None:
+  if matrix_shape.ndims is None:
     raise ValueError("matrix must have known shape")
   if matrix_shape.ndims != 2:
     raise ValueError(
         "matrix must be two dimensional (instead is %d-dimensional)" %
         matrix_shape.ndims)
   if matrix_shape[0] != matrix_shape[1]:
-    raise ValueError("matrix must be be square (instead has shape (%d,%d))" %
+    raise ValueError("matrix must be square (instead has shape (%d,%d))" %
                      (matrix_shape[0], matrix_shape[1]))
   dimension = matrix_shape[0].value
   if dimension is None:
@@ -172,12 +176,12 @@
         matrix, axis=0, keepdims=True)) / standard_ops.maximum(
             1.0, standard_ops.reduce_sum(inactive, axis=0, keepdims=True))
     matrix += scale * inactive
-    new_inactive = standard_ops.to_float(matrix > 0)
+    new_inactive = standard_ops.cast(matrix > 0, matrix.dtype)
     matrix *= new_inactive
     return (iteration, matrix, new_inactive, inactive)
 
   iteration = standard_ops.constant(0)
-  inactive = standard_ops.ones_like(matrix)
+  inactive = standard_ops.ones_like(matrix, dtype=matrix.dtype)
 
   # We actually want a do-while loop, so we explicitly call while_loop_body()
   # once before tf.while_loop().
@@ -218,7 +222,7 @@
   """Base class representing a `_SwapRegretOptimizer`.
 
   This class contains most of the logic for performing constrained optimization,
-  minimizing external regret for the constraints player. What it *doesn't* do is
+  minimizing swap regret for the constraints player. What it *doesn't* do is
   keep track of the internal state (the stochastic matrix).  Instead, the state
   is accessed via the _initial_state(), _stochastic_matrix(),
   _constraint_grad_and_var() and _projection_op() methods.
@@ -291,16 +295,16 @@
   def _projection_op(self, state, name=None):
     pass
 
-  def minimize_constrained(self,
-                           minimization_problem,
-                           global_step=None,
-                           var_list=None,
-                           gate_gradients=train_optimizer.Optimizer.GATE_OP,
-                           aggregation_method=None,
-                           colocate_gradients_with_ops=False,
-                           name=None,
-                           grad_loss=None):
-    """Returns an `Op` for minimizing the constrained problem.
+  def _minimize_constrained(self,
+                            minimization_problem,
+                            global_step=None,
+                            var_list=None,
+                            gate_gradients=train_optimizer.Optimizer.GATE_OP,
+                            aggregation_method=None,
+                            colocate_gradients_with_ops=False,
+                            name=None,
+                            grad_loss=None):
+    """Returns an `Operation` for minimizing the constrained problem.
 
     The `optimizer` constructor parameter will be used to update the model
     parameters, while the constraint/objective weight matrix (the analogue of
@@ -320,8 +324,11 @@
       name: as in `tf.train.Optimizer`'s `minimize` method.
       grad_loss: as in `tf.train.Optimizer`'s `minimize` method.
 
+    Raises:
+      ValueError: If the minimization_problem tensors have different dtypes.
+
     Returns:
-      TensorFlow Op.
+      `Operation`, the train_op.
     """
     objective = minimization_problem.objective
 
@@ -329,6 +336,14 @@
     proxy_constraints = minimization_problem.proxy_constraints
     if proxy_constraints is None:
       proxy_constraints = constraints
+
+    # Make sure that the objective, constraints and proxy constraints all have
+    # the same dtype.
+    if (objective.dtype.base_dtype != constraints.dtype.base_dtype or
+        objective.dtype.base_dtype != proxy_constraints.dtype.base_dtype):
+      raise ValueError("objective, constraints and proxy_constraints must "
+                       "have the same dtype")
+
     # Flatten both constraints tensors to 1d.
     num_constraints = minimization_problem.num_constraints
     constraints = standard_ops.reshape(constraints, shape=(num_constraints,))
@@ -344,15 +359,18 @@
         name="swap_regret_optimizer_state")
 
     zero_and_constraints = standard_ops.concat(
-        (standard_ops.zeros((1,)), constraints), axis=0)
+        (standard_ops.zeros((1,), dtype=constraints.dtype), constraints),
+        axis=0)
     objective_and_proxy_constraints = standard_ops.concat(
         (standard_ops.expand_dims(objective, 0), proxy_constraints), axis=0)
 
     distribution = self._distribution(state)
-    loss = standard_ops.tensordot(distribution, objective_and_proxy_constraints,
-                                  1)
+    loss = standard_ops.tensordot(
+        standard_ops.cast(distribution, objective_and_proxy_constraints.dtype),
+        objective_and_proxy_constraints, 1)
     matrix_gradient = standard_ops.matmul(
-        standard_ops.expand_dims(zero_and_constraints, 1),
+        standard_ops.expand_dims(
+            standard_ops.cast(zero_and_constraints, distribution.dtype), 1),
         standard_ops.expand_dims(distribution, 0))
 
     update_ops = []
@@ -555,6 +573,7 @@
     log_initial_one = math.log(1.0 - (self._initial_multiplier_radius *
                                       (dimension - 1) / (dimension)))
     log_initial_zero = math.log(self._initial_multiplier_radius / dimension)
+    # FUTURE WORK: make the dtype a parameter.
     return standard_ops.concat(
         (standard_ops.constant(
             log_initial_one, dtype=dtypes.float32, shape=(1, dimension)),
diff --git a/tensorflow/contrib/crf/__init__.py b/tensorflow/contrib/crf/__init__.py
index 615e62b..fe5e34d 100644
--- a/tensorflow/contrib/crf/__init__.py
+++ b/tensorflow/contrib/crf/__init__.py
@@ -14,7 +14,7 @@
 # ==============================================================================
 """Linear-chain CRF layer.
 
-See the @{$python/contrib.crf} guide.
+See the [CRF](https://tensorflow.org/api_guides/python/contrib.crf) guide.
 
 @@crf_binary_score
 @@crf_decode
diff --git a/tensorflow/contrib/crf/python/kernel_tests/crf_test.py b/tensorflow/contrib/crf/python/kernel_tests/crf_test.py
index f56a973..8cfe142 100644
--- a/tensorflow/contrib/crf/python/kernel_tests/crf_test.py
+++ b/tensorflow/contrib/crf/python/kernel_tests/crf_test.py
@@ -158,7 +158,7 @@
     # Test both the length-1 and regular cases.
     sequence_lengths_list = [
         np.array(3, dtype=np.int32),
-        np.array(1, dtype=np.int32)
+        np.array(1, dtype=np.int64)
     ]
     inputs_list = [
         np.array([[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]],
@@ -291,7 +291,7 @@
     # Test both the length-1 and regular cases.
     sequence_lengths_list = [
         np.array(3, dtype=np.int32),
-        np.array(1, dtype=np.int32)
+        np.array(1, dtype=np.int64)
     ]
     inputs_list = [
         np.array([[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]],
diff --git a/tensorflow/contrib/crf/python/ops/crf.py b/tensorflow/contrib/crf/python/ops/crf.py
index 8a7ff61..2a91dcb 100644
--- a/tensorflow/contrib/crf/python/ops/crf.py
+++ b/tensorflow/contrib/crf/python/ops/crf.py
@@ -548,7 +548,9 @@
     initial_state = array_ops.squeeze(initial_state, axis=[1])  # [B, O]
     inputs = array_ops.slice(potentials, [0, 1, 0], [-1, -1, -1])  # [B, T-1, O]
     # Sequence length is not allowed to be less than zero.
-    sequence_length_less_one = math_ops.maximum(0, sequence_length - 1)
+    sequence_length_less_one = math_ops.maximum(
+        constant_op.constant(0, dtype=sequence_length.dtype),
+        sequence_length - 1)
     backpointers, last_score = rnn.dynamic_rnn(  # [B, T - 1, O], [B, O]
         crf_fwd_cell,
         inputs=inputs,
diff --git a/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py b/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py
index d58198f..e26d56c 100644
--- a/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py
+++ b/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py
@@ -56,7 +56,7 @@
   Cudnn RNNs have two major differences from other platform-independent RNNs tf
   provides:
   * Cudnn LSTM and GRU are mathematically different from their tf counterparts.
-    (e.g. @{tf.contrib.rnn.LSTMBlockCell} and @{tf.nn.rnn_cell.GRUCell}.
+    (e.g. `tf.contrib.rnn.LSTMBlockCell` and `tf.nn.rnn_cell.GRUCell`.
   * Cudnn-trained checkpoints are not directly compatible with tf RNNs:
     * They use a single opaque parameter buffer for the entire (possibly)
       multi-layer multi-directional RNN; Whereas tf RNN weights are per-cell and
@@ -182,7 +182,7 @@
       dropout: dropout rate, a number between [0, 1]. Dropout is applied between
           each layer (no dropout is applied for a model with a single layer).
           When set to 0, dropout is disabled.
-      seed: the op seed used for initializing dropout. See @{tf.set_random_seed}
+      seed: the op seed used for initializing dropout. See `tf.set_random_seed`
           for behavior.
       dtype: tf.float16, tf.float32 or tf.float64
       kernel_initializer: starting value to initialize the weight.
diff --git a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
index 748d7cd..2c92f31 100644
--- a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
+++ b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
@@ -61,8 +61,8 @@
 class CudnnCompatibleLSTMCell(lstm_ops.LSTMBlockCell):
   """Cudnn Compatible LSTMCell.
 
-  A simple wrapper around @{tf.contrib.rnn.LSTMBlockCell} to use along with
-  @{tf.contrib.cudnn_rnn.CudnnLSTM}. The latter's params can be used by
+  A simple wrapper around `tf.contrib.rnn.LSTMBlockCell` to use along with
+  `tf.contrib.cudnn_rnn.CudnnLSTM`. The latter's params can be used by
   this cell seamlessly.
   """
 
@@ -76,8 +76,8 @@
 class CudnnCompatibleGRUCell(rnn_cell_impl.GRUCell):
   """Cudnn Compatible GRUCell.
 
-  A GRU impl akin to @{tf.nn.rnn_cell.GRUCell} to use along with
-  @{tf.contrib.cudnn_rnn.CudnnGRU}. The latter's params can be used by
+  A GRU impl akin to `tf.nn.rnn_cell.GRUCell` to use along with
+  `tf.contrib.cudnn_rnn.CudnnGRU`. The latter's params can be used by
   it seamlessly.
 
   It differs from platform-independent GRUs in how the new memory gate is
@@ -97,7 +97,7 @@
   $$h_t = (1 - u_t) .* h'_t + u_t .* h_t-1$$
   ```
 
-  Other GRU (see @{tf.nn.rnn_cell.GRUCell} and @{tf.contrib.rnn.GRUBlockCell}):
+  Other GRU (see `tf.nn.rnn_cell.GRUCell` and `tf.contrib.rnn.GRUBlockCell`):
   ```python
   # new memory gate
   \\(h'_t = tanh(x_t * W_h + (r_t .* h_t-1) * R_h + b_{Wh})\\)
@@ -891,7 +891,7 @@
     direction: the direction model that the model operates. Could be either
         'unidirectional' or 'bidirectional'
     dropout: whether to enable dropout. With it is 0, dropout is disabled.
-    seed: the op seed used for initializing dropout. See @{tf.set_random_seed}
+    seed: the op seed used for initializing dropout. See `tf.set_random_seed`
         for behavior.
     name: name of the operation.
   Returns:
@@ -957,7 +957,7 @@
     direction: the direction model that the model operates. Could be either
         'unidirectional' or 'bidirectional'
     dropout: whether to enable dropout. With it is 0, dropout is disabled.
-    seed: the op seed used for initializing dropout. See @{tf.set_random_seed}
+    seed: the op seed used for initializing dropout. See `tf.set_random_seed`
         for behavior.
     name: name of the operation.
   Returns:
@@ -998,7 +998,7 @@
     direction: the direction model that the model operates. Could be either
         'unidirectional' or 'bidirectional'
     dropout: whether to enable dropout. With it is 0, dropout is disabled.
-    seed: the op seed used for initializing dropout. See @{tf.set_random_seed}
+    seed: the op seed used for initializing dropout. See `tf.set_random_seed`
         for behavior.
     name: name of the operation.
   Returns:
@@ -1040,7 +1040,7 @@
     direction: the direction model that the model operates. Could be either
         'unidirectional' or 'bidirectional'
     dropout: whether to enable dropout. With it is 0, dropout is disabled.
-    seed: the op seed used for initializing dropout. See @{tf.set_random_seed}
+    seed: the op seed used for initializing dropout. See `tf.set_random_seed`
         for behavior.
     name: name of the operation.
   Returns:
@@ -1079,7 +1079,7 @@
     direction: the direction model that the model operates. Could be either
         'unidirectional' or 'bidirectional'
     dropout: whether to enable dropout. With it is 0, dropout is disabled.
-    seed: the op seed used for initializing dropout. See @{tf.set_random_seed}
+    seed: the op seed used for initializing dropout. See `tf.set_random_seed`
         for behavior.
     name: name of the operation.
   Returns:
@@ -1119,7 +1119,7 @@
     direction: the direction model that the model operates. Could be either
         'unidirectional' or 'bidirectional'
     dropout: whether to enable dropout. With it is 0, dropout is disabled.
-    seed: the op seed used for initializing dropout. See @{tf.set_random_seed}
+    seed: the op seed used for initializing dropout. See `tf.set_random_seed`
         for behavior.
     name: name of the operation.
   Returns:
@@ -1161,7 +1161,7 @@
     direction: the direction model that the model operates. Could be either
         'unidirectional' or 'bidirectional'
     dropout: whether to enable dropout. With it is 0, dropout is disabled.
-    seed: the op seed used for initializing dropout. See @{tf.set_random_seed}
+    seed: the op seed used for initializing dropout. See `tf.set_random_seed`
         for behavior.
     name: name of the operation.
   Returns:
@@ -1224,7 +1224,7 @@
     direction: the direction model that the model operates. Could be either
         'unidirectional' or 'bidirectional'
     dropout: whether to enable dropout. With it is 0, dropout is disabled.
-    seed: the op seed used for initializing dropout. See @{tf.set_random_seed}
+    seed: the op seed used for initializing dropout. See `tf.set_random_seed`
         for behavior.
     name: name of the operation.
   Returns:
@@ -1282,7 +1282,7 @@
         'unidirectional' or 'bidirectional'
     dtype: one of tf.float32 or tf.float64.
     dropout: whether to enable dropout. With it is 0, dropout is disabled.
-    seed: the op seed used for initializing dropout. See @{tf.set_random_seed}
+    seed: the op seed used for initializing dropout. See `tf.set_random_seed`
         for behavior.
     name: name of the operation.
   Returns:
@@ -1349,7 +1349,7 @@
           'unidirectional' or 'bidirectional'
       dtype: dtype of params, tf.float32 or tf.float64.
       dropout: whether to enable dropout. With it is 0, dropout is disabled.
-      seed: the op seed used for initializing dropout. See @{tf.set_random_seed}
+      seed: the op seed used for initializing dropout. See `tf.set_random_seed`
           for behavior.
     Raises:
       ValueError: if direction is invalid.
diff --git a/tensorflow/contrib/data/__init__.py b/tensorflow/contrib/data/__init__.py
index 7878e46..5821d51 100644
--- a/tensorflow/contrib/data/__init__.py
+++ b/tensorflow/contrib/data/__init__.py
@@ -15,12 +15,12 @@
 """Experimental API for building input pipelines.
 
 This module contains experimental `Dataset` sources and transformations that can
-be used in conjunction with the @{tf.data.Dataset} API. Note that the
+be used in conjunction with the `tf.data.Dataset` API. Note that the
 `tf.contrib.data` API is not subject to the same backwards compatibility
 guarantees as `tf.data`, but we will provide deprecation advice in advance of
 removing existing functionality.
 
-See @{$guide/datasets$Importing Data} for an overview.
+See [Importing Data](https://tensorflow.org/guide/datasets) for an overview.
 
 @@Counter
 @@CheckpointInputPipelineHook
diff --git a/tensorflow/contrib/data/kernels/assert_next_dataset_op.cc b/tensorflow/contrib/data/kernels/assert_next_dataset_op.cc
index 95b8e1f..e36c9c06 100644
--- a/tensorflow/contrib/data/kernels/assert_next_dataset_op.cc
+++ b/tensorflow/contrib/data/kernels/assert_next_dataset_op.cc
@@ -42,13 +42,13 @@
   }
 
  private:
-  class Dataset : public GraphDatasetBase {
+  class Dataset : public DatasetBase {
    public:
     Dataset(OpKernelContext* ctx, const DatasetBase* input,
             const std::vector<string>& transformations,
             const DataTypeVector& output_types,
             const std::vector<PartialTensorShape>& output_shapes)
-        : GraphDatasetBase(ctx),
+        : DatasetBase(DatasetContext(ctx)),
           input_(input),
           transformations_(transformations),
           output_types_(output_types),
@@ -76,10 +76,11 @@
     }
 
    protected:
-    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
                               Node** output) const override {
       Node* input_graph_node = nullptr;
-      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
       Node* transformations_node = nullptr;
       TF_RETURN_IF_ERROR(b->AddVector(transformations_, &transformations_node));
       TF_RETURN_IF_ERROR(b->AddDataset(
@@ -121,13 +122,13 @@
 
      protected:
       Status SaveInternal(IteratorStateWriter* writer) override {
-        TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
         return Status::OK();
       }
 
       Status RestoreInternal(IteratorContext* ctx,
                              IteratorStateReader* reader) override {
-        TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
         return Status::OK();
       }
 
diff --git a/tensorflow/contrib/data/kernels/csv_dataset_op.cc b/tensorflow/contrib/data/kernels/csv_dataset_op.cc
index f7e3ed8..d242cfd 100644
--- a/tensorflow/contrib/data/kernels/csv_dataset_op.cc
+++ b/tensorflow/contrib/data/kernels/csv_dataset_op.cc
@@ -131,7 +131,7 @@
   }
 
  private:
-  class Dataset : public GraphDatasetBase {
+  class Dataset : public DatasetBase {
    public:
     Dataset(OpKernelContext* ctx, std::vector<string> filenames, bool header,
             string compression_type, io::ZlibCompressionOptions options,
@@ -139,7 +139,7 @@
             const std::vector<PartialTensorShape>& output_shapes,
             std::vector<Tensor> record_defaults, std::vector<int64> select_cols,
             bool use_quote_delim, char delim, string na_value)
-        : GraphDatasetBase(ctx),
+        : DatasetBase(DatasetContext(ctx)),
           filenames_(std::move(filenames)),
           header_(header),
           out_type_(output_types),
@@ -168,7 +168,8 @@
     string DebugString() const override { return "CSVDatasetOp::Dataset"; }
 
    protected:
-    Status AsGraphDefInternal(DatasetGraphDefBuilder* b,
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
                               Node** output) const override {
       Node* filenames = nullptr;
       Node* compression_type = nullptr;
diff --git a/tensorflow/contrib/data/kernels/directed_interleave_dataset_op.cc b/tensorflow/contrib/data/kernels/directed_interleave_dataset_op.cc
index 6a12ca0..ccf7ec1 100644
--- a/tensorflow/contrib/data/kernels/directed_interleave_dataset_op.cc
+++ b/tensorflow/contrib/data/kernels/directed_interleave_dataset_op.cc
@@ -63,11 +63,11 @@
   }
 
  private:
-  class Dataset : public GraphDatasetBase {
+  class Dataset : public DatasetBase {
    public:
     Dataset(OpKernelContext* ctx, const DatasetBase* selector_input,
             std::vector<DatasetBase*> data_inputs)
-        : GraphDatasetBase(ctx),
+        : DatasetBase(DatasetContext(ctx)),
           selector_input_(selector_input),
           data_inputs_(std::move(data_inputs)) {
       selector_input_->Ref();
@@ -110,15 +110,16 @@
     }
 
    protected:
-    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
                               Node** output) const override {
       Node* selector_input_node;
       TF_RETURN_IF_ERROR(
-          b->AddParentDataset(ctx, selector_input_, &selector_input_node));
+          b->AddInputDataset(ctx, selector_input_, &selector_input_node));
       std::vector<Node*> data_input_nodes(data_inputs_.size());
       for (size_t i = 0; i < data_inputs_.size(); ++i) {
         TF_RETURN_IF_ERROR(
-            b->AddParentDataset(ctx, data_inputs_[i], &data_input_nodes[i]));
+            b->AddInputDataset(ctx, data_inputs_[i], &data_input_nodes[i]));
       }
       TF_RETURN_IF_ERROR(b->AddDataset(this, {{0, selector_input_node}},
                                        {{1, data_input_nodes}}, {}, output));
@@ -204,7 +205,7 @@
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
         if (selector_input_impl_) {
-          TF_RETURN_IF_ERROR(SaveParent(writer, selector_input_impl_));
+          TF_RETURN_IF_ERROR(SaveInput(writer, selector_input_impl_));
         } else {
           TF_RETURN_IF_ERROR(
               writer->WriteScalar(full_name("selector_input_impl_empty"), ""));
@@ -212,7 +213,7 @@
         for (size_t i = 0; i < data_input_impls_.size(); ++i) {
           const auto& data_input_impl = data_input_impls_[i];
           if (data_input_impl) {
-            TF_RETURN_IF_ERROR(SaveParent(writer, data_input_impl));
+            TF_RETURN_IF_ERROR(SaveInput(writer, data_input_impl));
           } else {
             TF_RETURN_IF_ERROR(writer->WriteScalar(
                 full_name(strings::StrCat("data_input_impl_empty[", i, "]")),
@@ -226,15 +227,14 @@
                              IteratorStateReader* reader) override {
         mutex_lock l(mu_);
         if (!reader->Contains(full_name("selector_input_impl_empty"))) {
-          TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, selector_input_impl_));
+          TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, selector_input_impl_));
         } else {
           selector_input_impl_.reset();
         }
         for (size_t i = 0; i < data_input_impls_.size(); ++i) {
           if (!reader->Contains(full_name(
                   strings::StrCat("data_input_impl_empty[", i, "]")))) {
-            TF_RETURN_IF_ERROR(
-                RestoreParent(ctx, reader, data_input_impls_[i]));
+            TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, data_input_impls_[i]));
           } else {
             data_input_impls_[i].reset();
           }
diff --git a/tensorflow/contrib/data/kernels/ignore_errors_dataset_op.cc b/tensorflow/contrib/data/kernels/ignore_errors_dataset_op.cc
index bbec506..db24e60 100644
--- a/tensorflow/contrib/data/kernels/ignore_errors_dataset_op.cc
+++ b/tensorflow/contrib/data/kernels/ignore_errors_dataset_op.cc
@@ -35,10 +35,10 @@
   }
 
  private:
-  class Dataset : public GraphDatasetBase {
+  class Dataset : public DatasetBase {
    public:
     explicit Dataset(OpKernelContext* ctx, const DatasetBase* input)
-        : GraphDatasetBase(ctx), input_(input) {
+        : DatasetBase(DatasetContext(ctx)), input_(input) {
       input_->Ref();
     }
 
@@ -62,10 +62,11 @@
     }
 
    protected:
-    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
                               Node** output) const override {
       Node* input_graph_node = nullptr;
-      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
       TF_RETURN_IF_ERROR(b->AddDataset(this, {input_graph_node}, output));
       return Status::OK();
     }
@@ -106,7 +107,7 @@
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
         if (input_impl_)
-          TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+          TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
         else
           TF_RETURN_IF_ERROR(
               writer->WriteScalar(full_name("input_impls_empty"), ""));
@@ -119,7 +120,7 @@
         if (reader->Contains(full_name("input_impls_empty")))
           input_impl_.reset();
         else
-          TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+          TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
         return Status::OK();
       }
 
diff --git a/tensorflow/contrib/data/kernels/prefetching_kernels.cc b/tensorflow/contrib/data/kernels/prefetching_kernels.cc
index 32f03ca..74df1e4 100644
--- a/tensorflow/contrib/data/kernels/prefetching_kernels.cc
+++ b/tensorflow/contrib/data/kernels/prefetching_kernels.cc
@@ -526,6 +526,15 @@
   return clean;
 }
 
+struct HostBufferElement {
+  Status status;
+  bool end_of_sequence;
+  std::vector<Tensor> value;
+};
+
+using MultiDeviceIteratorCallback =
+    std::function<void(const HostBufferElement&)>;
+
 class MultiDeviceIterator : public ResourceBase {
  public:
   MultiDeviceIterator(const DataTypeVector& output_types,
@@ -539,83 +548,45 @@
         devices_(devices),
         flib_def_(std::move(flib_def)),
         pflr_(std::move(pflr)),
-        lib_(lib) {
-    buffer_.resize(devices_.size());
-  }
+        lib_(lib) {}
 
   string DebugString() override {
-    return strings::StrCat("MultiDeviceIterator");
+    return strings::StrCat("MultiDeviceIterator for ", devices_.size(),
+                           " devices");
   }
 
-  Status Init(std::unique_ptr<IteratorBase> iterator, int64* incarnation_id) {
-    mutex_lock l(mu_);
+  Status Init(std::unique_ptr<IteratorBase> iterator, int64 max_buffer_size,
+              int64* incarnation_id) {
     if (iterator) {
       TF_RETURN_IF_ERROR(
           VerifyTypesMatch(output_types_, iterator->output_dtypes()));
       TF_RETURN_IF_ERROR(
           VerifyShapesCompatible(output_shapes_, iterator->output_shapes()));
     }
-    host_iterator_.reset(iterator.release());
-    incarnation_id_++;
+
+    mutex_lock l(mu_);
+    if (multi_device_buffer_) {
+      multi_device_buffer_->Reset();
+    }
+
+    ++incarnation_id_;
     *incarnation_id = incarnation_id_;
-    max_buffer_size_ = 0;
-    num_elements_ = 0;
-    buffer_.clear();
-    buffer_.resize(devices_.size());
+
+    multi_device_buffer_.reset(
+        new MultiDeviceBuffer(devices_.size(), max_buffer_size, incarnation_id_,
+                              std::move(iterator)));
     return Status::OK();
   }
 
-  Status GetNextFromShard(IteratorContext* ctx, int shard_num,
-                          int64 incarnation_id,
-                          std::vector<Tensor>* out_tensors,
-                          bool* end_of_sequence) {
-    // TODO(rohanj): This might potentially strand elements in other shards.
-    // Opportunity to do smarter locking semantics.
-    mutex_lock l(mu_);
-    // Make sure we're in the right incarnation.
-    if (incarnation_id != incarnation_id_) {
-      return errors::InvalidArgument(
-          "Current incarnation: ", incarnation_id_,
-          "; Supplied incarnation: ", incarnation_id);
+  void GetNextFromShard(IteratorContext* ctx, int shard_num,
+                        int64 incarnation_id,
+                        MultiDeviceIteratorCallback callback) {
+    if (lib_ != nullptr) {
+      ctx->set_lib(lib_);
     }
-    // Then look it up in the buffer.
-    if (!buffer_[shard_num].empty()) {
-      const HostBufferElement& elem = buffer_[shard_num].front();
-      *out_tensors = elem.value;
-      *end_of_sequence = elem.end_of_sequence;
-      Status s = elem.status;
-      buffer_[shard_num].pop_front();
-      return s;
-    }
-    std::shared_ptr<IteratorBase> captured_iterator(host_iterator_);
-    if (captured_iterator) {
-      if (lib_ != nullptr) {
-        ctx->set_lib(lib_);
-      }
-      while (true) {
-        HostBufferElement elem;
-        elem.status =
-            captured_iterator->GetNext(ctx, &elem.value, &elem.end_of_sequence);
-        int buffer_index = num_elements_ % devices_.size();
-        num_elements_++;
-        if (buffer_index == shard_num) {
-          out_tensors->swap(elem.value);
-          *end_of_sequence = elem.end_of_sequence;
-          return elem.status;
-        } else {
-          buffer_[buffer_index].push_back(std::move(elem));
-          // TODO(rohanj): Put an upper bound to buffer size.
-          if (buffer_[buffer_index].size() > max_buffer_size_) {
-            max_buffer_size_ = buffer_[buffer_index].size();
-            VLOG(1) << "MultiDeviceIterator: Max buffer size increased to: "
-                    << max_buffer_size_;
-          }
-        }
-      }
-    } else {
-      return errors::FailedPrecondition("Iterator not initialized");
-    }
-    return Status::OK();
+    tf_shared_lock l(mu_);
+    multi_device_buffer_->GetNextFromShard(ctx, shard_num, incarnation_id,
+                                           std::move(callback));
   }
 
   const DataTypeVector& output_types() const { return output_types_; }
@@ -630,25 +601,218 @@
   }
 
  private:
-  struct HostBufferElement {
-    Status status;
-    bool end_of_sequence;
-    std::vector<Tensor> value;
+  // A private class that uses a background thread to keep a per device buffer
+  // full.
+  class MultiDeviceBuffer {
+   public:
+    MultiDeviceBuffer(size_t size, int64 max_buffer_size, int64 incarnation_id,
+                      std::unique_ptr<IteratorBase> host_iterator)
+        : buffer_(size),
+          size_(size),
+          max_buffer_size_(max_buffer_size),
+          incarnation_id_(incarnation_id),
+          host_iterator_(std::move(host_iterator)) {}
+
+    ~MultiDeviceBuffer() { Reset(); }
+
+    void Reset() LOCKS_EXCLUDED(mu_) {
+      {
+        mutex_lock l(mu_);
+        if (background_thread_finished_) {
+          return;
+        }
+
+        cancelled_ = true;
+        // Wake up the background thread.
+        for (int i = 0; i < size_; ++i) {
+          buffer_[i].cond_var.notify_all();
+        }
+
+        // Make sure background thread has finished first.
+        while (!background_thread_finished_) {
+          shutdown_cond_var_.wait(l);
+        }
+      }
+      RunPendingCallbacks();
+    }
+
+    void GetNextFromShard(IteratorContext* ctx, int shard_num,
+                          int64 incarnation_id,
+                          MultiDeviceIteratorCallback callback) {
+      HostBufferElement elem;
+      if (incarnation_id_ != incarnation_id) {
+        elem.status = errors::InvalidArgument("Invalid incarnation id");
+        callback(elem);
+        return;
+      }
+
+      bool produced_output = false;
+      {
+        mutex_lock l(mu_);
+        if (cancelled_) {
+          elem.status = errors::Cancelled("Cancelled Multidevice iterator");
+          callback(elem);
+          return;
+        }
+
+        EnsureBackgroundThreadStarted(ctx);
+
+        if (!buffer_[shard_num].data.empty()) {
+          produced_output = true;
+          std::swap(elem, buffer_[shard_num].data.front());
+          buffer_[shard_num].data.pop_front();
+          // Wake up background thread if it is blocked on this element.
+          if (buffer_[shard_num].data.size() == max_buffer_size_ - 1) {
+            buffer_[shard_num].cond_var.notify_all();
+          }
+        } else {
+          if (background_thread_finished_) {
+            produced_output = true;
+            elem.end_of_sequence = true;
+          } else {
+            buffer_[shard_num].callbacks.push_back(std::move(callback));
+            callback = nullptr;
+          }
+        }
+      }
+
+      if (produced_output) {
+        callback(elem);
+      }
+    }
+
+   private:
+    void EnsureBackgroundThreadStarted(IteratorContext* ctx)
+        EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+      if (!background_thread_) {
+        background_thread_.reset(ctx->env()->StartThread(
+            {}, "multi_device_iterator_background_thread",
+            std::bind(&MultiDeviceIterator::MultiDeviceBuffer::BackgroundThread,
+                      this, new IteratorContext(*ctx))));
+      }
+    }
+
+    void RunPendingCallbacks() LOCKS_EXCLUDED(mu_) {
+      // Run all remaining callbacks.
+      std::vector<MultiDeviceIteratorCallback> cancellation_callbacks;
+      std::vector<HostBufferElement> cancellation_elements;
+      {
+        mutex_lock l(mu_);
+
+        for (int i = 0; i < size_; ++i) {
+          while (!buffer_[i].callbacks.empty()) {
+            if (buffer_[i].data.empty()) {
+              HostBufferElement elem;
+              elem.status =
+                  errors::Cancelled("Cancelled and buffer not filled.");
+              cancellation_elements.push_back(std::move(elem));
+            } else {
+              cancellation_elements.push_back(
+                  std::move(buffer_[i].data.front()));
+              buffer_[i].data.pop_front();
+            }
+            cancellation_callbacks.push_back(
+                std::move(buffer_[i].callbacks.front()));
+            buffer_[i].callbacks.pop_front();
+          }
+        }
+      }
+      for (int i = 0; i < cancellation_callbacks.size(); ++i) {
+        cancellation_callbacks[i](cancellation_elements[i]);
+      }
+    }
+
+    void BackgroundThread(IteratorContext* ctx) {
+      std::unique_ptr<IteratorContext> cleanup(ctx);
+      int shard_to_fetch = 0;
+      while (true) {
+        HostBufferElement elem;
+        MultiDeviceIteratorCallback callback = nullptr;
+        bool end_of_iterator = false;
+
+        {
+          mutex_lock l(mu_);
+          while (!cancelled_ &&
+                 buffer_[shard_to_fetch].data.size() >= max_buffer_size_) {
+            buffer_[shard_to_fetch].cond_var.wait(l);
+          }
+
+          if (cancelled_) {
+            background_thread_finished_ = true;
+            shutdown_cond_var_.notify_all();
+            return;
+          }
+        }
+
+        elem.status =
+            host_iterator_->GetNext(ctx, &elem.value, &elem.end_of_sequence);
+
+        if (elem.status.ok() && elem.end_of_sequence) {
+          end_of_iterator = true;
+        }
+
+        {
+          mutex_lock l(mu_);
+          // Try to find a callback, else just push stuff into buffer.
+          if (!buffer_[shard_to_fetch].callbacks.empty()) {
+            callback = buffer_[shard_to_fetch].callbacks.front();
+            buffer_[shard_to_fetch].callbacks.pop_front();
+          } else {
+            buffer_[shard_to_fetch].data.push_back(std::move(elem));
+            elem = HostBufferElement();
+          }
+        }
+
+        if (callback) {
+          (*ctx->runner())(std::bind(std::move(callback), std::move(elem)));
+        }
+
+        // Finish off the thread if we reach the end of the iterator. Runs
+        // pending callbacks.
+        if (end_of_iterator) {
+          {
+            mutex_lock l(mu_);
+            background_thread_finished_ = true;
+            shutdown_cond_var_.notify_all();
+          }
+          RunPendingCallbacks();
+          return;
+        }
+        shard_to_fetch = (shard_to_fetch + 1) % size_;
+      }
+    }
+
+    struct HostBuffer {
+      condition_variable cond_var;
+      std::deque<HostBufferElement> data;
+      std::deque<MultiDeviceIteratorCallback> callbacks;
+    };
+
+    mutex mu_;
+    std::unique_ptr<Thread> background_thread_ GUARDED_BY(mu_);
+    bool background_thread_finished_ GUARDED_BY(mu_) = false;
+    bool cancelled_ GUARDED_BY(mu_) = false;
+    condition_variable shutdown_cond_var_ GUARDED_BY(mu_);
+
+    std::vector<HostBuffer> buffer_;
+
+    const size_t size_;
+    const int64 max_buffer_size_;
+    const int64 incarnation_id_;
+    const std::unique_ptr<IteratorBase> host_iterator_;
   };
 
   mutex mu_;
   const DataTypeVector output_types_;
   const std::vector<PartialTensorShape> output_shapes_;
   const std::vector<string> devices_;
-  int64 num_elements_ GUARDED_BY(mu_) = 0;
-  int64 max_buffer_size_ GUARDED_BY(mu_) = 0;
-  int64 incarnation_id_ GUARDED_BY(mu_) = 0;
-  std::vector<std::deque<HostBufferElement>> buffer_ GUARDED_BY(mu_);
-  std::unique_ptr<FunctionLibraryDefinition> flib_def_;
-  std::unique_ptr<ProcessFunctionLibraryRuntime> pflr_;
-  FunctionLibraryRuntime* lib_ = nullptr;  // not owned.
-  std::shared_ptr<IteratorBase> host_iterator_;
+  const std::unique_ptr<FunctionLibraryDefinition> flib_def_;
+  const std::unique_ptr<ProcessFunctionLibraryRuntime> pflr_;
+  FunctionLibraryRuntime* const lib_ = nullptr;  // not owned.
   std::shared_ptr<const FunctionLibraryDefinition> lib_def_ GUARDED_BY(mu_);
+
+  int64 incarnation_id_ GUARDED_BY(mu_) = 0;
+  std::unique_ptr<MultiDeviceBuffer> multi_device_buffer_ GUARDED_BY(mu_);
 };
 
 // Just creates a MultiDeviceIterator and returns it.
@@ -754,6 +918,10 @@
       : OpKernel(ctx) {}
 
   void Compute(OpKernelContext* ctx) override {
+    const Tensor* tensor_max_buffer_size;
+    OP_REQUIRES_OK(ctx, ctx->input("max_buffer_size", &tensor_max_buffer_size));
+    int64 max_buffer_size = tensor_max_buffer_size->scalar<int64>()();
+
     DatasetBase* dataset;
     OP_REQUIRES_OK(ctx, GetDatasetFromVariantTensor(ctx->input(0), &dataset));
     MultiDeviceIterator* resource;
@@ -761,12 +929,12 @@
                    LookupResource(ctx, HandleFromInput(ctx, 1), &resource));
     core::ScopedUnref unref(resource);
 
-    IteratorContext iter_ctx = dataset::MakeIteratorContext(ctx);
     std::unique_ptr<IteratorBase> iterator;
-    OP_REQUIRES_OK(ctx,
-                   dataset->MakeIterator(&iter_ctx, "Iterator", &iterator));
+    OP_REQUIRES_OK(ctx, dataset->MakeIterator(IteratorContext(ctx), "Iterator",
+                                              &iterator));
     int64 incarnation_id;
-    OP_REQUIRES_OK(ctx, resource->Init(std::move(iterator), &incarnation_id));
+    OP_REQUIRES_OK(ctx, resource->Init(std::move(iterator), max_buffer_size,
+                                       &incarnation_id));
     Tensor tensor_incarnation_id(DT_INT64, TensorShape({}));
     tensor_incarnation_id.scalar<int64>()() = incarnation_id;
     OP_REQUIRES_OK(ctx,
@@ -804,9 +972,6 @@
         ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &iterator), done);
     thread_pool_->Schedule(std::bind(
         [ctx, iterator, shard_num, incarnation_id](DoneCallback done) {
-          std::vector<Tensor> components;
-          bool end_of_sequence = false;
-
           IteratorContext::Params params;
           params.env = ctx->env();
           params.runner = *(ctx->runner());
@@ -817,22 +982,26 @@
           };
           IteratorContext iter_ctx(std::move(params));
 
-          Status s =
-              iterator->GetNextFromShard(&iter_ctx, shard_num, incarnation_id,
-                                         &components, &end_of_sequence);
-          iterator->Unref();
+          MultiDeviceIteratorCallback callback = std::bind(
+              [ctx](const HostBufferElement& elem, DoneCallback done) {
+                // iterator->Unref();
+                Status s = elem.status;
+                if (!s.ok()) {
+                  ctx->SetStatus(s);
+                } else if (elem.end_of_sequence) {
+                  ctx->SetStatus(errors::OutOfRange("End of sequence"));
+                } else {
+                  for (int i = 0; i < elem.value.size(); ++i) {
+                    ctx->set_output(i, elem.value[i]);
+                  }
+                }
+                done();
+              },
+              std::placeholders::_1, std::move(done));
 
-          if (!s.ok()) {
-            ctx->SetStatus(s);
-          } else if (end_of_sequence) {
-            ctx->SetStatus(errors::OutOfRange("End of sequence"));
-          } else {
-            for (int i = 0; i < components.size(); ++i) {
-              // TODO(mrry): Check that the shapes match the shape attrs.
-              ctx->set_output(i, components[i]);
-            }
-          }
-          done();
+          iterator->GetNextFromShard(&iter_ctx, shard_num, incarnation_id,
+                                     callback);
+          iterator->Unref();
         },
         std::move(done)));
   }
diff --git a/tensorflow/contrib/data/kernels/threadpool_dataset_op.cc b/tensorflow/contrib/data/kernels/threadpool_dataset_op.cc
index 141706f..ab58450 100644
--- a/tensorflow/contrib/data/kernels/threadpool_dataset_op.cc
+++ b/tensorflow/contrib/data/kernels/threadpool_dataset_op.cc
@@ -130,11 +130,13 @@
   }
 
  private:
-  class Dataset : public GraphDatasetBase {
+  class Dataset : public DatasetBase {
    public:
     Dataset(OpKernelContext* ctx, const DatasetBase* input,
             ThreadPoolResource* threadpool)
-        : GraphDatasetBase(ctx), input_(input), threadpool_(threadpool) {
+        : DatasetBase(DatasetContext(ctx)),
+          input_(input),
+          threadpool_(threadpool) {
       input_->Ref();
       threadpool_->Ref();
     }
@@ -162,11 +164,11 @@
     }
 
    protected:
-    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
                               Node** output) const override {
-      return errors::Unimplemented(
-          "Cannot currently serialize the thread pool for a "
-          "ThreadPoolDataset.");
+      return errors::Unimplemented("%s does not support serialization",
+                                   DebugString());
     }
 
    private:
diff --git a/tensorflow/contrib/data/kernels/unique_dataset_op.cc b/tensorflow/contrib/data/kernels/unique_dataset_op.cc
index 67c2377..6fbf5d2 100644
--- a/tensorflow/contrib/data/kernels/unique_dataset_op.cc
+++ b/tensorflow/contrib/data/kernels/unique_dataset_op.cc
@@ -47,10 +47,10 @@
   }
 
  private:
-  class Dataset : public GraphDatasetBase {
+  class Dataset : public DatasetBase {
    public:
     Dataset(OpKernelContext* ctx, const DatasetBase* input)
-        : GraphDatasetBase(ctx), input_(input) {
+        : DatasetBase(DatasetContext(ctx)), input_(input) {
       input_->Ref();
     }
 
@@ -75,10 +75,11 @@
     }
 
    protected:
-    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
                               Node** output) const override {
       Node* input_graph_node = nullptr;
-      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
       TF_RETURN_IF_ERROR(b->AddDataset(this, {input_graph_node}, output));
       return Status::OK();
     }
@@ -116,7 +117,7 @@
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
         if (input_impl_) {
-          TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+          TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
         } else {
           TF_RETURN_IF_ERROR(
               writer->WriteScalar(full_name("input_impl_empty"), ""));
@@ -135,7 +136,7 @@
                              IteratorStateReader* reader) override {
         mutex_lock l(mu_);
         if (!reader->Contains(full_name("input_impl_empty"))) {
-          TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+          TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
         } else {
           input_impl_.reset();
         }
diff --git a/tensorflow/contrib/data/ops/dataset_ops.cc b/tensorflow/contrib/data/ops/dataset_ops.cc
index 66a7c7f..cc5e250 100644
--- a/tensorflow/contrib/data/ops/dataset_ops.cc
+++ b/tensorflow/contrib/data/ops/dataset_ops.cc
@@ -168,9 +168,11 @@
 REGISTER_OP("MultiDeviceIteratorInit")
     .Input("dataset: variant")
     .Input("multi_device_iterator: resource")
+    .Input("max_buffer_size: int64")
     .Output("incarnation_id: int64")
     .Doc(R"doc(
 Initializes the multi device iterator with the given dataset.
+max_buffer_size: The maximum size of the host side per device buffer to keep.
 incarnation_id: An int64 indicating which incarnation of the MultiDeviceIterator
   is running.
 dataset: Dataset to be iterated upon.
diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index 24c7ee6..2b75aa2 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -175,7 +175,7 @@
         "//tensorflow/python:variables",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/estimator",
-        "//tensorflow/python/estimator:model_fn",
+        "//tensorflow/python/estimator:estimator_py",
     ],
 )
 
@@ -206,6 +206,25 @@
 )
 
 py_test(
+    name = "map_defun_op_test",
+    size = "small",
+    srcs = ["map_defun_op_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        "//tensorflow/contrib/data/python/ops:map_defun",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:check_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:function",
+        "//tensorflow/python:math_ops",
+    ],
+)
+
+py_test(
     name = "optimize_dataset_op_test",
     size = "small",
     srcs = ["optimize_dataset_op_test.py"],
diff --git a/tensorflow/contrib/data/python/kernel_tests/map_defun_op_test.py b/tensorflow/contrib/data/python/kernel_tests/map_defun_op_test.py
new file mode 100644
index 0000000..a711325
--- /dev/null
+++ b/tensorflow/contrib/data/python/kernel_tests/map_defun_op_test.py
@@ -0,0 +1,126 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for MapDefunOp."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.data.python.ops import map_defun
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import function
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class MapDefunTest(test.TestCase):
+
+  def testMapDefun_Simple(self):
+
+    @function.Defun(dtypes.int32)
+    def simple_fn(x):
+      return x * 2 + 3
+
+    with self.test_session():
+      nums = [[1, 2], [3, 4], [5, 6]]
+      elems = constant_op.constant(nums, dtype=dtypes.int32, name="data")
+      r = map_defun.map_defun(simple_fn, [elems], [dtypes.int32], [(2,)])[0]
+      expected = elems * 2 + 3
+      self.assertAllEqual(self.evaluate(r), self.evaluate(expected))
+
+  def testMapDefun_MismatchedTypes(self):
+
+    @function.Defun(dtypes.int32)
+    def fn(x):
+      return math_ops.cast(x, dtypes.float64)
+
+    with self.test_session():
+      nums = [1, 2, 3, 4, 5, 6]
+      elems = constant_op.constant(nums, dtype=dtypes.int32, name="data")
+      r = map_defun.map_defun(fn, [elems], [dtypes.int32], [()])[0]
+      with self.assertRaises(errors.InvalidArgumentError):
+        self.evaluate(r)
+
+  def testMapDefun_MultipleOutputs(self):
+
+    @function.Defun(dtypes.int32)
+    def fn(x):
+      return (x, math_ops.cast(x * 2 + 3, dtypes.float64))
+
+    with self.test_session():
+      nums = [[1, 2], [3, 4], [5, 6]]
+      elems = constant_op.constant(nums, dtype=dtypes.int32, name="data")
+      r = map_defun.map_defun(fn, [elems], [dtypes.int32, dtypes.float64],
+                              [(2,), (2,)])
+      expected = [elems, elems * 2 + 3]
+      self.assertAllEqual(self.evaluate(r), self.evaluate(expected))
+
+  def testMapDefun_ShapeInference(self):
+
+    @function.Defun(dtypes.int32)
+    def fn(x):
+      return x
+
+    nums = [[1, 2], [3, 4], [5, 6]]
+    elems = constant_op.constant(nums, dtype=dtypes.int32, name="data")
+    result = map_defun.map_defun(fn, [elems], [dtypes.int32], [(2,)])[0]
+    self.assertEqual(result.get_shape(), (3, 2))
+
+  def testMapDefun_PartialShapeInference(self):
+
+    @function.Defun(dtypes.int32)
+    def fn(x):
+      return x
+
+    elems = array_ops.placeholder(dtypes.int64, (None, 2))
+    result = map_defun.map_defun(fn, [elems], [dtypes.int32], [(2,)])
+    self.assertEqual(result[0].get_shape().as_list(), [None, 2])
+
+  def testMapDefun_RaisesErrorOnRuntimeShapeMismatch(self):
+
+    @function.Defun(dtypes.int32, dtypes.int32)
+    def fn(x, y):
+      return x, y
+
+    elems1 = array_ops.placeholder(dtypes.int32)
+    elems2 = array_ops.placeholder(dtypes.int32)
+    result = map_defun.map_defun(fn, [elems1, elems2],
+                                 [dtypes.int32, dtypes.int32], [(), ()])
+    with self.test_session() as sess:
+      with self.assertRaisesWithPredicateMatch(
+          errors.InvalidArgumentError,
+          "All inputs must have the same dimension 0."):
+        sess.run(result, feed_dict={elems1: [1, 2, 3, 4, 5], elems2: [1, 2, 3]})
+
+  def testMapDefun_RaisesDefunError(self):
+
+    @function.Defun(dtypes.int32)
+    def fn(x):
+      with ops.control_dependencies([check_ops.assert_equal(x, 0)]):
+        return array_ops.identity(x)
+
+    elems = constant_op.constant([0, 0, 0, 37, 0])
+    result = map_defun.map_defun(fn, [elems], [dtypes.int32], [()])
+    with self.test_session():
+      with self.assertRaises(errors.InvalidArgumentError):
+        self.evaluate(result)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py
index d66305d..361fe0d 100644
--- a/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py
@@ -1021,7 +1021,7 @@
   def testUneven(self):
     dataset = dataset_ops.Dataset.range(10)
     multi_device_iterator = prefetching_ops.MultiDeviceIterator(
-        dataset, ["/cpu:1", "/cpu:2"])
+        dataset, ["/cpu:1", "/cpu:2"], max_buffer_size=4)
     elem_on_1, elem_on_2 = multi_device_iterator.get_next()
 
     config = config_pb2.ConfigProto(device_count={"CPU": 3})
@@ -1079,7 +1079,7 @@
     with compat.forward_compatibility_horizon(2018, 8, 4):
       dataset = dataset_ops.Dataset.range(10)
       multi_device_iterator = prefetching_ops.MultiDeviceIterator(
-          dataset, ["/cpu:1", "/gpu:0"])
+          dataset, ["/cpu:1", "/gpu:0"], max_buffer_size=4)
       elem_on_1, elem_on_2 = multi_device_iterator.get_next()
 
       config = config_pb2.ConfigProto(device_count={"CPU": 2, "GPU": 1})
diff --git a/tensorflow/contrib/data/python/ops/BUILD b/tensorflow/contrib/data/python/ops/BUILD
index 1ad021e..ad9378d 100644
--- a/tensorflow/contrib/data/python/ops/BUILD
+++ b/tensorflow/contrib/data/python/ops/BUILD
@@ -211,6 +211,17 @@
 )
 
 py_library(
+    name = "map_defun",
+    srcs = ["map_defun.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:tensor_shape",
+    ],
+)
+
+py_library(
     name = "resampling",
     srcs = ["resampling.py"],
     srcs_version = "PY2AND3",
@@ -370,6 +381,7 @@
         ":get_single_element",
         ":grouping",
         ":interleave_ops",
+        ":map_defun",
         ":optimization",
         ":prefetching_ops",
         ":readers",
diff --git a/tensorflow/contrib/data/python/ops/batching.py b/tensorflow/contrib/data/python/ops/batching.py
index 4835c4e..9f05994 100644
--- a/tensorflow/contrib/data/python/ops/batching.py
+++ b/tensorflow/contrib/data/python/ops/batching.py
@@ -185,7 +185,7 @@
 
   Returns:
     A `Dataset` transformation function, which can be passed to
-    @{tf.data.Dataset.apply}.
+    `tf.data.Dataset.apply`.
   """
 
   def _apply_fn(dataset):
@@ -401,7 +401,7 @@
 
   Returns:
     A `Dataset` transformation function, which can be passed to
-    @{tf.data.Dataset.apply}.
+    `tf.data.Dataset.apply`.
   """
 
   def _apply_fn(dataset):
@@ -443,7 +443,7 @@
 def batch_and_drop_remainder(batch_size):
   """A batching transformation that omits the final small batch (if present).
 
-  Like @{tf.data.Dataset.batch}, this transformation combines
+  Like `tf.data.Dataset.batch`, this transformation combines
   consecutive elements of this dataset into batches. However, if the batch
   size does not evenly divide the input dataset size, this transformation will
   drop the final smaller element.
@@ -467,7 +467,7 @@
 
   Returns:
     A `Dataset` transformation function, which can be passed to
-    @{tf.data.Dataset.apply}
+    `tf.data.Dataset.apply`
   """
 
   def _apply_fn(dataset):
@@ -484,25 +484,25 @@
                                     padding_values=None):
   """A batching and padding transformation that omits the final small batch.
 
-  Like @{tf.data.Dataset.padded_batch}, this transformation combines
+  Like `tf.data.Dataset.padded_batch`, this transformation combines
   consecutive elements of this dataset into batches. However, if the batch
   size does not evenly divide the input dataset size, this transformation will
   drop the final smaller element.
 
-  See `@{tf.contrib.data.batch_and_drop_remainder}` for more details.
+  See `tf.contrib.data.batch_and_drop_remainder` for more details.
 
   Args:
     batch_size: A `tf.int64` scalar `tf.Tensor`, representing the number of
       consecutive elements of this dataset to combine in a single batch.
     padded_shapes: A nested structure of `tf.TensorShape` or
       `tf.int64` vector tensor-like objects. See
-      @{tf.data.Dataset.padded_batch} for details.
+      `tf.data.Dataset.padded_batch` for details.
     padding_values: (Optional.) A nested structure of scalar-shaped
-      `tf.Tensor`. See @{tf.data.Dataset.padded_batch} for details.
+      `tf.Tensor`. See `tf.data.Dataset.padded_batch` for details.
 
   Returns:
     A `Dataset` transformation function, which can be passed to
-    @{tf.data.Dataset.apply}
+    `tf.data.Dataset.apply`
   """
 
   def _apply_fn(dataset):
@@ -661,7 +661,7 @@
 
   Returns:
     A `Dataset` transformation function, which can be passed to
-    @{tf.data.Dataset.apply}
+    `tf.data.Dataset.apply`
   """
 
   def _check_shape(*elements):
@@ -760,7 +760,7 @@
 
   Returns:
     A `Dataset` transformation function, which can be passed to
-    @{tf.data.Dataset.apply}.
+    `tf.data.Dataset.apply`.
 
   Raises:
     ValueError: If both `num_parallel_batches` and `num_parallel_calls` are
diff --git a/tensorflow/contrib/data/python/ops/enumerate_ops.py b/tensorflow/contrib/data/python/ops/enumerate_ops.py
index ac2b386..490281e 100644
--- a/tensorflow/contrib/data/python/ops/enumerate_ops.py
+++ b/tensorflow/contrib/data/python/ops/enumerate_ops.py
@@ -47,7 +47,7 @@
 
   Returns:
     A `Dataset` transformation function, which can be passed to
-    @{tf.data.Dataset.apply}.
+    `tf.data.Dataset.apply`.
   """
 
   def _apply_fn(dataset):
diff --git a/tensorflow/contrib/data/python/ops/error_ops.py b/tensorflow/contrib/data/python/ops/error_ops.py
index d46d96c..b4a7521 100644
--- a/tensorflow/contrib/data/python/ops/error_ops.py
+++ b/tensorflow/contrib/data/python/ops/error_ops.py
@@ -42,7 +42,7 @@
 
   Returns:
     A `Dataset` transformation function, which can be passed to
-    @{tf.data.Dataset.apply}.
+    `tf.data.Dataset.apply`.
   """
 
   def _apply_fn(dataset):
diff --git a/tensorflow/contrib/data/python/ops/get_single_element.py b/tensorflow/contrib/data/python/ops/get_single_element.py
index ef92844..a6713b0 100644
--- a/tensorflow/contrib/data/python/ops/get_single_element.py
+++ b/tensorflow/contrib/data/python/ops/get_single_element.py
@@ -29,8 +29,8 @@
 def get_single_element(dataset):
   """Returns the single element in `dataset` as a nested structure of tensors.
 
-  This function enables you to use a @{tf.data.Dataset} in a stateless
-  "tensor-in tensor-out" expression, without creating a @{tf.data.Iterator}.
+  This function enables you to use a `tf.data.Dataset` in a stateless
+  "tensor-in tensor-out" expression, without creating a `tf.data.Iterator`.
   This can be useful when your preprocessing transformations are expressed
   as a `Dataset`, and you want to use the transformation at serving time.
   For example:
@@ -50,10 +50,10 @@
   ```
 
   Args:
-    dataset: A @{tf.data.Dataset} object containing a single element.
+    dataset: A `tf.data.Dataset` object containing a single element.
 
   Returns:
-    A nested structure of @{tf.Tensor} objects, corresponding to the single
+    A nested structure of `tf.Tensor` objects, corresponding to the single
     element of `dataset`.
 
   Raises:
@@ -77,11 +77,11 @@
   """Returns the result of reducing the `dataset` using `reducer`.
 
   Args:
-    dataset: A @{tf.data.Dataset} object.
-    reducer: A @{tf.contrib.data.Reducer} object representing the reduce logic.
+    dataset: A `tf.data.Dataset` object.
+    reducer: A `tf.contrib.data.Reducer` object representing the reduce logic.
 
   Returns:
-    A nested structure of @{tf.Tensor} objects, corresponding to the result
+    A nested structure of `tf.Tensor` objects, corresponding to the result
     of reducing `dataset` using `reducer`.
 
   Raises:
diff --git a/tensorflow/contrib/data/python/ops/grouping.py b/tensorflow/contrib/data/python/ops/grouping.py
index bd8d398..6edc1d7 100644
--- a/tensorflow/contrib/data/python/ops/grouping.py
+++ b/tensorflow/contrib/data/python/ops/grouping.py
@@ -50,7 +50,7 @@
 
   Returns:
     A `Dataset` transformation function, which can be passed to
-    @{tf.data.Dataset.apply}.
+    `tf.data.Dataset.apply`.
   """
 
   def _apply_fn(dataset):
@@ -92,7 +92,7 @@
 
   Returns:
     A `Dataset` transformation function, which can be passed to
-    @{tf.data.Dataset.apply}.
+    `tf.data.Dataset.apply`.
 
   Raises:
     ValueError: if neither or both of {`window_size`, `window_size_func`} are
@@ -142,11 +142,11 @@
     bucket_batch_sizes: `list<int>`, batch size per bucket. Length should be
       `len(bucket_boundaries) + 1`.
     padded_shapes: Nested structure of `tf.TensorShape` to pass to
-      @{tf.data.Dataset.padded_batch}. If not provided, will use
+      `tf.data.Dataset.padded_batch`. If not provided, will use
       `dataset.output_shapes`, which will result in variable length dimensions
       being padded out to the maximum length in each batch.
     padding_values: Values to pad with, passed to
-      @{tf.data.Dataset.padded_batch}. Defaults to padding with 0.
+      `tf.data.Dataset.padded_batch`. Defaults to padding with 0.
     pad_to_bucket_boundary: bool, if `False`, will pad dimensions with unknown
       size to maximum length in batch. If `True`, will pad dimensions with
       unknown size to bucket boundary minus 1 (i.e., the maximum length in each
@@ -155,7 +155,7 @@
 
   Returns:
     A `Dataset` transformation function, which can be passed to
-    @{tf.data.Dataset.apply}.
+    `tf.data.Dataset.apply`.
 
   Raises:
     ValueError: if `len(bucket_batch_sizes) != len(bucket_boundaries) + 1`.
diff --git a/tensorflow/contrib/data/python/ops/interleave_ops.py b/tensorflow/contrib/data/python/ops/interleave_ops.py
index bcc9595..5a1a351 100644
--- a/tensorflow/contrib/data/python/ops/interleave_ops.py
+++ b/tensorflow/contrib/data/python/ops/interleave_ops.py
@@ -42,7 +42,7 @@
 
   `parallel_interleave()` maps `map_func` across its input to produce nested
   datasets, and outputs their elements interleaved. Unlike
-  @{tf.data.Dataset.interleave}, it gets elements from `cycle_length` nested
+  `tf.data.Dataset.interleave`, it gets elements from `cycle_length` nested
   datasets in parallel, which increases the throughput, especially in the
   presence of stragglers. Furthermore, the `sloppy` argument can be used to
   improve performance, by relaxing the requirement that the outputs are produced
@@ -79,7 +79,7 @@
 
   Returns:
     A `Dataset` transformation function, which can be passed to
-    @{tf.data.Dataset.apply}.
+    `tf.data.Dataset.apply`.
   """
   def _apply_fn(dataset):
     return readers.ParallelInterleaveDataset(
@@ -138,7 +138,7 @@
 
   Returns:
     A `Dataset` transformation function, which can be passed to
-    @{tf.data.Dataset.apply}.
+    `tf.data.Dataset.apply`.
   """
   def _apply_fn(dataset):
     return readers.ParallelInterleaveDataset(
@@ -196,15 +196,15 @@
   """Samples elements at random from the datasets in `datasets`.
 
   Args:
-    datasets: A list of @{tf.data.Dataset} objects with compatible structure.
+    datasets: A list of `tf.data.Dataset` objects with compatible structure.
     weights: (Optional.) A list of `len(datasets)` floating-point values where
       `weights[i]` represents the probability with which an element should be
-      sampled from `datasets[i]`, or a @{tf.data.Dataset} object where each
+      sampled from `datasets[i]`, or a `tf.data.Dataset` object where each
       element is such a list. Defaults to a uniform distribution across
       `datasets`.
     seed: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the
       random seed that will be used to create the distribution. See
-      @{tf.set_random_seed} for behavior.
+      `tf.set_random_seed` for behavior.
 
   Returns:
     A dataset that interleaves elements from `datasets` at random, according to
@@ -262,8 +262,8 @@
   ```
 
   Args:
-    datasets: A list of @{tf.data.Dataset} objects with compatible structure.
-    choice_dataset: A @{tf.data.Dataset} of scalar `tf.int64` tensors between
+    datasets: A list of `tf.data.Dataset` objects with compatible structure.
+    choice_dataset: A `tf.data.Dataset` of scalar `tf.int64` tensors between
       `0` and `len(datasets) - 1`.
 
   Returns:
diff --git a/tensorflow/contrib/data/python/ops/iterator_ops.py b/tensorflow/contrib/data/python/ops/iterator_ops.py
index d2c1d0d..18515e2 100644
--- a/tensorflow/contrib/data/python/ops/iterator_ops.py
+++ b/tensorflow/contrib/data/python/ops/iterator_ops.py
@@ -118,7 +118,7 @@
      pipeline.
 
   For saving the input pipeline checkpoint alongside the model weights use
-  @{tf.contrib.data.make_saveable_from_iterator} directly to create a
+  `tf.contrib.data.make_saveable_from_iterator` directly to create a
   `SaveableObject` and add to the `SAVEABLE_OBJECTS` collection. Note, however,
   that you will need to be careful not to restore the training iterator during
   eval. You can do that by not adding the iterator to the SAVEABLE_OBJECTS
diff --git a/tensorflow/contrib/data/python/ops/map_defun.py b/tensorflow/contrib/data/python/ops/map_defun.py
new file mode 100644
index 0000000..54d5cd6
--- /dev/null
+++ b/tensorflow/contrib/data/python/ops/map_defun.py
@@ -0,0 +1,58 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Experimental API for applying a function to slices of input tensors."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import gen_dataset_ops
+
+
+def map_defun(fn, elems, output_dtypes, output_shapes):
+  """Map a function on the list of tensors unpacked from `elems` on dimension 0.
+
+  Args:
+    fn: A function (`function.Defun`) that takes a list of tensors and returns
+      another list of tensors. The output list has the same types as
+      output_dtypes. The elements of the output list have the same dimension 0
+      as `elems`, and the remaining dimensions correspond to those of
+      `output_shapes`.
+    elems: A list of tensors.
+    output_dtypes: A list of dtypes corresponding to the output types of the
+      function.
+    output_shapes: A list of `TensorShape`s corresponding to the output
+      shapes from each invocation of the function on slices of inputs.
+
+  Raises:
+    ValueError: if any of the inputs are malformed.
+
+  Returns:
+    A list of `Tensor` objects with the same types as `output_dtypes`.
+  """
+  if not isinstance(elems, list):
+    raise ValueError("`elems` must be a list of tensors.")
+  if not isinstance(output_dtypes, list):
+    raise ValueError("`output_dtypes` must be a list of dtypes.")
+  if not isinstance(output_shapes, list):
+    raise ValueError("`output_shapes` must be a list of shapes.")
+
+  elems = [ops.convert_to_tensor(e) for e in elems]
+  output_shapes = [tensor_shape.TensorShape(s) for s in output_shapes]
+  if not all(s.is_fully_defined() for s in output_shapes):
+    raise ValueError("All fn output shapes must be fully defined.")
+  return gen_dataset_ops.map_defun(elems, output_dtypes, output_shapes, fn)
diff --git a/tensorflow/contrib/data/python/ops/optimization.py b/tensorflow/contrib/data/python/ops/optimization.py
index 018c511..fa1b851 100644
--- a/tensorflow/contrib/data/python/ops/optimization.py
+++ b/tensorflow/contrib/data/python/ops/optimization.py
@@ -36,7 +36,7 @@
 
   Returns:
     A `Dataset` transformation function, which can be passed to
-    @{tf.data.Dataset.apply}.
+    `tf.data.Dataset.apply`.
   """
 
   def _apply_fn(dataset):
@@ -56,7 +56,7 @@
 
   Returns:
     A `Dataset` transformation function, which can be passed to
-    @{tf.data.Dataset.apply}.
+    `tf.data.Dataset.apply`.
   """
 
   def _apply_fn(dataset):
diff --git a/tensorflow/contrib/data/python/ops/prefetching_ops.py b/tensorflow/contrib/data/python/ops/prefetching_ops.py
index 0243c72..5222011 100644
--- a/tensorflow/contrib/data/python/ops/prefetching_ops.py
+++ b/tensorflow/contrib/data/python/ops/prefetching_ops.py
@@ -92,7 +92,7 @@
 
 # pylint: disable=protected-access
 class _PrefetchToDeviceIterator(object):
-  """A replacement for @{tf.data.Iterator} that prefetches to another device.
+  """A replacement for `tf.data.Iterator` that prefetches to another device.
 
   Args:
     input_dataset: The input dataset
@@ -158,7 +158,7 @@
             self._input_dataset)
 
   def get_next(self, name=None):
-    """See @{tf.data.Iterator.get_next}."""
+    """See `tf.data.Iterator.get_next`."""
     self._get_next_call_count += 1
     if self._get_next_call_count > iterator_ops.GET_NEXT_CALL_WARNING_THRESHOLD:
       warnings.warn(iterator_ops.GET_NEXT_CALL_WARNING_MESSAGE)
@@ -199,7 +199,7 @@
 
 
 class _PrefetchToDeviceEagerIterator(iterator_ops.EagerIterator):
-  """A replacement for @{tf.data.Iterator} that prefetches to another device.
+  """A replacement for `tf.data.Iterator` that prefetches to another device.
 
   Args:
     input_dataset: The input dataset
@@ -334,7 +334,7 @@
 def prefetch_to_device(device, buffer_size=None):
   """A transformation that prefetches dataset values to the given `device`.
 
-  NOTE: Although the transformation creates a @{tf.data.Dataset}, the
+  NOTE: Although the transformation creates a `tf.data.Dataset`, the
   transformation must be the final `Dataset` in the input pipeline.
 
   Args:
@@ -344,7 +344,7 @@
 
   Returns:
     A `Dataset` transformation function, which can be passed to
-    @{tf.data.Dataset.apply}.
+    `tf.data.Dataset.apply`.
   """
   def _apply_fn(dataset):
     return _PrefetchToDeviceDataset(dataset, device, buffer_size)
@@ -361,7 +361,7 @@
 
   Returns:
     A `Dataset` transformation function, which can be passed to
-    @{tf.data.Dataset.apply}.
+    `tf.data.Dataset.apply`.
   """
 
   def _apply_fn(dataset):
@@ -631,6 +631,7 @@
   def __init__(self,
                dataset,
                devices,
+               max_buffer_size=1,
                prefetch_buffer_size=1,
                source_device="/cpu:0"):
     """Constructs a MultiDeviceIterator.
@@ -638,6 +639,7 @@
     Args:
       dataset: The input dataset to be iterated over.
       devices: The list of devices to fetch data to.
+      max_buffer_size: Maximum size of the host side per device buffer to keep.
       prefetch_buffer_size: if > 1, then we setup a buffer on each device
         to prefetch into.
       source_device: The host device to place the `dataset` on.
@@ -668,7 +670,8 @@
       # iterators and the multi-device iterator.
       self._incarnation_id = gen_dataset_ops.multi_device_iterator_init(
           self._dataset._as_variant_tensor(),  # pylint: disable=protected-access
-          self._multi_device_iterator_resource)
+          self._multi_device_iterator_resource,
+          max_buffer_size=max_buffer_size)
 
     # TODO(rohanj): Explore the possibility of the MultiDeviceIterator to
     # initialize the device side of the pipeline. This would allow the
diff --git a/tensorflow/contrib/data/python/ops/readers.py b/tensorflow/contrib/data/python/ops/readers.py
index 14d69f8..3882d4b 100644
--- a/tensorflow/contrib/data/python/ops/readers.py
+++ b/tensorflow/contrib/data/python/ops/readers.py
@@ -234,7 +234,7 @@
 
   Args:
     file_pattern: List of files or patterns of TFRecord file paths.
-      See @{tf.gfile.Glob} for pattern rules.
+      See `tf.gfile.Glob` for pattern rules.
     batch_size: An int representing the number of records to combine
       in a single batch.
     parser_fn: (Optional.) A function accepting string input to parse
@@ -340,7 +340,7 @@
 
   Args:
     file_pattern: List of files or patterns of file paths containing CSV
-      records. See @{tf.gfile.Glob} for pattern rules.
+      records. See `tf.gfile.Glob` for pattern rules.
     batch_size: An int representing the number of records to combine
       in a single batch.
     column_names: An optional list of strings that corresponds to the CSV
diff --git a/tensorflow/contrib/data/python/ops/resampling.py b/tensorflow/contrib/data/python/ops/resampling.py
index 182a5c6..75642f1 100644
--- a/tensorflow/contrib/data/python/ops/resampling.py
+++ b/tensorflow/contrib/data/python/ops/resampling.py
@@ -50,7 +50,7 @@
 
   Returns:
     A `Dataset` transformation function, which can be passed to
-    @{tf.data.Dataset.apply}.
+    `tf.data.Dataset.apply`.
   """
   def _apply_fn(dataset):
     """Function from `Dataset` to `Dataset` that applies the transformation."""
diff --git a/tensorflow/contrib/data/python/ops/scan_ops.py b/tensorflow/contrib/data/python/ops/scan_ops.py
index ea9dcfe..6b002b4 100644
--- a/tensorflow/contrib/data/python/ops/scan_ops.py
+++ b/tensorflow/contrib/data/python/ops/scan_ops.py
@@ -151,7 +151,7 @@
 def scan(initial_state, scan_func):
   """A transformation that scans a function across an input dataset.
 
-  This transformation is a stateful relative of @{tf.data.Dataset.map}.
+  This transformation is a stateful relative of `tf.data.Dataset.map`.
   In addition to mapping `scan_func` across the elements of the input dataset,
   `scan()` accumulates one or more state tensors, whose initial values are
   `initial_state`.
@@ -166,7 +166,7 @@
 
   Returns:
     A `Dataset` transformation function, which can be passed to
-    @{tf.data.Dataset.apply}.
+    `tf.data.Dataset.apply`.
   """
   def _apply_fn(dataset):
     return _ScanDataset(dataset, initial_state, scan_func)
diff --git a/tensorflow/contrib/data/python/ops/shuffle_ops.py b/tensorflow/contrib/data/python/ops/shuffle_ops.py
index d7f8a73..4356721 100644
--- a/tensorflow/contrib/data/python/ops/shuffle_ops.py
+++ b/tensorflow/contrib/data/python/ops/shuffle_ops.py
@@ -92,11 +92,11 @@
       indefinitely.
     seed: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the
       random seed that will be used to create the distribution. See
-      @{tf.set_random_seed} for behavior.
+      `tf.set_random_seed` for behavior.
 
   Returns:
     A `Dataset` transformation function, which can be passed to
-    @{tf.data.Dataset.apply}.
+    `tf.data.Dataset.apply`.
   """
 
   def _apply_fn(dataset):  # pylint: disable=missing-docstring
diff --git a/tensorflow/contrib/data/python/ops/sliding.py b/tensorflow/contrib/data/python/ops/sliding.py
index e9dd745..8025dcd 100644
--- a/tensorflow/contrib/data/python/ops/sliding.py
+++ b/tensorflow/contrib/data/python/ops/sliding.py
@@ -109,7 +109,7 @@
 
   Returns:
     A `Dataset` transformation function, which can be passed to
-    @{tf.data.Dataset.apply}.
+    `tf.data.Dataset.apply`.
 
   Raises:
     ValueError: if invalid arguments are provided.
diff --git a/tensorflow/contrib/data/python/ops/stats_ops.py b/tensorflow/contrib/data/python/ops/stats_ops.py
index 97931f7..3b4e981 100644
--- a/tensorflow/contrib/data/python/ops/stats_ops.py
+++ b/tensorflow/contrib/data/python/ops/stats_ops.py
@@ -29,7 +29,7 @@
   """A stateful resource that aggregates statistics from one or more iterators.
 
   To record statistics, use one of the custom transformation functions defined
-  in this module when defining your @{tf.data.Dataset}. All statistics will be
+  in this module when defining your `tf.data.Dataset`. All statistics will be
   aggregated by the `StatsAggregator` that is associated with a particular
   iterator (see below). For example, to record the total number of bytes
   produced by iterating over a dataset:
@@ -39,7 +39,7 @@
   dataset = dataset.apply(stats_ops.bytes_produced_stats("total_bytes"))
   ```
 
-  To associate a `StatsAggregator` with a @{tf.data.Iterator} object, use
+  To associate a `StatsAggregator` with a `tf.data.Iterator` object, use
   the following pattern:
 
   ```python
@@ -55,7 +55,7 @@
 
   To get a protocol buffer summary of the currently aggregated statistics,
   use the `StatsAggregator.get_summary()` tensor. The easiest way to do this
-  is to add the returned tensor to the @{tf.GraphKeys.SUMMARIES} collection,
+  is to add the returned tensor to the `tf.GraphKeys.SUMMARIES` collection,
   so that the summaries will be included with any existing summaries.
 
   ```python
@@ -74,13 +74,13 @@
     self._resource = gen_dataset_ops.stats_aggregator_handle()
 
   def get_summary(self):
-    """Returns a string @{tf.Tensor} that summarizes the aggregated statistics.
+    """Returns a string `tf.Tensor` that summarizes the aggregated statistics.
 
-    The returned tensor will contain a serialized @{tf.summary.Summary} protocol
+    The returned tensor will contain a serialized `tf.summary.Summary` protocol
     buffer, which can be used with the standard TensorBoard logging facilities.
 
     Returns:
-      A scalar string @{tf.Tensor} that summarizes the aggregated statistics.
+      A scalar string `tf.Tensor` that summarizes the aggregated statistics.
     """
     return gen_dataset_ops.stats_aggregator_summary(self._resource)
 
@@ -122,7 +122,7 @@
 
   Returns:
     A `Dataset` transformation function, which can be passed to
-    @{tf.data.Dataset.apply}.
+    `tf.data.Dataset.apply`.
   """
 
   def _apply_fn(dataset):
@@ -145,7 +145,7 @@
 
   Returns:
     A `Dataset` transformation function, which can be passed to
-    @{tf.data.Dataset.apply}.
+    `tf.data.Dataset.apply`.
   """
 
   def _apply_fn(dataset):
@@ -169,7 +169,7 @@
 
   Returns:
     A `Dataset` transformation function, which can be passed to
-    @{tf.data.Dataset.apply}.
+    `tf.data.Dataset.apply`.
   """
 
   def _apply_fn(dataset):
@@ -192,7 +192,7 @@
 
   Returns:
     A `Dataset` transformation function, which can be passed to
-    @{tf.data.Dataset.apply}.
+    `tf.data.Dataset.apply`.
   """
 
   def _apply_fn(dataset):
diff --git a/tensorflow/contrib/data/python/ops/threadpool.py b/tensorflow/contrib/data/python/ops/threadpool.py
index 9af1e78..dc67acc 100644
--- a/tensorflow/contrib/data/python/ops/threadpool.py
+++ b/tensorflow/contrib/data/python/ops/threadpool.py
@@ -100,6 +100,6 @@
   Returns:
     A dataset containing the same values as `dataset`, but which uses
     `thread_pool` to compute any of its parallel operations (such as
-    @{tf.data.Dataset.map}).
+    `tf.data.Dataset.map`).
   """
   return _ThreadPoolDataset(dataset, thread_pool)
diff --git a/tensorflow/contrib/data/python/ops/unique.py b/tensorflow/contrib/data/python/ops/unique.py
index e0ce0a4..e0d6063 100644
--- a/tensorflow/contrib/data/python/ops/unique.py
+++ b/tensorflow/contrib/data/python/ops/unique.py
@@ -38,7 +38,7 @@
 
   Returns:
     A `Dataset` transformation function, which can be passed to
-    @{tf.data.Dataset.apply}.
+    `tf.data.Dataset.apply`.
   """
 
   def _apply_fn(dataset):
diff --git a/tensorflow/contrib/data/python/ops/writers.py b/tensorflow/contrib/data/python/ops/writers.py
index f53bd3f..c455fdc 100644
--- a/tensorflow/contrib/data/python/ops/writers.py
+++ b/tensorflow/contrib/data/python/ops/writers.py
@@ -38,13 +38,13 @@
         argument_dtype=dtypes.string)
 
   def write(self, dataset):
-    """Returns a @{tf.Operation} to write a dataset to a file.
+    """Returns a `tf.Operation` to write a dataset to a file.
 
     Args:
-      dataset: a @{tf.data.Dataset} whose elements are to be written to a file
+      dataset: a `tf.data.Dataset` whose elements are to be written to a file
 
     Returns:
-      A @{tf.Operation} that, when run, writes contents of `dataset` to a file.
+      A `tf.Operation` that, when run, writes contents of `dataset` to a file.
     """
     if not isinstance(dataset, dataset_ops.Dataset):
       raise TypeError("`dataset` must be a `tf.data.Dataset` object.")
diff --git a/tensorflow/contrib/distribute/BUILD b/tensorflow/contrib/distribute/BUILD
index d3628d4..c16f1d6 100644
--- a/tensorflow/contrib/distribute/BUILD
+++ b/tensorflow/contrib/distribute/BUILD
@@ -29,7 +29,6 @@
         "//tensorflow/contrib/distribute/python:cross_tower_ops",
         "//tensorflow/contrib/distribute/python:mirrored_strategy",
         "//tensorflow/contrib/distribute/python:monitor",
-        "//tensorflow/contrib/distribute/python:multi_worker_strategy",
         "//tensorflow/contrib/distribute/python:one_device_strategy",
         "//tensorflow/contrib/distribute/python:parameter_server_strategy",
         "//tensorflow/contrib/distribute/python:step_fn",
diff --git a/tensorflow/contrib/distribute/__init__.py b/tensorflow/contrib/distribute/__init__.py
index 9123ca7..588a4f2 100644
--- a/tensorflow/contrib/distribute/__init__.py
+++ b/tensorflow/contrib/distribute/__init__.py
@@ -22,13 +22,13 @@
 from tensorflow.contrib.distribute.python.collective_all_reduce_strategy import CollectiveAllReduceStrategy
 from tensorflow.contrib.distribute.python.cross_tower_ops import *
 from tensorflow.contrib.distribute.python.mirrored_strategy import MirroredStrategy
-from tensorflow.contrib.distribute.python.multi_worker_strategy import MultiWorkerMirroredStrategy
 from tensorflow.contrib.distribute.python.monitor import Monitor
 from tensorflow.contrib.distribute.python.one_device_strategy import OneDeviceStrategy
 from tensorflow.contrib.distribute.python.parameter_server_strategy import ParameterServerStrategy
 from tensorflow.contrib.distribute.python.step_fn import *
 from tensorflow.contrib.distribute.python.tpu_strategy import TPUStrategy
 from tensorflow.python.training.distribute import *
+from tensorflow.python.training.distribution_strategy_context import *
 
 from tensorflow.python.util.all_util import remove_undocumented
 
@@ -39,7 +39,6 @@
     'CrossTowerOps',
     'DistributionStrategy',
     'MirroredStrategy',
-    'MultiWorkerMirroredStrategy',
     'Monitor',
     'OneDeviceStrategy',
     'ParameterServerStrategy',
@@ -55,6 +54,7 @@
     'get_tower_context',
     'has_distribution_strategy',
     'require_tower_context',
+    'UpdateContext',
 ]
 
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/distribute/python/BUILD b/tensorflow/contrib/distribute/python/BUILD
index d9e66dd..59efd17 100644
--- a/tensorflow/contrib/distribute/python/BUILD
+++ b/tensorflow/contrib/distribute/python/BUILD
@@ -57,7 +57,7 @@
         "//tensorflow/python/eager:context",
         "//tensorflow/python:device_util",
         "//tensorflow/python/eager:test",
-        "//tensorflow/python/estimator:model_fn",
+        "//tensorflow/python/estimator:estimator_py",
     ],
     tags = [
         "no_pip",
@@ -72,31 +72,21 @@
         ":cross_tower_ops",
         ":shared_variable_creator",
         ":values",
+        "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:device",
         "//tensorflow/python:device_util",
         "//tensorflow/python:distribute",
         "//tensorflow/python:framework_ops",
-        "//tensorflow/python:math_ops",
         "//tensorflow/python:pywrap_tensorflow",
         "//tensorflow/python:training",
+        "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:tape",
-        "@six_archive//:six",
-    ],
-)
-
-py_library(
-    name = "multi_worker_strategy",
-    srcs = ["multi_worker_strategy.py"],
-    visibility = ["//tensorflow:internal"],
-    deps = [
-        ":mirrored_strategy",
-        ":values",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:training",
-        "//tensorflow/python:util",
     ],
 )
 
@@ -114,6 +104,7 @@
         "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:training",
         "//tensorflow/python:util",
+        "//tensorflow/python/distribute:multi_worker_util",
     ],
 )
 
@@ -184,9 +175,9 @@
     ],
     deps = [
         ":mirrored_strategy",
-        ":multi_worker_strategy",
         ":one_device_strategy",
         ":tpu_strategy",
+        "//tensorflow/contrib/cluster_resolver:cluster_resolver_pip",
         "//tensorflow/contrib/optimizer_v2:training",
         "//tensorflow/python:distribute",
         "//tensorflow/python:framework_ops",
@@ -218,9 +209,13 @@
     ],
     deps = [
         ":mirrored_strategy",
+        ":multi_worker_test_base",
         ":strategy_test_lib",
+        "//tensorflow/python:constant_op",
         "//tensorflow/python:distribute",
+        "//tensorflow/python:framework_ops",
         "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:test",
@@ -266,7 +261,7 @@
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
         "//tensorflow/python/eager:context",
-        "//tensorflow/python/estimator:run_config",
+        "//tensorflow/python/estimator:estimator_py",
         "@absl_py//absl/testing:parameterized",
     ],
 )
@@ -314,7 +309,7 @@
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:distributed_framework_test_lib",
         "//tensorflow/python:session",
-        "//tensorflow/python/estimator:run_config",
+        "//tensorflow/python/estimator:estimator_py",
         "//third_party/py/numpy",
     ],
 )
@@ -369,7 +364,7 @@
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
         "//tensorflow/python/eager:context",
-        "//tensorflow/python/estimator:run_config",
+        "//tensorflow/python/estimator:estimator_py",
         "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
     ],
@@ -439,11 +434,7 @@
         "//tensorflow/contrib/optimizer_v2:training",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/eager:test",
-        "//tensorflow/python/estimator:dnn_linear_combined",
-        "//tensorflow/python/estimator:export_export",
-        "//tensorflow/python/estimator:numpy_io",
-        "//tensorflow/python/estimator:prediction_keys",
-        "//tensorflow/python/estimator:run_config",
+        "//tensorflow/python/estimator:estimator_py",
         "//tensorflow/python/feature_column",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:platform",
@@ -469,17 +460,27 @@
     ],
 )
 
+py_library(
+    name = "step_fn_test_lib",
+    testonly = 1,
+    srcs = ["step_fn_test.py"],
+    deps = [
+        ":combinations",
+        ":single_loss_example",
+        "//tensorflow/contrib/tpu:tpu_lib",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:test",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
 cuda_py_test(
     name = "step_fn_test",
     srcs = ["step_fn_test.py"],
     additional_deps = [
-        ":single_loss_example",
-        ":combinations",
-        "@absl_py//absl/testing:parameterized",
-        "//third_party/py/numpy",
-        "//tensorflow/python:variables",
-        "//tensorflow/python/eager:context",
-        "//tensorflow/python/eager:test",
+        ":step_fn_test_lib",
     ],
     tags = [
         "multi_and_single_gpu",
@@ -680,8 +681,7 @@
         "//tensorflow/contrib/distribute/python:mirrored_strategy",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:training",
-        "//tensorflow/python/estimator:keras",
-        "//tensorflow/python/estimator:run_config",
+        "//tensorflow/python/estimator:estimator_py",
         "//tensorflow/python/keras",
     ],
     tags = [
diff --git a/tensorflow/contrib/distribute/python/combinations.py b/tensorflow/contrib/distribute/python/combinations.py
index 52f73dd..2fbadfe 100644
--- a/tensorflow/contrib/distribute/python/combinations.py
+++ b/tensorflow/contrib/distribute/python/combinations.py
@@ -46,8 +46,8 @@
 from absl.testing import parameterized
 import six
 
+from tensorflow.contrib.cluster_resolver import TPUClusterResolver
 from tensorflow.contrib.distribute.python import mirrored_strategy as mirrored_lib
-from tensorflow.contrib.distribute.python import multi_worker_strategy
 from tensorflow.contrib.distribute.python import one_device_strategy as one_device_lib
 from tensorflow.contrib.distribute.python import tpu_strategy as tpu_lib
 from tensorflow.contrib.optimizer_v2 import adam as adam_v2
@@ -55,7 +55,7 @@
 from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.training import adam
-from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.training import gradient_descent
 from tensorflow.python.util import tf_inspect
 
@@ -319,12 +319,15 @@
 # pylint: disable=g-long-lambda
 default_strategy = NamedDistribution(
     "Default",
-    lambda: distribute_lib._default_distribution_strategy,  # pylint: disable=protected-access
+    distribution_strategy_context._get_default_distribution_strategy,  # pylint: disable=protected-access
     required_gpus=None)
 one_device_strategy = NamedDistribution(
     "OneDeviceCPU", lambda: one_device_lib.OneDeviceStrategy("/cpu:0"),
     required_gpus=None)
-tpu_strategy = NamedDistribution("TPU", tpu_lib.TPUStrategy, required_tpu=True)
+tpu_strategy = NamedDistribution(
+    "TPU", lambda: tpu_lib.TPUStrategy(
+        TPUClusterResolver(""), steps_per_run=5),
+    required_tpu=True)
 # Note that we disable prefetching for testing since prefetching makes
 # the input non-deterministic.
 mirrored_strategy_with_gpu_and_cpu = NamedDistribution(
@@ -340,42 +343,44 @@
 
 multi_worker_strategy_with_cpu = NamedDistribution(
     "MultiWorkerCPU",
-    lambda: multi_worker_strategy.MultiWorkerMirroredStrategy(
-        cluster={
+    lambda: mirrored_lib.MirroredStrategy(
+        cluster_spec={
             "worker": [
                 "/job:worker/replica:0/task:0", "/job:worker/replica:0/task:1"
             ]
         },
-        num_gpus_per_worker=0), 0)
+        num_gpus=0), 0)
 multi_worker_strategy_with_one_gpu = NamedDistribution(
     "MultiWorker1GPU",
-    lambda: multi_worker_strategy.MultiWorkerMirroredStrategy(
-        cluster={
+    lambda: mirrored_lib.MirroredStrategy(
+        cluster_spec={
             "worker": [
                 "/job:worker/replica:0/task:0", "/job:worker/replica:0/task:1"
             ]
         },
-        num_gpus_per_worker=1), 1)
+        num_gpus=1), 1)
 multi_worker_strategy_with_two_gpus = NamedDistribution(
     "MultiWorker2GPUs",
-    lambda: multi_worker_strategy.MultiWorkerMirroredStrategy(
-        cluster={
+    lambda: mirrored_lib.MirroredStrategy(
+        cluster_spec={
             "worker": [
                 "/job:worker/replica:0/task:0", "/job:worker/replica:0/task:1"
             ]
         },
-        num_gpus_per_worker=2), 2)
+        num_gpus=2), 2)
 
 adam_optimizer_v1_fn = NamedObject(
     "AdamV1", lambda: adam.AdamOptimizer(0.2, epsilon=1))
 gradient_descent_optimizer_v1_fn = NamedObject(
     "GradientDescentV1", lambda: gradient_descent.GradientDescentOptimizer(0.2))
+optimizers_v1 = [adam_optimizer_v1_fn, gradient_descent_optimizer_v1_fn]
 
 adam_optimizer_v2_fn = NamedObject(
     "AdamV2", lambda: adam_v2.AdamOptimizer(0.2, epsilon=1))
 gradient_descent_optimizer_v2_fn = NamedObject(
     "GradientDescentV2",
     lambda: gradient_descent_v2.GradientDescentOptimizer(0.2))
+optimizers_v2 = [adam_optimizer_v2_fn, gradient_descent_optimizer_v2_fn]
 
 graph_and_eager_modes = ["graph", "eager"]
 
@@ -387,7 +392,7 @@
           one_device_strategy, mirrored_strategy_with_gpu_and_cpu,
           mirrored_strategy_with_two_gpus
       ],
-      optimizer_fn=[adam_optimizer_v1_fn, gradient_descent_optimizer_v1_fn])
+      optimizer_fn=optimizers_v1)
 
 
 def distributions_and_v2_optimizers():
@@ -397,4 +402,4 @@
           one_device_strategy, mirrored_strategy_with_gpu_and_cpu,
           mirrored_strategy_with_two_gpus
       ],
-      optimizer_fn=[adam_optimizer_v2_fn, gradient_descent_optimizer_v2_fn])
+      optimizer_fn=optimizers_v2)
diff --git a/tensorflow/contrib/distribute/python/cross_tower_ops.py b/tensorflow/contrib/distribute/python/cross_tower_ops.py
index 9b55343..3a7addf 100644
--- a/tensorflow/contrib/distribute/python/cross_tower_ops.py
+++ b/tensorflow/contrib/distribute/python/cross_tower_ops.py
@@ -157,7 +157,7 @@
 
     Args:
       aggregation: Indicates how a variable will be aggregated. Accepted values
-        are @{tf.VariableAggregation.SUM}, @{tf.VariableAggregation.MEAN}.
+        are `tf.VariableAggregation.SUM`, `tf.VariableAggregation.MEAN`.
       per_device_value: a PerDevice object.
       destinations: the reduction destinations.
 
@@ -181,7 +181,7 @@
 
     Args:
       aggregation: Indicates how a variable will be aggregated. Accepted values
-        are @{tf.VariableAggregation.SUM}, @{tf.VariableAggregation.MEAN}.
+        are `tf.VariableAggregation.SUM`, `tf.VariableAggregation.MEAN`.
       value_destination_pairs: a list or a tuple of tuples of PerDevice objects
         and destinations. If a destination is None, then the destinations
         are set to match the devices of the input PerDevice object.
@@ -305,7 +305,7 @@
       cross_tower_utils.aggregate_gradients_using*.
     destinations: a list of device strings for returned Mirrored objects.
     aggregation: Indicates how a variable will be aggregated. Accepted values
-      are @{tf.VariableAggregation.SUM}, @{tf.VariableAggregation.MEAN}.
+      are `tf.VariableAggregation.SUM`, `tf.VariableAggregation.MEAN`.
     num_between_graph_workers: number of workers in the between-graph
       replication.
 
diff --git a/tensorflow/contrib/distribute/python/estimator_integration_test.py b/tensorflow/contrib/distribute/python/estimator_integration_test.py
index a0bb144..cc626c3 100644
--- a/tensorflow/contrib/distribute/python/estimator_integration_test.py
+++ b/tensorflow/contrib/distribute/python/estimator_integration_test.py
@@ -29,6 +29,7 @@
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import test
 from tensorflow.python.estimator import run_config
+from tensorflow.python.estimator import training
 from tensorflow.python.estimator.canned import dnn_linear_combined
 from tensorflow.python.estimator.canned import prediction_keys
 from tensorflow.python.estimator.export import export
@@ -63,8 +64,9 @@
               combinations.one_device_strategy,
               combinations.mirrored_strategy_with_gpu_and_cpu,
               combinations.mirrored_strategy_with_two_gpus
-          ]))
-  def test_complete_flow_with_mode(self, distribution):
+          ],
+          use_train_and_evaluate=[True, False]))
+  def test_complete_flow_with_mode(self, distribution, use_train_and_evaluate):
     label_dimension = 2
     input_dimension = label_dimension
     batch_size = 10
@@ -75,8 +77,11 @@
         y=data,
         batch_size=batch_size // len(distribution.worker_devices),
         shuffle=True)
-    eval_input_fn = numpy_io.numpy_input_fn(
-        x={'x': data}, y=data, batch_size=batch_size, shuffle=False)
+    eval_input_fn = self.dataset_input_fn(
+        x={'x': data},
+        y=data,
+        batch_size=batch_size // len(distribution.worker_devices),
+        shuffle=False)
     predict_input_fn = numpy_io.numpy_input_fn(
         x={'x': data}, batch_size=batch_size, shuffle=False)
 
@@ -100,9 +105,15 @@
             train_distribute=distribution, eval_distribute=distribution))
 
     num_steps = 10
-    estimator.train(train_input_fn, steps=num_steps)
+    if use_train_and_evaluate:
+      scores, _ = training.train_and_evaluate(
+          estimator,
+          training.TrainSpec(train_input_fn, max_steps=num_steps),
+          training.EvalSpec(eval_input_fn))
+    else:
+      estimator.train(train_input_fn, steps=num_steps)
+      scores = estimator.evaluate(eval_input_fn)
 
-    scores = estimator.evaluate(eval_input_fn)
     self.assertEqual(num_steps, scores[ops.GraphKeys.GLOBAL_STEP])
     self.assertIn('loss', six.iterkeys(scores))
 
diff --git a/tensorflow/contrib/distribute/python/keras_test.py b/tensorflow/contrib/distribute/python/keras_test.py
index ec0ca68..4facd72 100644
--- a/tensorflow/contrib/distribute/python/keras_test.py
+++ b/tensorflow/contrib/distribute/python/keras_test.py
@@ -241,6 +241,47 @@
                 validation_data=dataset, validation_steps=2)
       model.predict(dataset, steps=2)
 
+  def test_fit_with_tuple_and_dict_dataset_inputs(self):
+    with self.test_session():
+      a = keras.layers.Input(shape=(3,), name='input_a')
+      b = keras.layers.Input(shape=(3,), name='input_b')
+
+      dense = keras.layers.Dense(4, name='dense')
+      c = dense(a)
+      d = dense(b)
+      e = keras.layers.Dropout(0.5, name='dropout')(c)
+
+      model = keras.models.Model([a, b], [d, e])
+
+      optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.001)
+      loss = 'mse'
+      metrics = ['mae']
+      strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:0',
+                                                     '/device:CPU:0'])
+      model.compile(optimizer, loss, metrics=metrics, distribute=strategy)
+
+      input_a_np = np.random.random((10, 3))
+      input_b_np = np.random.random((10, 3))
+      output_d_np = np.random.random((10, 4))
+      output_e_np = np.random.random((10, 4))
+
+      # Test with tuples
+      dataset_tuple = dataset_ops.Dataset.from_tensor_slices((
+          (input_a_np, input_b_np), (output_d_np, output_e_np)))
+      dataset_tuple = dataset_tuple.repeat(100)
+      dataset_tuple = dataset_tuple.batch(10)
+
+      model.fit(dataset_tuple, epochs=1, steps_per_epoch=2, verbose=1)
+
+      # Test with dict
+      dataset_dict = dataset_ops.Dataset.from_tensor_slices((
+          {'input_a': input_a_np, 'input_b': input_b_np},
+          (output_d_np, output_e_np)))
+      dataset_dict = dataset_dict.repeat(100)
+      dataset_dict = dataset_dict.batch(10)
+
+      model.fit(dataset_dict, epochs=1, steps_per_epoch=2, verbose=1)
+
   def test_fit_eval_and_predict_methods_on_dataset(self):
     with self.test_session():
       x = keras.layers.Input(shape=(3,), name='input')
diff --git a/tensorflow/contrib/distribute/python/metrics_v1_test.py b/tensorflow/contrib/distribute/python/metrics_v1_test.py
index 2f3d6bd..8163494 100644
--- a/tensorflow/contrib/distribute/python/metrics_v1_test.py
+++ b/tensorflow/contrib/distribute/python/metrics_v1_test.py
@@ -68,6 +68,8 @@
       "predictions": [1., .75, .25, 0.]}).repeat()
 
 
+# TODO(priyag): Add TPU Strategy to this once metrics aggregate correctly using
+# TowerLocalVariables on TPUs. Submit http://cl/208914352.
 def all_combinations():
   return combinations.combine(
       distribution=[combinations.default_strategy,
diff --git a/tensorflow/contrib/distribute/python/minimize_loss_test.py b/tensorflow/contrib/distribute/python/minimize_loss_test.py
index aeeb955..516ede7 100644
--- a/tensorflow/contrib/distribute/python/minimize_loss_test.py
+++ b/tensorflow/contrib/distribute/python/minimize_loss_test.py
@@ -25,11 +25,13 @@
 from tensorflow.contrib.distribute.python import mirrored_strategy
 from tensorflow.contrib.distribute.python.single_loss_example import batchnorm_example
 from tensorflow.contrib.distribute.python.single_loss_example import minimize_loss_example
-from tensorflow.contrib.tpu.python.tpu import tpu
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import context
 from tensorflow.python.eager import test
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
+from tensorflow.python.layers import core
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variable_scope
@@ -43,32 +45,60 @@
       combinations.times(
           combinations.distributions_and_v1_optimizers(),
           combinations.combine(mode=["graph"], use_callable_loss=[True, False])
-          + combinations.combine(mode=["eager"], use_callable_loss=[True]),
-          combinations.combine(is_tpu=[False])) + combinations.combine(
-              distribution=[combinations.tpu_strategy],
-              optimizer_fn=[
-                  combinations.adam_optimizer_v1_fn,
-                  # TODO(isaprykin):  Make Adam v2 work with while_loops
-                  # and TPUs.
-              ],
-              mode=["graph"],
-              use_callable_loss=[False],
-              is_tpu=[True]))
-  def testTrainNetwork(self, distribution, optimizer_fn, use_callable_loss,
-                       is_tpu):
-    # TODO(priyag): Remove this once the step TPU Strategy is stable.
-    if is_tpu:
-      self.skipTest("TPU tests are WIP.")
-
+          + combinations.combine(mode=["eager"], use_callable_loss=[True])) +
+      combinations.combine(
+          distribution=[combinations.tpu_strategy],
+          optimizer_fn=combinations.optimizers_v1,
+          mode=["graph"],
+          use_callable_loss=[True, False]))
+  def testTrainNetwork(self, distribution, optimizer_fn, use_callable_loss):
     with distribution.scope():
       model_fn, dataset_fn, layer = minimize_loss_example(
           optimizer_fn, use_bias=True, use_callable_loss=use_callable_loss)
 
-      # TODO(isaprykin):  Eliminate `is_tpu`. Probably add a
-      # `DistributionStrategy.create_monitor` so that each DistributionStrategy
-      # could influence its training loop. That method would return an instance
-      # of Monitor.  TPUMonitor would execute tpu.initialize_system() and
-      # tpu.shutdown_system().
+      def step_fn(ctx, *inputs):
+        del ctx  # Unused
+        return distribution.group(
+            distribution.call_for_each_tower(
+                model_fn, *inputs, run_concurrently=layer.built))
+
+      iterator = distribution.distribute_dataset(
+          dataset_fn).make_one_shot_iterator()
+
+      def run_step():
+        return distribution.run_steps_on_dataset(
+            step_fn, iterator, iterations=2).run_op
+
+      self.evaluate(distribution.initialize())
+      if not context.executing_eagerly():
+        with self.test_session() as sess:
+          run_step = sess.make_callable(run_step())
+      self.evaluate(variables_lib.global_variables_initializer())
+
+      weights, biases = [], []
+      for _ in range(5):
+        run_step()
+
+        weights.append(self.evaluate(layer.kernel))
+        biases.append(self.evaluate(layer.bias))
+
+      self.evaluate(distribution.finalize())
+
+      error = abs(numpy.add(numpy.squeeze(weights), numpy.squeeze(biases)) - 1)
+      is_not_increasing = all(y <= x for x, y in zip(error, error[1:]))
+      self.assertTrue(is_not_increasing)
+
+  @combinations.generate(
+      combinations.times(
+          combinations.distributions_and_v1_optimizers(),
+          combinations.combine(mode=["graph"], use_callable_loss=[True, False])
+          + combinations.combine(mode=["eager"], use_callable_loss=[True])))
+  def testTrainNetworkByCallForEachTower(self, distribution, optimizer_fn,
+                                         use_callable_loss):
+    with distribution.scope():
+      model_fn, dataset_fn, layer = minimize_loss_example(
+          optimizer_fn, use_bias=True, use_callable_loss=use_callable_loss)
+
       iterator = distribution.distribute_dataset(
           dataset_fn).make_one_shot_iterator()
 
@@ -79,8 +109,6 @@
 
       if not context.executing_eagerly():
         with self.test_session() as sess:
-          if is_tpu:
-            sess.run(tpu.initialize_system())
           run_step = sess.make_callable(run_step())
         self.evaluate(variables_lib.global_variables_initializer())
 
@@ -91,10 +119,6 @@
         weights.append(self.evaluate(layer.kernel))
         biases.append(self.evaluate(layer.bias))
 
-      if is_tpu:
-        with self.test_session() as sess:
-          sess.run(tpu.shutdown_system())
-
       error = abs(numpy.add(numpy.squeeze(weights), numpy.squeeze(biases)) - 1)
       is_not_increasing = all(y <= x for x, y in zip(error, error[1:]))
       self.assertTrue(is_not_increasing)
@@ -103,22 +127,12 @@
       combinations.times(
           combinations.distributions_and_v1_optimizers() +
           combinations.distributions_and_v2_optimizers(),
-          combinations.combine(mode=["graph", "eager"], is_tpu=[False])) +
+          combinations.combine(mode=["graph", "eager"])) +
       combinations.combine(
           distribution=[combinations.tpu_strategy],
-          optimizer_fn=[
-              combinations.adam_optimizer_v1_fn,
-              combinations.gradient_descent_optimizer_v1_fn,
-              combinations.gradient_descent_optimizer_v2_fn,
-          ],
-          mode=["graph"],
-          is_tpu=[True]))
-
-  def testOptimizerInsideModelFn(self, distribution, optimizer_fn, is_tpu):
-    # TODO(priyag): Remove this once the step TPU Strategy is stable.
-    if is_tpu:
-      self.skipTest("TPU tests are WIP.")
-
+          optimizer_fn=combinations.optimizers_v1+combinations.optimizers_v2,
+          mode=["graph"]))
+  def testOptimizerInsideModelFn(self, distribution, optimizer_fn):
     created_variables = []
     trainable_variables = []
 
@@ -139,26 +153,28 @@
           use_callable_loss=True,
           create_optimizer_inside_model_fn=True)
 
+      def step_fn(ctx, *inputs):
+        del ctx  # Unused
+        return distribution.group(
+            distribution.call_for_each_tower(
+                model_fn, *inputs, run_concurrently=layer.built))
+
       iterator = distribution.distribute_dataset(
           dataset_fn).make_one_shot_iterator()
 
       def run_step():
-        return distribution.group(
-            distribution.call_for_each_tower(
-                model_fn, iterator.get_next(), run_concurrently=layer.built))
+        return distribution.run_steps_on_dataset(
+            step_fn, iterator, iterations=1).run_op
 
+      self.evaluate(distribution.initialize())
       if not context.executing_eagerly():
         with self.test_session() as sess:
-          if is_tpu:
-            sess.run(tpu.initialize_system())
           run_step = sess.make_callable(run_step())
-        self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(variables_lib.global_variables_initializer())
 
       run_step()
 
-      if is_tpu:
-        with self.test_session() as sess:
-          sess.run(tpu.shutdown_system())
+      self.evaluate(distribution.finalize())
 
       def get_expected_variables(optimizer_fn, num_parameter_devices):
         variables_map = {
@@ -189,27 +205,17 @@
               combinations.distributions_and_v1_optimizers(),
               combinations.combine(
                   mode=["graph", "eager"],
-                  is_tpu=[False],
                   # TODO(isaprykin):  Allow False here.  Currently subsequent
                   # towers will re-execute UPDATE_OPS of previous towers.
                   update_ops_in_cross_tower_mode=[True])) +
           combinations.combine(
               distribution=[combinations.tpu_strategy],
-              optimizer_fn=[
-                  combinations.gradient_descent_optimizer_v1_fn,
-                  combinations.gradient_descent_optimizer_v2_fn
-              ],
+              optimizer_fn=combinations.optimizers_v1,
               mode=["graph"],
-              is_tpu=[True],
               update_ops_in_cross_tower_mode=[False])))
   def testTrainNetworkWithBatchNorm(self, distribution, optimizer_fn, momentum,
-                                    renorm, is_tpu,
-                                    update_ops_in_cross_tower_mode):
+                                    renorm, update_ops_in_cross_tower_mode):
     """Verifies that moving mean updates are reduced across towers."""
-    # TODO(priyag): Remove this once the step TPU Strategy is stable.
-    if is_tpu:
-      self.skipTest("TPU tests are WIP.")
-
     with distribution.scope():
       num_towers = len(distribution.worker_devices)
       model_fn, dataset_fn, batchnorm = batchnorm_example(
@@ -224,24 +230,28 @@
       # this test relies on specific input being on each device.
       if isinstance(distribution, mirrored_strategy.MirroredStrategy):
         self.assertFalse(distribution._prefetch_on_device)
-      iterator = distribution.distribute_dataset(
-          dataset_fn).make_one_shot_iterator()
 
-      def run_step():
+      def step_fn(ctx, *inputs):
+        del ctx  # Unused
         fetches = distribution.unwrap(
             distribution.call_for_each_tower(
-                model_fn, iterator.get_next(),
-                run_concurrently=batchnorm.built))
+                model_fn, *inputs, run_concurrently=batchnorm.built))
         if update_ops_in_cross_tower_mode:
           fetches += ops.get_collection(ops.GraphKeys.UPDATE_OPS)
         return control_flow_ops.group(fetches)
 
+      iterator = distribution.distribute_dataset(
+          dataset_fn).make_one_shot_iterator()
+
+      def run_step():
+        return distribution.run_steps_on_dataset(
+            step_fn, iterator, iterations=1).run_op
+
+      self.evaluate(distribution.initialize())
       if not context.executing_eagerly():
         with self.test_session() as sess:
-          if is_tpu:
-            sess.run(tpu.initialize_system())
           run_step = sess.make_callable(run_step())
-        self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(variables_lib.global_variables_initializer())
 
       expected_moving_means = [0.] * 8
 
@@ -263,9 +273,7 @@
               expected_moving_mean - averaged_batch_mean(i)) * (1.0 - momentum))
           self.assertNear(expected_moving_means[i], moving_means[i], 0.0001)
 
-      if is_tpu:
-        with self.test_session() as sess:
-          sess.run(tpu.shutdown_system())
+      self.evaluate(distribution.finalize())
 
   @combinations.generate(
       combinations.times(
@@ -285,22 +293,16 @@
                       combinations.one_device_strategy,
                       combinations.mirrored_strategy_with_gpu_and_cpu,
                       combinations.mirrored_strategy_with_two_gpus
-                  ],
-                  is_tpu=[False]),
+                  ]),
               combinations.combine(
                   mode=["graph"], use_callable_loss=[True, False]) +
               combinations.combine(mode=["eager"], use_callable_loss=[True])) +
           combinations.combine(
               distribution=[combinations.tpu_strategy],
-              is_tpu=[True],
               mode=["graph"],
               use_callable_loss=[True, False])))
   def testMeanVsSum(self, distribution, optimizer_fn, loss_reduction,
-                    use_callable_loss, is_tpu):
-    # TODO(priyag): Remove this once the step TPU Strategy is stable.
-    if is_tpu:
-      self.skipTest("TPU tests are WIP.")
-
+                    use_callable_loss):
     with distribution.scope():
       all_vars = []
 
@@ -326,20 +328,24 @@
         labels = dataset_ops.Dataset.from_tensors([[6.], [21.]])
         return dataset_ops.Dataset.zip((features, labels)).repeat()
 
+      def step_fn(ctx, x, y):
+        del ctx  # Unused
+        return distribution.group(
+            distribution.call_for_each_tower(
+                model_fn, x, y, run_concurrently=False))
+
       iterator = distribution.distribute_dataset(
           dataset_fn).make_one_shot_iterator()
 
       def run_step():
-        return distribution.group(
-            distribution.call_for_each_tower(
-                model_fn, *iterator.get_next(), run_concurrently=False))
+        return distribution.run_steps_on_dataset(
+            step_fn, iterator, iterations=1).run_op
 
+      self.evaluate(distribution.initialize())
       if not context.executing_eagerly():
         with self.test_session() as sess:
-          if is_tpu:
-            sess.run(tpu.initialize_system())
           run_step = sess.make_callable(run_step())
-        self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(variables_lib.global_variables_initializer())
 
       run_step()
 
@@ -369,10 +375,132 @@
         # One of the mean loss reductions.
         self.assertNear(weight, 2 + 10.6, 0.0001)
 
-      if is_tpu:
-        with self.test_session() as sess:
-          sess.run(tpu.shutdown_system())
+      self.evaluate(distribution.finalize())
 
+  @combinations.generate(
+      combinations.times(
+          combinations.distributions_and_v1_optimizers(),
+          combinations.combine(mode=["graph", "eager"]),
+          combinations.combine(is_tpu=[False])) +
+      combinations.combine(
+          distribution=[combinations.tpu_strategy],
+          optimizer_fn=combinations.optimizers_v1,
+          mode=["graph"],
+          is_tpu=[True]))
+  def testRunStepsWithOutputContext(self, distribution, optimizer_fn, is_tpu):
+    with distribution.scope():
+      def dataset_fn():
+        dataset = dataset_ops.Dataset.from_tensors([[1.]]).repeat()
+        # TODO(priyag): batch with drop_remainder=True causes shapes to be
+        # fully defined for TPU. Remove this when XLA supports dynamic shapes.
+        return dataset.batch(batch_size=1, drop_remainder=True)
+
+      optimizer = optimizer_fn()
+      layer = core.Dense(1, use_bias=True)
+
+      key1 = "foo"
+      value1 = "bar"
+
+      def model_fn(output_context, x):
+        """A very simple model written by the user."""
+        def loss_fn():
+          y = array_ops.reshape(layer(x), []) - constant_op.constant(1.)
+          return y * y
+
+        train_op = optimizer.minimize(loss_fn)
+        loss = loss_fn()
+        output_context.set_last_step_output(
+            name="tower_loss_agg",
+            output=loss,
+            aggregation=variables_lib.VariableAggregation.MEAN)
+        output_context.set_non_tensor_output(key1, value1)
+        return (train_op, loss)
+
+      def step_fn(output_context, *inputs):
+        (train_op, loss) = distribution.call_for_each_tower(
+            model_fn, output_context, *inputs, run_concurrently=False)
+        output_context.set_last_step_output(
+            name="cross_tower_loss_agg",
+            output=loss,
+            aggregation=variables_lib.VariableAggregation.MEAN)
+        output_context.set_last_step_output(
+            name="cross_tower_loss_noagg",
+            output=loss)
+        return distribution.group(train_op)
+
+      iterator = distribution.distribute_dataset(
+          dataset_fn).make_one_shot_iterator()
+
+      def run_step():
+        initial_loss = lambda: constant_op.constant(1e7)
+        # Initial values corresponding to aggregated losses are just single
+        # tensors. But for non aggregated losses, we need to have initial
+        # values that are of the same structure as non reduced losses. In
+        # MirroredStrategy, this will be a list of losses, in TPUStrategy
+        # it will be single tensor. Using `broadcast` followed by `unwrap`
+        # gives us the desired initial value structure.
+        initial_loop_values = {
+            "tower_loss_agg": initial_loss(),
+            "cross_tower_loss_agg": initial_loss(),
+            "cross_tower_loss_noagg":
+            distribution.unwrap(distribution.broadcast(initial_loss()))
+        }
+        ctx = distribution.run_steps_on_dataset(
+            step_fn, iterator, iterations=2,
+            initial_loop_values=initial_loop_values)
+
+        self.assertEqual({key1: [value1]}, ctx.non_tensor_outputs)
+        self._verify_loss_output(
+            initial_loss(),
+            loss_output=ctx.last_step_outputs["tower_loss_agg"],
+            aggregated=True, distribution=distribution)
+        self._verify_loss_output(
+            initial_loss(),
+            loss_output=ctx.last_step_outputs["cross_tower_loss_agg"],
+            aggregated=True, distribution=distribution)
+        self._verify_loss_output(
+            initial_loss(),
+            loss_output=ctx.last_step_outputs["cross_tower_loss_noagg"],
+            aggregated=False, distribution=distribution)
+        return (ctx.run_op, ctx.last_step_outputs["tower_loss_agg"])
+
+      self.evaluate(distribution.initialize())
+      if not context.executing_eagerly():
+        with self.test_session() as sess:
+          run_step = sess.make_callable(run_step())
+      self.evaluate(variables_lib.global_variables_initializer())
+
+      weights, biases, losses = [], [], []
+      for _ in range(5):
+        _, loss = run_step()
+        losses.append(loss)
+        weights.append(self.evaluate(layer.kernel))
+        biases.append(self.evaluate(layer.bias))
+
+      self.evaluate(distribution.finalize())
+
+      loss_is_not_increasing = all(y <= x for x, y in zip(losses, losses[1:]))
+      self.assertTrue(loss_is_not_increasing)
+
+      error = abs(
+          numpy.add(numpy.squeeze(weights), numpy.squeeze(biases)) - 1)
+      error_is_not_increasing = all(y <= x for x, y in zip(error, error[1:]))
+      self.assertTrue(error_is_not_increasing)
+
+  def _verify_loss_output(self, initial_loss, loss_output, aggregated,
+                          distribution):
+    if not aggregated:
+      self.assertEqual(distribution.num_towers,
+                       len(distribution.unwrap(loss_output)))
+      loss_output = distribution.reduce(
+          aggregation=variables_lib.VariableAggregation.MEAN,
+          value=loss_output, destinations="/device:CPU:0")
+
+    unwrapped_output = distribution.unwrap(loss_output)
+    self.assertEqual(1, len(unwrapped_output))
+    loss_tensor = unwrapped_output[0]
+    self.assertEqual(initial_loss.dtype, loss_tensor.dtype)
+    self.assertEqual(initial_loss.shape, loss_tensor.shape)
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy.py b/tensorflow/contrib/distribute/python/mirrored_strategy.py
index c5d6e97..6981449 100644
--- a/tensorflow/contrib/distribute/python/mirrored_strategy.py
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy.py
@@ -19,21 +19,28 @@
 from __future__ import print_function
 
 import contextlib
+from functools import partial
 import threading
 
 from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib
 from tensorflow.contrib.distribute.python import shared_variable_creator
 from tensorflow.contrib.distribute.python import values
+from tensorflow.core.protobuf import cluster_pb2
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.eager import context
 from tensorflow.python.eager import tape
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import device as tf_device
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.training import coordinator
 from tensorflow.python.training import device_util
 from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.training import server_lib
+from tensorflow.python.util import nest
 
 
 # TODO(josh11b): Replace asserts in this file with if ...: raise ...
@@ -287,24 +294,112 @@
 
 
 class MirroredStrategy(distribute_lib.DistributionStrategy):
-  """Mirrors vars to distribute across multiple devices on a single machine.
+  """Mirrors vars to distribute across multiple devices and machines.
 
-  This strategy uses one tower per device and sync replication.
+  This strategy uses one tower per device and sync replication for its multi-GPU
+  version.
+
+  When `cluster_spec` is given, it turns into the mulit-worker version that
+  works on multiple workers with in-graph replication.
+
+  There are several important concepts for distributed TensorFlow, e.g.
+  `client`, `job`, 'task', `cluster`, `in-graph replication` and
+  'synchronous training' and they have already been defined in the
+  [TensorFlow's documentation](https://www.tensorflow.org/deploy/distributed).
+  The distribution strategy inherits these concepts as well and in addition to
+  that we also clarify several more concepts:
+    * **In-graph replication**: the `client` creates a single `tf.Graph` that
+    specifies tasks for devices on all workers. The `client` then creates a
+    client session which will talk to the `master` service of a `worker`. Then
+    the `master` will partition the graph and distribute the work to all
+    participating workers.
+    * **Worker**: A `worker` is a TensorFlow `task` that usually maps to one
+    physical machine. We will have multiple `worker`s with different `task`
+    index. They all do similar things except for one worker checkpointing model
+    variables, writing summaries, etc. in addition to its ordinary work.
+
+  The multi-worker version of this class maps one tower to one device on a
+  worker. It mirrors all model variables on all towers. For example, if you have
+  two `worker`s and each `worker` has 4 GPUs, it will create 8 copies of the
+  model variables on these 8 GPUs. Then like in MirroredStrategy, each tower
+  performs their computation with their own copy of variables unless in
+  cross-tower model where variable or tensor reduction happens.
+
+  Args:
+    devices: a list of device strings.
+    num_gpus: number of GPUs. For local training, either specify `devices` or
+      `num_gpus`. In distributed training, this must be specified as number of
+      GPUs on each worker.
+    cluster_spec: if this is set, it turns into the multi-worker version and
+      `devices` must not be set but `num_gpus` must be set.
+    cross_tower_ops: optional, a descedant of `CrossTowerOps`. If this is not
+      set, the `configure` method will try to find the best one.
+    prefetch_on_device: optional boolean to specify whether to prefetch input
+      data to devices.
   """
 
   def __init__(self,
                devices=None,
                num_gpus=None,
+               cluster_spec=None,
                cross_tower_ops=None,
                prefetch_on_device=None):
     super(MirroredStrategy, self).__init__()
-    # Convert `num_gpus` into `devices`, shouldn't specify both.
-    if devices is None:
+
+    if cluster_spec:
+      if devices is not None:
+        raise ValueError("Specifying devices when `cluster_spec` is also given "
+                         "is not supported in MirroredStrategy.")
+
+      # TODO(yuefengz): use the utility method to normalize cluster_spec.
+      if isinstance(cluster_spec, (dict, cluster_pb2.ClusterDef)):
+        cluster_spec = server_lib.ClusterSpec(cluster_spec)
+      elif not isinstance(cluster_spec, server_lib.ClusterSpec):
+        raise ValueError(
+            "`cluster_spec' should be dict or a `tf.train.ClusterSpec` or a "
+            "`tf.train.ClusterDef` object")
+      self._cluster_spec = cluster_spec
+
+      self._workers = []
+      for job in sorted(cluster_spec.jobs):
+        for task in range(cluster_spec.num_tasks(job)):
+          self._workers.append("/job:%s/task:%d" % (job, task))
+
       if num_gpus is None:
-        num_gpus = context.num_gpus()
-      devices = ["/device:GPU:%d" % d for d in range(num_gpus)]
-    elif num_gpus is not None:
-      raise ValueError("Must only specify one of `devices` and `num_gpus`.")
+        raise ValueError("`num_gpus` is required if `cluster_spec` is given.")
+      self._num_gpus = num_gpus
+      if num_gpus > 0:
+        self._worker_device_map = {
+            worker: [
+                device_util.canonicalize(worker + "/device:GPU:%d" % gpu)
+                for gpu in range(num_gpus)
+            ] for worker in self._workers
+        }
+      else:
+        self._worker_device_map = {
+            worker: [device_util.canonicalize(worker, "/device:CPU:0")]
+            for worker in self._workers
+        }
+      devices = nest.flatten(self._worker_device_map)
+
+      # Setting `_default_device` will add a device scope in the
+      # distribution.scope. We set the default device to the first worker. When
+      # users specify device under distribution.scope by
+      #   with tf.device("/cpu:0"):
+      #     ...
+      # their ops will end up on the cpu device of its first worker, e.g.
+      # "/job:worker/task:0/device:CPU:0". Note this is not used in tower mode.
+      self._default_device = self._workers[0]
+    else:
+      self._cluster_spec = None
+      # Convert `num_gpus` into `devices`, shouldn't specify both.
+      if devices is None:
+        if num_gpus is None:
+          num_gpus = context.num_gpus()
+        devices = ["/device:GPU:%d" % d for d in range(num_gpus)]
+      elif num_gpus is not None:
+        raise ValueError("Must only specify one of `devices` and `num_gpus`.")
+      # TODO(yuefengz): consider setting the default device.
 
     assert devices, "Must specify at least one device."
     assert len(set(devices)) == len(devices), (
@@ -313,10 +408,9 @@
     self._devices = [device_util.resolve(d) for d in devices]
     self._canonical_device_set = set(self._devices)
     self._device_index = values.PerDevice(
-        dict((d, i) for i, d in enumerate(devices)))
+        {d: i for i, d in enumerate(devices)})
     self._cross_tower_ops = cross_tower_ops
     self._prefetch_on_device = prefetch_on_device
-    # TODO(yuefengz): consider setting the default device.
 
   def _create_variable(self, next_creator, *args, **kwargs):
     """Create a mirrored variable. See `DistributionStrategy.scope`."""
@@ -353,9 +447,74 @@
                                      **kwargs)
 
   def distribute_dataset(self, dataset_fn):
-    return values.PerDeviceDataset(
-        self._call_dataset_fn(dataset_fn), self._devices,
-        self._prefetch_on_device)
+    if self._cluster_spec:
+      return values.MultiWorkerDataset(
+          partial(self._call_dataset_fn, dataset_fn), self._worker_device_map,
+          self._prefetch_on_device)
+    else:
+      return values.PerDeviceDataset(
+          self._call_dataset_fn(dataset_fn), self._devices,
+          self._prefetch_on_device)
+
+  # TODO(priyag): Deal with OutOfRange errors once b/111349762 is fixed.
+  def _run_steps_on_dataset(self, fn, iterator, iterations,
+                            initial_loop_values=None):
+    if initial_loop_values is None:
+      initial_loop_values = {}
+    initial_loop_values = nest.flatten(initial_loop_values)
+
+    ctx = values.MultiStepContext()
+    def body(i, *args):
+      """A wrapper around `fn` to create the while loop body."""
+      del args
+      fn_inputs = iterator.get_next()
+      if not isinstance(fn_inputs, tuple):
+        fn_inputs = (fn_inputs,)
+      fn_result = fn(ctx, *fn_inputs)
+      for (name, output) in ctx.last_step_outputs.items():
+        # Convert all outputs to tensors, potentially from `DistributedValues`.
+        ctx.last_step_outputs[name] = self.unwrap(output)
+      flat_last_step_outputs = nest.flatten(ctx.last_step_outputs)
+      with ops.control_dependencies([fn_result]):
+        return [i + 1] + flat_last_step_outputs
+
+    # We capture the control_flow_context at this point, before we run `fn`
+    # inside a while_loop. This is useful in cases where we might need to exit
+    # these contexts and get back to the outer context to do some things, for
+    # e.g. create an op which should be evaluated only once at the end of the
+    # loop on the host. One such usage is in creating metrics' value op.
+    self._outer_control_flow_context = (
+        ops.get_default_graph()._get_control_flow_context())  # pylint: disable=protected-access
+
+    cond = lambda i, *args: i < iterations
+    i = constant_op.constant(0)
+    loop_result = control_flow_ops.while_loop(
+        cond, body, [i] + initial_loop_values, name="",
+        parallel_iterations=1, back_prop=False, swap_memory=False,
+        return_same_structure=True)
+    del self._outer_control_flow_context
+
+    ctx.run_op = control_flow_ops.group(loop_result)
+
+    # Convert the last_step_outputs from a list to the original dict structure
+    # of last_step_outputs.
+    last_step_tensor_outputs = loop_result[1:]
+    last_step_tensor_outputs_dict = nest.pack_sequence_as(
+        ctx.last_step_outputs, last_step_tensor_outputs)
+
+    for (name, aggregation) in ctx._last_step_outputs_aggregations.items():  # pylint: disable=protected-access
+      output = last_step_tensor_outputs_dict[name]
+      # For outputs that have already been aggregated, wrap them in a Mirrored
+      # container, else in a PerDevice container.
+      if aggregation is variables_lib.VariableAggregation.NONE:
+        last_step_tensor_outputs_dict[name] = values.regroup(
+            {d: t for d, t in zip(self._devices, output)}, values.PerDevice)
+      else:
+        assert len(output) == 1
+        last_step_tensor_outputs_dict[name] = output[0]
+
+    ctx._set_last_step_outputs(last_step_tensor_outputs_dict)  # pylint: disable=protected-access
+    return ctx
 
   def _broadcast(self, tensor, destinations):
     # TODO(josh11b): In eager mode, use one thread per device, or async mode.
@@ -380,10 +539,19 @@
     # in addition to PerDevice data.
     return values.PerDevice({k: values.MapOutput(v) for k, v in index.items()})
 
-  def configure(self, session_config=None):
+  def configure(self,
+                session_config=None,
+                cluster_spec=None,
+                task_type=None,
+                task_id=None):
+    del cluster_spec, task_type, task_id
     if self._cross_tower_ops is None:
-      self._cross_tower_ops = cross_tower_ops_lib.choose_the_best(
-          self._devices, session_config=session_config)
+      if self._cluster_spec:
+        self._cross_tower_ops = cross_tower_ops_lib.MultiWorkerAllReduce(
+            self._workers, self._num_gpus)
+      else:
+        self._cross_tower_ops = cross_tower_ops_lib.choose_the_best(
+            self._devices, session_config=session_config)
 
   def _get_cross_tower_ops(self):
     if self._cross_tower_ops is None:
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
index e064cfe..9a4cc0a 100644
--- a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
@@ -40,7 +40,7 @@
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.training import device_util
-from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.training import distribution_strategy_context
 
 
 GPU_TEST = "test_gpu" in sys.argv[0]
@@ -164,7 +164,7 @@
       # This variable should be created only once across the threads because of
       # special variable_creator functions used by `dist.call_for_each_tower`.
       v = variable_scope.variable(1.0, name="foo")
-      distribute_lib.get_tower_context().merge_call(lambda _: _)
+      distribution_strategy_context.get_tower_context().merge_call(lambda _: _)
       return v
 
     dist = mirrored_strategy.MirroredStrategy(
@@ -181,7 +181,7 @@
 
     def model_fn():
       v = variable_scope.variable(1.0)
-      distribute_lib.get_tower_context().merge_call(lambda _: _)
+      distribution_strategy_context.get_tower_context().merge_call(lambda _: _)
       return v
 
     dist = mirrored_strategy.MirroredStrategy(
@@ -201,7 +201,7 @@
       vs = []
       for i in range(5):
         vs.append(variable_scope.variable(1.0, name="foo" + str(i)))
-      distribute_lib.get_tower_context().merge_call(lambda _: _)
+      distribution_strategy_context.get_tower_context().merge_call(lambda _: _)
       return vs
 
     dist = mirrored_strategy.MirroredStrategy(
@@ -223,7 +223,7 @@
       vs.append(variable_scope.variable(1.0, name="foo_1/bar"))
       vs.append(variable_scope.variable(1.0, name="foo_1/bar_1"))
       vs.append(variable_scope.variable(1.0, name="foo/bar_1"))
-      distribute_lib.get_tower_context().merge_call(lambda _: _)
+      distribution_strategy_context.get_tower_context().merge_call(lambda _: _)
       return vs
 
     dist = mirrored_strategy.MirroredStrategy(
@@ -245,7 +245,7 @@
 
     def model_fn(device_id):
       v = variable_scope.variable(1.0, name="foo_" + str(device_id))
-      distribute_lib.get_tower_context().merge_call(lambda _: _)
+      distribution_strategy_context.get_tower_context().merge_call(lambda _: _)
       return v
 
     dist = mirrored_strategy.MirroredStrategy(
@@ -268,7 +268,8 @@
         layer2 = core.Dense(1)
         layer2(features)
         # This will pause the current thread, and execute the other thread.
-        distribute_lib.get_tower_context().merge_call(lambda _: _)
+        distribution_strategy_context.get_tower_context().merge_call(
+            lambda _: _)
         layer3 = core.Dense(1)
         layer3(features)
         return [(layer1.kernel, layer1.bias),
@@ -300,7 +301,8 @@
       with variable_scope.variable_scope("common"):
         v1 = variable_scope.variable(1.0, name="var1")
         # This will pause the current thread, and execute the other thread.
-        distribute_lib.get_tower_context().merge_call(lambda _: _)
+        distribution_strategy_context.get_tower_context().merge_call(
+            lambda _: _)
         v2 = variable_scope.variable(
             1.0,
             name="var2",
@@ -343,7 +345,8 @@
       with variable_scope.variable_scope("common"):
         v1 = variable_scope.get_variable("var1", [1])
         # This will pause the current thread, and execute the other thread.
-        distribute_lib.get_tower_context().merge_call(lambda _: _)
+        distribution_strategy_context.get_tower_context().merge_call(
+            lambda _: _)
         v2 = variable_scope.get_variable(
             "var2", [1],
             synchronization=variable_scope.VariableSynchronization.ON_READ,
@@ -453,7 +456,7 @@
 
     def model_fn():
       v = variable_scope.variable(1.0, name="foo")
-      distribute_lib.get_tower_context().merge_call(lambda _: _)
+      distribution_strategy_context.get_tower_context().merge_call(lambda _: _)
       return v
 
     dist = mirrored_strategy.MirroredStrategy(
@@ -470,7 +473,7 @@
 
     def model_fn(name):
       v = variable_scope.variable(1.0, name=name)
-      distribute_lib.get_tower_context().merge_call(lambda _: _)
+      distribution_strategy_context.get_tower_context().merge_call(lambda _: _)
       return v
 
     dist = mirrored_strategy.MirroredStrategy(
@@ -570,7 +573,8 @@
     def model_fn():
       with ops.name_scope("foo"):
         a = constant_op.constant(1.0, name="a")
-        distribute_lib.get_tower_context().merge_call(lambda _: _)
+        distribution_strategy_context.get_tower_context().merge_call(
+            lambda _: _)
         b = constant_op.constant(1.0, name="b")
       return a, b
 
@@ -591,7 +595,8 @@
     def model_fn():
       with ops.name_scope(None, "foo"):
         a = constant_op.constant(1.0, name="a")
-        distribute_lib.get_tower_context().merge_call(lambda _: _)
+        distribution_strategy_context.get_tower_context().merge_call(
+            lambda _: _)
         b = constant_op.constant(2.0, name="b")
       return a, b
 
@@ -619,7 +624,8 @@
     def model_fn():
       b = variable_scope.variable(1.0, name="b")
       with ops.name_scope("foo"):
-        c = distribute_lib.get_tower_context().merge_call(in_cross_tower)
+        c = distribution_strategy_context.get_tower_context().merge_call(
+            in_cross_tower)
       return b, c
 
     dist = mirrored_strategy.MirroredStrategy(
@@ -651,7 +657,8 @@
     def model_fn():
       b = variable_scope.get_variable("b", [1])
       with ops.name_scope("foo"):
-        c = distribute_lib.get_tower_context().merge_call(in_cross_tower)
+        c = distribution_strategy_context.get_tower_context().merge_call(
+            in_cross_tower)
       return b, c
 
     dist = mirrored_strategy.MirroredStrategy(
@@ -833,8 +840,9 @@
       self.assertEquals(1.0, self.evaluate(mirrored_var))
 
       def model_fn():
-        value = math_ops.cast(distribute_lib.get_tower_context().tower_id,
-                              mirrored_var.dtype)
+        value = math_ops.cast(
+            distribution_strategy_context.get_tower_context().tower_id,
+            mirrored_var.dtype)
         return mirrored_var.assign(value)
 
       self.evaluate(dist.unwrap(dist.call_for_each_tower(
@@ -898,8 +906,9 @@
       self.assertEquals(1.0, self.evaluate(mirrored_var))
 
       def model_fn():
-        value = math_ops.cast(distribute_lib.get_tower_context().tower_id,
-                              mirrored_var.dtype)
+        value = math_ops.cast(
+            distribution_strategy_context.get_tower_context().tower_id,
+            mirrored_var.dtype)
         return mirrored_var.assign_add(value)
 
       self.evaluate(dist.unwrap(dist.call_for_each_tower(
@@ -963,8 +972,9 @@
       self.assertEquals(5.0, self.evaluate(mirrored_var))
 
       def model_fn():
-        value = math_ops.cast(distribute_lib.get_tower_context().tower_id,
-                              mirrored_var.dtype)
+        value = math_ops.cast(
+            distribution_strategy_context.get_tower_context().tower_id,
+            mirrored_var.dtype)
         return mirrored_var.assign_sub(value)
 
       self.evaluate(dist.unwrap(dist.call_for_each_tower(
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy_test.py b/tensorflow/contrib/distribute/python/mirrored_strategy_test.py
index a066adf..55d59ad 100644
--- a/tensorflow/contrib/distribute/python/mirrored_strategy_test.py
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy_test.py
@@ -19,12 +19,16 @@
 from __future__ import print_function
 
 from tensorflow.contrib.distribute.python import mirrored_strategy
+from tensorflow.contrib.distribute.python import multi_worker_test_base
 from tensorflow.contrib.distribute.python import strategy_test_lib
 from tensorflow.python.eager import context
 from tensorflow.python.eager import test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import variable_scope
-from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.training import distribution_strategy_context
+from tensorflow.python.training import server_lib
 
 
 class MirroredOneCPUDistributionTest(strategy_test_lib.DistributionTestBase):
@@ -68,7 +72,8 @@
         v = variable_scope.variable(1.0)
 
         # This will pause the current thread, and execute the other thread.
-        distribute_lib.get_tower_context().merge_call(lambda _: _)
+        distribution_strategy_context.get_tower_context().merge_call(
+            lambda _: _)
       return v
 
     def main_thread_creator(next_creator, *args, **kwargs):
@@ -85,5 +90,33 @@
       self.assertEquals(expected, result)
 
 
+class MultiWorkerMirroredStrategyTest(
+    multi_worker_test_base.MultiWorkerTestBase,
+    strategy_test_lib.DistributionTestBase):
+
+  def _get_distribution_strategy(self):
+    return mirrored_strategy.MirroredStrategy(
+        cluster_spec=server_lib.ClusterSpec({
+            'worker': ['/job:worker/task:0', '/job:worker/task:1']
+        }),
+        num_gpus=context.num_gpus())
+
+  def testMinimizeLossGraph(self):
+    self._test_minimize_loss_graph(self._get_distribution_strategy())
+
+  def testDeviceScope(self):
+    """Test the device scope of multi-worker MirroredStrategy."""
+    with context.graph_mode():
+      strategy = mirrored_strategy.MirroredStrategy(
+          cluster_spec={'worker': ['/job:worker/task:0', '/job:worker/task:1']},
+          num_gpus=context.num_gpus())
+      with strategy.scope():
+        a = constant_op.constant(1.)
+        with ops.device('/cpu:0'):
+          b = constant_op.constant(1.)
+        self.assertEqual(a.device, '/job:worker/task:0')
+        self.assertEqual(b.device, '/job:worker/task:0/device:CPU:0')
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distribute/python/multi_worker_strategy.py b/tensorflow/contrib/distribute/python/multi_worker_strategy.py
deleted file mode 100644
index cbfe5df..0000000
--- a/tensorflow/contrib/distribute/python/multi_worker_strategy.py
+++ /dev/null
@@ -1,141 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Classes implementing a mirrored DistributionStrategy for multiple workers."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from functools import partial
-
-from tensorflow.contrib.distribute.python import values
-from tensorflow.contrib.distribute.python.mirrored_strategy import MirroredStrategy
-from tensorflow.core.protobuf import cluster_pb2
-from tensorflow.python.training import device_util
-from tensorflow.python.training import server_lib
-from tensorflow.python.util import nest
-
-
-# TODO(yuefengz): support between-graph replication.
-# TODO(yuefengz): merge this class into its base class.
-# TODO(yuefengz): in some cases, we probably want to use configure method to
-# configure this class.
-# TODO(yuefengz): MirroredStrategy.worker_devices may be confusing after the
-# class is introduced.
-class MultiWorkerMirroredStrategy(MirroredStrategy):
-  """Mirrored strategy that works on multiple workers with in-graph replication.
-
-  There are several important concepts for distributed TensorFlow, e.g.
-  `client`, `job`, 'task', `cluster`, `in-graph replication` and
-  'synchronous training' and they have already been defined in the
-  [TensorFlow's documentation](https://www.tensorflow.org/deploy/distributed).
-  The distribution strategy inherits these concepts as well and in addition to
-  that we also clarify several more concepts:
-    * **In-graph replication**: the `client` creates a single `tf.Graph` that
-    specifies tasks for devices on all workers. The `client` then creates a
-    client session which will talk to the `master` service of a `worker`. Then
-    the `master` will partition the graph and distribute the work to all
-    participating workers.
-    * **Worker**: A `worker` is a TensorFlow `task` that usually maps to one
-    physical machine. We will have multiple `worker`s with different `task`
-    index. They all do similar things except for one worker checkpointing model
-    variables, writing summaries, etc. in addition to its ordinary work.
-
-  This class maps one tower to one device on a worker. It mirrors all model
-  variables on all towers. For example, if you have two `worker`s and each
-  `worker` has 4 GPUs, it will create 8 copies of the model variables on these 8
-  GPUs. Then like in MirroredStrategy, each tower performs their computation
-  with their own copy of variables unless in cross-tower model where variable or
-  tensor reduction happens.
-  """
-
-  def __init__(self,
-               num_gpus_per_worker=1,
-               worker_job_name=None,
-               num_workers=None,
-               cluster=None,
-               cross_tower_ops=None,
-               prefetch_on_device=None):
-    """Initialize the strategy object.
-
-    Args:
-      num_gpus_per_worker: number of GPUs per work. If it is zero, the local
-        CPU will be used.
-      worker_job_name: the job name for `worker`, typically just 'worker'.
-      num_workers: the number of workers. If it is 0, it regenerates to
-        single-worker MirroredStrategy.
-      cluster: a `tf.train.ClusterSpec` object or a dict that can be used to
-        construct a `tf.train.ClusterSpec` object or a `tf.train.ClusterDef`
-        proto buffer. It is an alternative way to initialize this object.
-      cross_tower_ops: the cross tower ops to use. If None, a default one will
-        be used. If configure method is called, a best one for the configuration
-        will be chosen.
-      prefetch_on_device: a boolean to specify whether to prefetech input to
-        each worker's devices.
-
-    Raises:
-      ValueError: if got an unexpected `cluster`.
-    """
-    if cluster is None:
-      self._workers = [
-          '/job:%s/task:%d' % (worker_job_name, task_index)
-          for task_index in range(num_workers)
-      ]
-    else:
-      if isinstance(cluster, (dict, cluster_pb2.ClusterDef)):
-        cluster_spec = server_lib.ClusterSpec(cluster)
-      elif isinstance(cluster, server_lib.ClusterSpec):
-        cluster_spec = cluster
-      else:
-        raise ValueError(
-            "`cluster_spec' should be dict or a `tf.train.ClusterSpec` or a "
-            '`tf.train.ClusterDef` object')
-
-      self._workers = []
-      for job in sorted(cluster_spec.jobs):
-        for task in range(cluster_spec.num_tasks(job)):
-          self._workers.append('/job:%s/task:%d' % (job, task))
-
-    self._num_gpus_per_worker = num_gpus_per_worker
-    if num_gpus_per_worker > 0:
-      self._worker_device_map = {
-          worker: [
-              device_util.canonicalize(worker + '/device:GPU:%d' % gpu)
-              for gpu in range(num_gpus_per_worker)
-          ] for worker in self._workers
-      }
-    else:
-      self._worker_device_map = {
-          worker: [device_util.canonicalize(worker, '/device:CPU:0')]
-          for worker in self._workers
-      }
-    self._devices = nest.flatten(self._worker_device_map)
-
-    super(MultiWorkerMirroredStrategy, self).__init__(
-        devices=self._devices, prefetch_on_device=prefetch_on_device)
-
-    # Setting `_default_device` will add a device scope in the
-    # distribution.scope. We set the default device to the first worker. When
-    # users specify device under distribution.scope by
-    #   with tf.device("/cpu:0"):
-    #     ...
-    # their ops will end up on the cpu device of its first worker, e.g.
-    # "/job:worker/task:0/device:CPU:0". Note this is not used in tower mode.
-    self._default_device = self._workers[0]
-
-  def distribute_dataset(self, dataset_fn):
-    return values.MultiWorkerDataset(
-        partial(self._call_dataset_fn, dataset_fn), self._worker_device_map,
-        self._prefetch_on_device)
diff --git a/tensorflow/contrib/distribute/python/multi_worker_strategy_test.py b/tensorflow/contrib/distribute/python/multi_worker_strategy_test.py
deleted file mode 100644
index 09c859b..0000000
--- a/tensorflow/contrib/distribute/python/multi_worker_strategy_test.py
+++ /dev/null
@@ -1,62 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for MultiWorkerMirroredStrategy."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.distribute.python import multi_worker_strategy
-from tensorflow.contrib.distribute.python import multi_worker_test_base
-from tensorflow.contrib.distribute.python import strategy_test_lib
-from tensorflow.python.eager import context
-from tensorflow.python.eager import test
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import ops
-from tensorflow.python.training import server_lib
-
-
-class MultiWorkerStrategyTest(multi_worker_test_base.MultiWorkerTestBase,
-                              strategy_test_lib.DistributionTestBase):
-
-  def _get_distribution_strategy(self):
-    return multi_worker_strategy.MultiWorkerMirroredStrategy(
-        cluster=server_lib.ClusterSpec({
-            'worker': ['/job:worker/task:0', '/job:worker/task:1']
-        }),
-        num_gpus_per_worker=context.num_gpus())
-
-  def testMinimizeLossGraph(self):
-    self._test_minimize_loss_graph(self._get_distribution_strategy())
-
-
-class DeviceScopeTest(test.TestCase):
-  """Test the device scope of MultiWorkerMirroredStrategy."""
-
-  def testDeviceScope(self):
-    with context.graph_mode():
-      strategy = multi_worker_strategy.MultiWorkerMirroredStrategy(
-          cluster={'worker': ['/job:worker/task:0', '/job:worker/task:1']},
-          num_gpus_per_worker=context.num_gpus())
-      with strategy.scope():
-        a = constant_op.constant(1.)
-        with ops.device('/cpu:0'):
-          b = constant_op.constant(1.)
-        self.assertEqual(a.device, '/job:worker/task:0')
-        self.assertEqual(b.device, '/job:worker/task:0/device:CPU:0')
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/distribute/python/one_device_strategy.py b/tensorflow/contrib/distribute/python/one_device_strategy.py
index a7f2e2e..68561b5 100644
--- a/tensorflow/contrib/distribute/python/one_device_strategy.py
+++ b/tensorflow/contrib/distribute/python/one_device_strategy.py
@@ -21,11 +21,14 @@
 import six
 
 from tensorflow.contrib.distribute.python import values
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.util import nest
 
 
 # TODO(josh11b): Replace asserts in this file with if ...: raise ...
@@ -66,6 +69,53 @@
   def _broadcast(self, tensor, destinations):
     return tensor
 
+  # TODO(priyag): Deal with OutOfRange errors  once b/111349762 is fixed.
+  def _run_steps_on_dataset(self, fn, iterator, iterations,
+                            initial_loop_values=None):
+    if initial_loop_values is None:
+      initial_loop_values = {}
+    initial_loop_values = nest.flatten(initial_loop_values)
+
+    ctx = values.MultiStepContext()
+    def body(i, *args):
+      """A wrapper around `fn` to create the while loop body."""
+      del args
+      fn_inputs = iterator.get_next()
+      if not isinstance(fn_inputs, tuple):
+        fn_inputs = (fn_inputs,)
+      fn_result = fn(ctx, *fn_inputs)
+      flat_last_step_outputs = nest.flatten(ctx.last_step_outputs)
+      with ops.control_dependencies([fn_result]):
+        return [i + 1] + flat_last_step_outputs
+
+    # We capture the control_flow_context at this point, before we run `fn`
+    # inside a while_loop. This is useful in cases where we might need to exit
+    # these contexts and get back to the outer context to do some things, for
+    # e.g. create an op which should be evaluated only once at the end of the
+    # loop on the host. One such usage is in creating metrics' value op.
+    self._outer_control_flow_context = (
+        ops.get_default_graph()._get_control_flow_context())  # pylint: disable=protected-access
+
+    # TODO(priyag): Use max_iterations instead of an explicit counter.
+    cond = lambda i, *args: i < iterations
+    i = constant_op.constant(0)
+    loop_result = control_flow_ops.while_loop(
+        cond, body, [i] + initial_loop_values, name="",
+        parallel_iterations=1, back_prop=False, swap_memory=False,
+        return_same_structure=True)
+    del self._outer_control_flow_context
+
+    ctx.run_op = control_flow_ops.group(loop_result)
+
+    # Convert the last_step_outputs from a list to the original dict structure
+    # of last_step_outputs.
+    last_step_tensor_outputs = loop_result[1:]
+    last_step_tensor_outputs_dict = nest.pack_sequence_as(
+        ctx.last_step_outputs, last_step_tensor_outputs)
+
+    ctx._set_last_step_outputs(last_step_tensor_outputs_dict)  # pylint: disable=protected-access
+    return ctx
+
   def _call_for_each_tower(self, fn, *args, **kwargs):
     # We don't run `fn` in multiple threads in OneDeviceStrategy.
     kwargs.pop("run_concurrently", None)
diff --git a/tensorflow/contrib/distribute/python/parameter_server_strategy.py b/tensorflow/contrib/distribute/python/parameter_server_strategy.py
index f2c7fd5..8041eb0 100644
--- a/tensorflow/contrib/distribute/python/parameter_server_strategy.py
+++ b/tensorflow/contrib/distribute/python/parameter_server_strategy.py
@@ -18,13 +18,10 @@
 from __future__ import division
 from __future__ import print_function
 
-import json
-import os
-
 from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib
 from tensorflow.contrib.distribute.python import mirrored_strategy
 from tensorflow.contrib.distribute.python import values
-from tensorflow.core.protobuf import cluster_pb2
+from tensorflow.python.distribute import multi_worker_util
 from tensorflow.python.framework import device as tf_device
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
@@ -32,24 +29,12 @@
 from tensorflow.python.training import device_setter
 from tensorflow.python.training import device_util
 from tensorflow.python.training import distribute as distribute_lib
-from tensorflow.python.training import server_lib
 from tensorflow.python.util import nest
 
 _LOCAL_CPU = "/device:CPU:0"
 _LOCAL_GPU_0 = "/device:GPU:0"
 
 
-def _normalize_cluster_spec(cluster_spec):
-  """Makes `cluster_spec` into a `ClusterSpec` object."""
-  if isinstance(cluster_spec, (dict, cluster_pb2.ClusterDef)):
-    return server_lib.ClusterSpec(cluster_spec)
-  elif not isinstance(cluster_spec, server_lib.ClusterSpec):
-    raise ValueError(
-        "`cluster_spec' should be dict or a `tf.train.ClusterSpec` or a "
-        "`tf.train.ClusterDef` object")
-  return cluster_spec
-
-
 # TODO(yuefengz): maybe cache variables on local CPU.
 # TODO(yuefengz): we may want to set session options to disallow communication
 # between workers.
@@ -77,16 +62,16 @@
   GPUs) even if there is only CPU or one GPU. When defining the `fn`, extra
   caution needs to be taken:
 
-  1) Always use @{tf.get_variable} instead of @{tf.Variable} which is not able
+  1) Always use `tf.get_variable` instead of `tf.Variable` which is not able
   to refer to the same variable on different towers.
 
   2) It is generally not recommended to open a device scope under the strategy's
-  scope. A device scope (i.e. calling @{tf.device}) will be merged with or
+  scope. A device scope (i.e. calling `tf.device`) will be merged with or
   override the device for operations but will not change the device for
   variables.
 
   3) It is also not recommended to open a colocation scope (i.e. calling
-  @{tf.colocate_with}) under the strategy's scope. For colocating variables,
+  `tf.colocate_with`) under the strategy's scope. For colocating variables,
   use `distribution.colocate_vars_with` instead. Colocation of ops will possibly
   create conflicts of device assignement.
   """
@@ -108,7 +93,7 @@
     super(ParameterServerStrategy, self).__init__()
     self._num_gpus_per_worker = num_gpus_per_worker
     if cluster_spec:
-      cluster_spec = _normalize_cluster_spec(cluster_spec)
+      cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
     self._cluster_spec = cluster_spec
 
     # We typically don't need to do all-reduce in this strategy.
@@ -216,6 +201,9 @@
     else:
       self._default_device = self._worker_device
 
+    self._is_chief = cluster_spec is None or multi_worker_util.is_chief(
+        cluster_spec, task_type, task_id)
+
   def distribute_dataset(self, dataset_fn):
     """Distributes the dataset to each local GPU."""
     return values.PerDeviceDataset(
@@ -319,26 +307,31 @@
     # No need to distinguish between normal variables and tower-local variables.
     return array_ops.identity(var)
 
-  def configure(self, session_config=None):
+  def configure(self,
+                session_config=None,
+                cluster_spec=None,
+                task_type=None,
+                task_id=None):
+    """Configures the strategy class.
+
+    The strategy object will be re-initialized if `cluster_spec` is given but
+    was not passed in the constructor.
+
+    Args:
+      session_config: not used currently.
+      cluster_spec: a dict, ClusterDef or ClusterSpec object specifying the
+        cluster configurations.
+      task_type: the current task type.
+      task_id: the current task id.
+    """
     del session_config
 
-    # Use TF_CONFIG to get the cluster spec and the current job.
-    tf_config = json.loads(os.environ.get("TF_CONFIG", "{}"))
-    cluster_spec = _normalize_cluster_spec(tf_config.get("cluster", {}))
-
-    task_env = tf_config.get("task", {})
-    if task_env:
-      task_type = task_env.get("type", "worker")
-      task_id = int(task_env.get("index", "0"))
-    else:
-      task_type = "worker"
-      task_id = None
-
     # Set the devices if cluster_spec is defined in TF_CONFIG but not passed in
     # the constructor.
     if not self._cluster_spec and cluster_spec:
-      self._cluster_spec = cluster_spec
-      self._initialize_devices(self._num_gpus_per_worker, cluster_spec,
+      self._cluster_spec = multi_worker_util.normalize_cluster_spec(
+          cluster_spec)
+      self._initialize_devices(self._num_gpus_per_worker, self._cluster_spec,
                                task_type, task_id)
 
   @property
@@ -356,3 +349,19 @@
 
   def non_slot_devices(self, var_list):
     return min(var_list, key=lambda x: x.name)
+
+  @property
+  def between_graph(self):
+    return True
+
+  @property
+  def should_init(self):
+    return self._is_chief
+
+  @property
+  def should_checkpoint(self):
+    return self._is_chief
+
+  @property
+  def should_save_summary(self):
+    return self._is_chief
diff --git a/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py b/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py
index cf29c0e..0df6571 100644
--- a/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py
+++ b/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py
@@ -18,7 +18,6 @@
 from __future__ import division
 from __future__ import print_function
 
-import json
 import threading
 from absl.testing import parameterized
 
@@ -37,7 +36,7 @@
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.training import device_util
-from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.training import distribution_strategy_context
 
 
 class ParameterServerStrategyTest(multi_worker_test_base.MultiWorkerTestBase,
@@ -69,19 +68,8 @@
     if not task_type:
       return distribution, ''
 
-    tf_config = {
-        'cluster': self._cluster_spec,
-        'task': {
-            'type': task_type,
-            'index': task_id
-        }
-    }
-    with self._lock:
-      # Accessing environment variables should be protected by locks because
-      # environment variables are shared by all threads.
-      with test.mock.patch.dict('os.environ',
-                                {'TF_CONFIG': json.dumps(tf_config)}):
-        distribution.configure()
+    distribution.configure(
+        cluster_spec=self._cluster_spec, task_type=task_type, task_id=task_id)
     return distribution, self._workers[task_id].target
 
   def _test_device_assignment_distributed(self, task_type, task_id, num_gpus):
@@ -101,7 +89,8 @@
           last_part_device = 'device:CPU:0'
         else:
           last_part_device = (
-              'device:GPU:%d' % distribute_lib.get_tower_context().tower_id)
+              'device:GPU:%d' %
+              distribution_strategy_context.get_tower_context().tower_id)
 
         a = constant_op.constant(1.0)
         b = constant_op.constant(2.0)
@@ -192,14 +181,16 @@
           tower_compute_device = '/device:CPU:0'
         else:
           tower_compute_device = (
-              '/device:GPU:%d' % distribute_lib.get_tower_context().tower_id)
+              '/device:GPU:%d' %
+              distribution_strategy_context.get_tower_context().tower_id)
         tower_compute_device = device_util.canonicalize(tower_compute_device)
 
         if 'CPU' in variable_device:
           tower_variable_device = '/device:CPU:0'
         else:
           tower_variable_device = (
-              '/device:GPU:%d' % distribute_lib.get_tower_context().tower_id)
+              '/device:GPU:%d' %
+              distribution_strategy_context.get_tower_context().tower_id)
         tower_variable_device = device_util.canonicalize(tower_variable_device)
 
         a = constant_op.constant(1.0)
diff --git a/tensorflow/contrib/distribute/python/prefetching_ops_v2.py b/tensorflow/contrib/distribute/python/prefetching_ops_v2.py
index 24cdc62..1ff60c0 100644
--- a/tensorflow/contrib/distribute/python/prefetching_ops_v2.py
+++ b/tensorflow/contrib/distribute/python/prefetching_ops_v2.py
@@ -35,7 +35,7 @@
 
 # pylint: disable=protected-access
 class _PrefetchToDeviceIterator(object):
-  """A replacement for @{tf.data.Iterator} that prefetches to another device.
+  """A replacement for `tf.data.Iterator` that prefetches to another device.
 
   Args:
     input_dataset: The input dataset.
@@ -108,7 +108,7 @@
             self._input_dataset)
 
   def get_next(self, name=None):
-    """See @{tf.data.Iterator.get_next}."""
+    """See `tf.data.Iterator.get_next`."""
     self._get_next_call_count += 1
     if self._get_next_call_count > iterator_ops.GET_NEXT_CALL_WARNING_THRESHOLD:
       warnings.warn(iterator_ops.GET_NEXT_CALL_WARNING_MESSAGE)
@@ -209,7 +209,7 @@
 def prefetch_to_devices(devices, buffer_size=None):
   """A transformation that prefetches dataset values to the given `devices`.
 
-  NOTE: Although the transformation creates a @{tf.data.Dataset}, the
+  NOTE: Although the transformation creates a `tf.data.Dataset`, the
   transformation must be the final `Dataset` in the input pipeline.
 
   Args:
@@ -220,7 +220,7 @@
 
   Returns:
     A `Dataset` transformation function, which can be passed to
-    @{tf.data.Dataset.apply}.
+    `tf.data.Dataset.apply`.
   """
   def _apply_fn(dataset):
     return _PrefetchToDeviceDataset(dataset, devices, buffer_size)
diff --git a/tensorflow/contrib/distribute/python/single_loss_example.py b/tensorflow/contrib/distribute/python/single_loss_example.py
index d1fdb32..5aa19cf 100644
--- a/tensorflow/contrib/distribute/python/single_loss_example.py
+++ b/tensorflow/contrib/distribute/python/single_loss_example.py
@@ -29,7 +29,8 @@
 from tensorflow.python.ops import math_ops
 
 
-def single_loss_example(optimizer_fn, distribution, use_bias=False):
+def single_loss_example(optimizer_fn, distribution, use_bias=False,
+                        iterations_per_step=1):
   """Build a very simple network to use in tests and examples."""
 
   def dataset_fn():
@@ -38,12 +39,13 @@
   optimizer = optimizer_fn()
   layer = core.Dense(1, use_bias=use_bias)
 
-  def loss_fn(x):
+  def loss_fn(ctx, x):
+    del ctx
     y = array_ops.reshape(layer(x), []) - constant_op.constant(1.)
     return y * y
 
-  single_loss_step = step_fn.StandardSingleLossStep(dataset_fn, loss_fn,
-                                                    optimizer, distribution)
+  single_loss_step = step_fn.StandardSingleLossStep(
+      dataset_fn, loss_fn, optimizer, distribution, iterations_per_step)
 
   # Layer is returned for inspecting the kernels in tests.
   return single_loss_step, layer
diff --git a/tensorflow/contrib/distribute/python/step_fn.py b/tensorflow/contrib/distribute/python/step_fn.py
index d191062..1b5a4f6 100644
--- a/tensorflow/contrib/distribute/python/step_fn.py
+++ b/tensorflow/contrib/distribute/python/step_fn.py
@@ -34,15 +34,9 @@
 
   def __call__(self):
     """Perform one step of this training algorithm."""
-    return self.step(self.inputs())
-
-  def inputs(self):
-    """For the generating the input to be passed to `step()`."""
     raise NotImplementedError("must be implemented in descendants")
 
-  def step(self, inputs):
-    """Perform the main computation of this training algorithm."""
-    raise NotImplementedError("must be implemented in descendants")
+  # TODO(priyag): Add an method to access initialization and finalize ops.
 
 
 class StandardInputStep(Step):
@@ -54,12 +48,9 @@
   """
 
   def __init__(self, dataset_fn, distribution):
-    Step.__init__(self, distribution)
-    self._distributed_input = distribution.distribute_dataset(
-        dataset_fn).make_one_shot_iterator()
-
-  def inputs(self):
-    return self._distributed_input.get_next()
+    super(StandardInputStep, self).__init__(distribution)
+    self._distributed_input = distribution.distribute_dataset(dataset_fn)
+    self._iterator = self._distributed_input.make_one_shot_iterator()
 
 
 class StandardSingleLossStep(StandardInputStep):
@@ -69,8 +60,8 @@
 
   ```python
   ...
-  step = step_fn.StandardSingleLossStep(dataset, loss_fn, optimizer)
-  step.initialize(distribution)
+  step = step_fn.StandardSingleLossStep(
+      dataset, loss_fn, optimizer, distribution)
 
   # Run a single training step on a given DistributionStrategy:
   step(distribution)
@@ -80,27 +71,43 @@
   Args:
     dataset_fn: a function that returns a tf.data Dataset that produces the
       input for the model.
-    loss_fn: a function that returns loss.
+    loss_fn: a function that takes a context and inputs as arguments. It returns
+      the loss for those inputs. `context` is an instance of
+      `values.MultiStepContext` that will be passed when `loss_fn` is run.
+      `context` can be used to specify the outputs to be returned from
+      `loss_fn`, among other things.
     optimizer: an optimizer that implements an update rule.
     distribution: a `DistributionStrategy` object.
   """
 
-  def __init__(self, dataset_fn, loss_fn, optimizer, distribution):
-    StandardInputStep.__init__(self, dataset_fn, distribution)
+  def __init__(self, dataset_fn, loss_fn, optimizer, distribution,
+               iterations_per_step=1):
+    super(StandardSingleLossStep, self).__init__(dataset_fn, distribution)
     self._loss_fn = loss_fn
     self._optimizer = optimizer
     self._is_run_concurrently = False
+    self._iterations_per_step = iterations_per_step
 
-  def step(self, inputs):
+  def __call__(self):
     with self._distribution.scope():
-      gradients_fn = backprop.implicit_grad(self._loss_fn)
-      gradients_fn = optimizer_lib.get_filtered_grad_fn(gradients_fn)
+      def step_fn(ctx, *inputs):
+        """Function to run one iteration with one input."""
+        gradients_fn = backprop.implicit_grad(self._loss_fn)
+        gradients_fn = optimizer_lib.get_filtered_grad_fn(gradients_fn)
 
-      grads_and_vars = self.distribution.call_for_each_tower(
-          gradients_fn, inputs, run_concurrently=self._is_run_concurrently)
-      # If threads use layers, then we need to run the first step sequentially,
-      # so that layers.build() is not executed in parallel.  Otherwise, multiple
-      # sets of mirrored variables are going to be created.
-      self._is_run_concurrently = True
-      return self._optimizer._distributed_apply(  # pylint: disable=protected-access
-          self.distribution, grads_and_vars)
+        grads_and_vars = self.distribution.call_for_each_tower(
+            gradients_fn,
+            ctx, *inputs,
+            run_concurrently=self._is_run_concurrently)
+        # If threads use layers, then we need to run the first step
+        # sequentially, so that layers.build() is not executed in parallel.
+        # Otherwise, multiple sets of mirrored variables are going to be
+        # created.
+        self._is_run_concurrently = True
+        return self._optimizer._distributed_apply(  # pylint: disable=protected-access
+            self.distribution, grads_and_vars)
+
+      # TODO(priyag): Return the outputs, context, etc as well.
+      ctx = self.distribution.run_steps_on_dataset(
+          step_fn, self._iterator, self._iterations_per_step)
+      return ctx.run_op
diff --git a/tensorflow/contrib/distribute/python/step_fn_test.py b/tensorflow/contrib/distribute/python/step_fn_test.py
index 2ee94d8..8605ab1 100644
--- a/tensorflow/contrib/distribute/python/step_fn_test.py
+++ b/tensorflow/contrib/distribute/python/step_fn_test.py
@@ -33,12 +33,19 @@
   @combinations.generate(
       combinations.times(
           combinations.distributions_and_v1_optimizers(),
-          combinations.combine(mode=combinations.graph_and_eager_modes)))
-  def testTrainNetwork(self, distribution, optimizer_fn):
+          combinations.combine(mode=combinations.graph_and_eager_modes),
+          combinations.combine(is_tpu=[False])) +
+      combinations.combine(
+          distribution=[combinations.tpu_strategy],
+          optimizer_fn=combinations.optimizers_v1,
+          mode=["graph"],
+          is_tpu=[True]))
+  def testTrainNetwork(self, distribution, optimizer_fn, is_tpu):
     with distribution.scope():
       single_loss_step, layer = single_loss_example(
-          optimizer_fn, distribution, use_bias=True)
+          optimizer_fn, distribution, use_bias=True, iterations_per_step=2)
 
+      self.evaluate(distribution.initialize())
       if context.executing_eagerly():
         run_step = single_loss_step
       else:
@@ -47,12 +54,14 @@
       self.evaluate(variables.global_variables_initializer())
 
       weights, biases = [], []
-      for _ in range(10):
+      for _ in range(5):
         run_step()
 
         weights.append(self.evaluate(layer.kernel))
         biases.append(self.evaluate(layer.bias))
 
+      self.evaluate(distribution.finalize())
+
       error = abs(numpy.add(numpy.squeeze(weights), numpy.squeeze(biases)) - 1)
       is_not_increasing = all(y <= x for x, y in zip(error, error[1:]))
       self.assertTrue(is_not_increasing)
diff --git a/tensorflow/contrib/distribute/python/strategy_test_lib.py b/tensorflow/contrib/distribute/python/strategy_test_lib.py
index baed0eb..371b97b 100644
--- a/tensorflow/contrib/distribute/python/strategy_test_lib.py
+++ b/tensorflow/contrib/distribute/python/strategy_test_lib.py
@@ -28,7 +28,7 @@
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
-from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.training import optimizer
 
 
@@ -45,7 +45,8 @@
 # Must be the argument to a distribution.call_for_each_tower() call, calls a
 # get_tower_context().merge_call() that raises an exception.
 def _merge_raises_fn():
-  distribute_lib.get_tower_context().merge_call(_raise_exception_fn)
+  distribution_strategy_context.get_tower_context().merge_call(
+      _raise_exception_fn)
 
 
 # Must be the argument to a get_tower_context().merge_call() call, calls
@@ -58,7 +59,7 @@
 # calls a get_tower_context().merge_call() that calls a
 # call_for_each_tower() that raises an exception.
 def _merge_call_raises_fn():
-  distribute_lib.get_tower_context().merge_call(_call_raises_fn)
+  distribution_strategy_context.get_tower_context().merge_call(_call_raises_fn)
 
 
 # Must be the argument to a get_tower_context().merge_call() call, calls
@@ -72,7 +73,8 @@
 # get_tower_context().merge_call() that calls a call_for_each_tower() that
 # calls a get_tower_context().merge_call() that raises an exception.
 def _merge_call_merge_raises_fn():
-  distribute_lib.get_tower_context().merge_call(_call_merge_raises_fn)
+  distribution_strategy_context.get_tower_context().merge_call(
+      _call_merge_raises_fn)
 
 
 class DistributionTestBase(test.TestCase):
@@ -208,7 +210,7 @@
       expected_devices = [False] * len(d.worker_devices)
 
       def mark_devices_fn():
-        tower_id = distribute_lib.get_tower_context().tower_id
+        tower_id = distribution_strategy_context.get_tower_context().tower_id
         self.assertLess(tower_id, len(d.worker_devices))
         self.assertFalse(expected_devices[tower_id])
         expected_devices[tower_id] = True
diff --git a/tensorflow/contrib/distribute/python/tpu_strategy.py b/tensorflow/contrib/distribute/python/tpu_strategy.py
index f5497e0..77fc56d 100644
--- a/tensorflow/contrib/distribute/python/tpu_strategy.py
+++ b/tensorflow/contrib/distribute/python/tpu_strategy.py
@@ -26,34 +26,69 @@
 from tensorflow.contrib.distribute.python import values
 from tensorflow.contrib.tpu.python.ops import tpu_ops
 from tensorflow.contrib.tpu.python.tpu import tpu
+from tensorflow.contrib.tpu.python.tpu import tpu_system_metadata as tpu_system_metadata_lib
 from tensorflow.contrib.tpu.python.tpu import training_loop
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variable_scope as vs
+from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.training import device_util
 from tensorflow.python.util import nest
 
 
+def get_tpu_system_metadata(tpu_cluster_resolver):
+  """Retrieves TPU system metadata given a TPUClusterResolver."""
+  master = tpu_cluster_resolver.master()
+
+  # pylint: disable=protected-access
+  cluster_spec = tpu_cluster_resolver.cluster_spec()
+  cluster_def = cluster_spec.as_cluster_def() if cluster_spec else None
+  tpu_system_metadata = (
+      tpu_system_metadata_lib._query_tpu_system_metadata(
+          master,
+          cluster_def=cluster_def,
+          query_topology=True))
+
+  return tpu_system_metadata
+
+
 class TPUStrategy(one_device_strategy.OneDeviceStrategy):
   """Experimental TPU distribution strategy implementation."""
 
-  def __init__(self, num_cores_per_host=2):
+  def __init__(self, tpu_cluster_resolver, steps_per_run):
+    """Initializes the TPUStrategy object.
+
+    Args:
+      tpu_cluster_resolver: A tf.contrib.cluster_resolver.TPUClusterResolver,
+          which provides information about the TPU cluster.
+      steps_per_run: Number of steps to run on device before returning to the
+          host. Note that this can have side-effects on performance, hooks,
+          metrics, summaries etc.
+          This parameter is only used when Distribution Strategy is used with
+          estimator or keras.
+    """
     # TODO(isaprykin): Generalize the defaults.  They are currently tailored for
     # the unit test.
     super(TPUStrategy, self).__init__('/device:CPU:0')
-    # TODO(isaprykin): Auto-detect number of cores and hosts.
-    self._num_cores_per_host = num_cores_per_host
+
+    self._tpu_cluster_resolver = tpu_cluster_resolver
+    self._tpu_metadata = get_tpu_system_metadata(self._tpu_cluster_resolver)
+
     # TODO(priyag): This should not be hardcoded here.
     self._host = '/device:CPU:0'
+    # TODO(sourabhbajaj): Remove this once performance of running one step
+    # at a time is comparable to multiple steps.
+    self.steps_per_run = steps_per_run
 
   def distribute_dataset(self, dataset_fn):
     # TODO(priyag): Perhaps distribute across cores here.
     return self._call_dataset_fn(dataset_fn)
 
-  # TODO(priyag): Deal with OutOfRange errors.
+  # TODO(priyag): Deal with OutOfRange errors once b/111349762 is fixed.
   # TODO(sourabhbajaj): Remove the initial_loop_values parameter when we have
   # a mechanism to infer the outputs of `fn`. Pending b/110550782.
   def _run_steps_on_dataset(self, fn, iterator, iterations,
@@ -72,7 +107,7 @@
       control_deps = []
       sharded_inputs = []
       with ops.device(self._host):
-        for _ in range(self._num_cores_per_host):
+        for _ in range(self.num_towers):
           # Use control dependencies to ensure a deterministic ordering.
           with ops.control_dependencies(control_deps):
             inputs = nest.flatten(iterator.get_next())
@@ -103,53 +138,103 @@
 
     # Wrap `fn` for repeat.
     if initial_loop_values is None:
-      initial_loop_values = []
-    ctx = values.MultiStepContext(initial_loop_values)
+      initial_loop_values = {}
+    initial_loop_values = nest.flatten(initial_loop_values)
+    ctx = values.MultiStepContext()
     def run_fn(*args, **kwargs):
       del args, kwargs
-      fn_result = fn(ctx, dequeue_fn())
-      if ctx.last_step_outputs is None:
-        ctx.last_step_outputs = []
-      with ops.control_dependencies([fn_result]):
-        return array_ops.identity(ctx.last_step_outputs)
+      fn_inputs = dequeue_fn()
+      if not isinstance(fn_inputs, tuple):
+        fn_inputs = (fn_inputs,)
+      fn_result = fn(ctx, *fn_inputs)
+      flat_last_step_outputs = nest.flatten(ctx.last_step_outputs)
+      if flat_last_step_outputs:
+        with ops.control_dependencies([fn_result]):
+          return [array_ops.identity(f) for f in flat_last_step_outputs]
+      else:
+        return fn_result
 
     # TODO(sourabhbajaj): The input to while loop should be based on the output
     # type of the step_fn
     def iterate_on_tpu():
-      return training_loop.repeat(iterations, run_fn, [initial_loop_values])
+      return training_loop.repeat(iterations, run_fn, initial_loop_values)
 
-    replicate_inputs = [[]] * self._num_cores_per_host
-    outputs = tpu.replicate(iterate_on_tpu, replicate_inputs)
-    last_step_tensor_outputs = [list(x) for x in zip(*outputs)]
+    # We capture the control_flow_context at this point, before we run `fn`
+    # inside a while_loop and TPU replicate context. This is useful in cases
+    # where we might need to exit these contexts and get back to the outer
+    # context to do some things, for e.g. create an op which should be
+    # evaluated only once at the end of the loop on the host. One such usage
+    # is in creating metrics' value op.
+    self._outer_control_flow_context = (
+        ops.get_default_graph()._get_control_flow_context())  # pylint: disable=protected-access
 
-    # Take index [0] of last_step_tensor_outputs as we wrapped
-    # initial_loop_values in a list in the `repeat` call.
-    return (control_flow_ops.group(last_step_tensor_outputs, enqueue_ops),
-            last_step_tensor_outputs[0], ctx)
+    replicate_inputs = [[]] * self.num_towers
+    replicate_outputs = tpu.replicate(iterate_on_tpu, replicate_inputs)
+    del self._outer_control_flow_context
+    ctx.run_op = control_flow_ops.group(replicate_outputs, enqueue_ops)
+
+    # Filter out any ops from the outputs, typically this would be the case
+    # when there were no tensor outputs.
+    last_step_tensor_outputs = [x for x in replicate_outputs
+                                if not isinstance(x, ops.Operation)]
+
+    # Outputs are currently of the structure (grouped by device)
+    # [[output0_device0, output1_device0, output2_device0],
+    #  [output0_device1, output1_device1, output2_device1]]
+    # Convert this to the following structure instead: (grouped by output)
+    # [[output0_device0, output0_device1],
+    #  [output1_device0, output1_device1],
+    #  [output2_device0, output2_device1]]
+    last_step_tensor_outputs = [list(x) for x in zip(*last_step_tensor_outputs)]
+
+    # Convert replicate_outputs to the original dict structure of
+    # last_step_outputs.
+    last_step_tensor_outputs_dict = nest.pack_sequence_as(
+        ctx.last_step_outputs, last_step_tensor_outputs)
+
+    for (name, aggregation) in ctx._last_step_outputs_aggregations.items():  # pylint: disable=protected-access
+      output = last_step_tensor_outputs_dict[name]
+      # For outputs that have already been aggregated, take the first value
+      # from the list as each value should be the same. Else return the full
+      # list of values.
+      if aggregation is not variables_lib.VariableAggregation.NONE:
+        # TODO(priyag): Should this return the element or a list with 1 element
+        last_step_tensor_outputs_dict[name] = output[0]
+    ctx._set_last_step_outputs(last_step_tensor_outputs_dict)  # pylint: disable=protected-access
+
+    return ctx
 
   def _call_for_each_tower(self, fn, *args, **kwargs):
     kwargs.pop('run_concurrently', None)
     with one_device_strategy._OneDeviceTowerContext(self):  # pylint: disable=protected-access
       return fn(*args, **kwargs)
 
-  def get_initialization_ops(self):
-    return [tpu.initialize_system()]
+  def initialize(self):
+    if context.executing_eagerly():
+      # TODO(priyag): Add appropriate call here when eager is supported for TPUs.
+      raise NotImplementedError('Eager mode not supported in TPUStrategy.')
+    else:
+      return [tpu.initialize_system()]
 
-  def get_finalize_ops(self):
-    return [tpu.shutdown_system()]
+  def finalize(self):
+    if context.executing_eagerly():
+      # TODO(priyag): Add appropriate call here when eager is supported for TPUs.
+      raise NotImplementedError('Eager mode not supported in TPUStrategy.')
+    else:
+      return [tpu.shutdown_system()]
 
   def _reduce(self, aggregation, value, destinations):
     graph = ops.get_default_graph()
-    context = graph._get_control_flow_context()  # pylint: disable=protected-access
+    cf_context = graph._get_control_flow_context()  # pylint: disable=protected-access
     # If we're inside the ReplicateContext, reduction should be done using
     # CrossReplicaSum while outside we can directly use an add_n op.
-    while context:
-      if isinstance(context, tpu.TPUReplicateContext):
+    while cf_context:
+      if isinstance(cf_context, tpu.TPUReplicateContext):
         if aggregation == vs.VariableAggregation.MEAN:
           # TODO(jhseu):  Revisit once we support model-parallelism.
-          value *= (1. / self._num_cores_per_host)
+          value *= (1. / self.num_towers)
         return tpu_ops.cross_replica_sum(value)
-      context = context.outer_context
+      cf_context = cf_context.outer_context
 
     # Validate that the destination is same as the host device
     # Note we don't do this when in replicate context as the reduction is
@@ -166,6 +251,11 @@
       return output * (1. / len(value))
     return output
 
+  def _unwrap(self, value):
+    if isinstance(value, list):
+      return value
+    return [value]
+
   @property
   def num_towers(self):
-    return self._num_cores_per_host
+    return self._tpu_metadata.num_of_cores_per_host
diff --git a/tensorflow/contrib/distribute/python/values.py b/tensorflow/contrib/distribute/python/values.py
index 6f34dd4..8548a86 100644
--- a/tensorflow/contrib/distribute/python/values.py
+++ b/tensorflow/contrib/distribute/python/values.py
@@ -35,8 +35,10 @@
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variable_scope as vs
+from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.training import device_util
 from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.training import saver
 from tensorflow.python.training.checkpointable import base as checkpointable
 from tensorflow.python.util import nest
@@ -55,7 +57,7 @@
   def get(self, device=None):
     """Returns the value for the current device or raises a ValueError."""
     if device is None:
-      tower_context = distribute_lib.get_tower_context()
+      tower_context = distribution_strategy_context.get_tower_context()
       if tower_context:
         device = tower_context.device
       else:
@@ -288,14 +290,15 @@
     # We want cross-tower code that does some var.op.X calls
     # to work (even if the current device isn't in self.devices), but
     # other uses of var.op in a cross-tower context to fail.
-    if distribute_lib.get_cross_tower_context():
+    if distribution_strategy_context.get_cross_tower_context():
       return DistributedVarOp(self._primary_var.op.name,
                               self._primary_var.op.graph,
                               self._primary_var.op.type)
     return self.get().op
 
   def read_value(self):
-    return distribute_lib.get_distribution_strategy().read_var(self)
+    return distribution_strategy_context.get_distribution_strategy().read_var(
+        self)
 
   def _should_act_as_resource_variable(self):
     """Pass resource_variable_ops.is_resource_variable check."""
@@ -361,7 +364,7 @@
   # update several non-slot variables in one call.
   def _assign_func(self, *args, **kwargs):
     f = kwargs.pop("f")
-    if distribute_lib.get_cross_tower_context():
+    if distribution_strategy_context.get_cross_tower_context():
       update_device = distribute_lib.get_update_device()
       # We are calling update on the mirrored variable in cross tower context.
       if update_device is not None:
@@ -370,7 +373,7 @@
         v = self.get(device=update_device)
         return f(v, *args, **kwargs)
 
-      return distribute_lib.get_distribution_strategy().update(
+      return distribution_strategy_context.get_distribution_strategy().update(
           self, f, *args, **kwargs)
     else:
       _assert_tower_context()
@@ -391,8 +394,8 @@
                 aggregation=self._aggregation, value=value, destinations=self),
             *other_args, **other_kwargs)
 
-      return distribute_lib.get_tower_context().merge_call(merge_fn, *args,
-                                                           **kwargs)
+      return distribution_strategy_context.get_tower_context().merge_call(
+          merge_fn, *args, **kwargs)
 
   def assign_sub(self, *args, **kwargs):
     assign_sub_fn = lambda var, *a, **kw: var.assign_sub(*a, **kw)
@@ -418,7 +421,7 @@
 
   def _as_graph_element(self):
     # pylint: disable=protected-access
-    if distribute_lib.get_cross_tower_context():
+    if distribution_strategy_context.get_cross_tower_context():
       return self._primary_var._as_graph_element()
     return self.get()._as_graph_element()
 
@@ -458,7 +461,7 @@
     # We use a callable so that we don't have to evaluate this expression
     # in the case where we are trying to restore instead of save.
     def tensor():
-      return distribute_lib.get_distribution_strategy().read_var(
+      return distribution_strategy_context.get_distribution_strategy().read_var(
           tower_local_variable)
     spec = saver.BaseSaverBuilder.SaveSpec(
         tensor=tensor,
@@ -474,7 +477,7 @@
 
 
 def _assert_tower_context():
-  if not distribute_lib.get_tower_context():
+  if not distribution_strategy_context.get_tower_context():
     raise RuntimeError(
         "Tower-local variables may only be assigned in a tower context.")
 
@@ -497,7 +500,7 @@
     return self.get().assign_add(*args, **kwargs)
 
   def assign(self, *args, **kwargs):
-    if distribute_lib.get_cross_tower_context():
+    if distribution_strategy_context.get_cross_tower_context():
       # To preserve the sum across save and restore, we have to divide the
       # total across all devices when restoring a variable that was summed
       # when saving.
@@ -525,7 +528,7 @@
 
   def _as_graph_element(self):
     # pylint: disable=protected-access
-    if distribute_lib.get_cross_tower_context():
+    if distribution_strategy_context.get_cross_tower_context():
       return self._get_cross_tower()
     return self.get()._as_graph_element()
 
@@ -934,67 +937,102 @@
 
   This context object is useful when running multiple steps at a time using the
   `run_steps_on_dataset` API. For e.g. it allows the user's step function to
-  specify which outputs to emit at what frequency. Currently it only supports
-  capturing output from the last step, but will soon be augmented to support
-  other use cases such as output each N steps.
+  specify which outputs to emit at what frequency. Currently it supports
+  capturing output from the last step, as well as capturing non tensor outputs.
+  In the future it will be augmented to support other use cases such as output
+  each N steps.
   """
 
-  def __init__(self, initial_loop_values=None):
+  def __init__(self):
     """Initializes an output context.
 
-    Args:
-      initial_loop_values: Initial values passed to the run steps
-        while loop. The only purpose is to verify the shapes and types
-        when the actual output is set. This will be removed once we
-        automatically infer the output shapes and types (and do not need to
-        check for user error in specifying them manually).
     Returns:
       A context object.
     """
-    self._last_step_outputs = None
-    self._non_tensor_outputs = None
-    self._initial_loop_values = initial_loop_values
+    self._last_step_outputs = {}
+    self._last_step_outputs_aggregations = {}
+    self._non_tensor_outputs = {}
 
   @property
   def last_step_outputs(self):
-    """Return the last step's outputs."""
+    """A dictionary consisting of outputs to be captured on last step.
+
+    Keys in the dictionary are names of tensors to be captured, as specified
+    when `set_last_step_output` is called.
+    Values in the dictionary are the tensors themselves. If
+    `set_last_step_output` was called with an `aggregation` for this output,
+    then the value is the aggregated value.
+
+    Returns:
+      A dictionary with last step outputs.
+    """
     return self._last_step_outputs
 
-  @last_step_outputs.setter
-  def last_step_outputs(self, outputs):
-    """Set the last step's outputs."""
-    self._verify_structure_shapes_types(outputs, self._initial_loop_values)
+  def _set_last_step_outputs(self, outputs):
+    """Replace the entire dictionary of last step outputs."""
+    if not isinstance(outputs, dict):
+      raise ValueError("Need a dictionary to set last_step_outputs.")
     self._last_step_outputs = outputs
 
+  def set_last_step_output(self, name, output,
+                           aggregation=variables_lib.VariableAggregation.NONE):
+    """Set `output` with `name` to be outputted from the last step.
+
+    Args:
+      name: String, name to identify the output. Doesn't need to match tensor
+        name.
+      output: The tensors that should be outputted with `name`. See below for
+        actual types supported.
+      aggregation: Aggregation method to use to aggregate outputs from multiple
+        towers. Required if `set_last_step_output` is called in a tower context.
+        Optional in cross_tower_context.
+        When present, the outputs from all the towers are aggregated using the
+        current distribution strategy's `reduce` method. Hence, the type of
+        `output` must be what's supported by the corresponding `reduce` method.
+        For e.g. if using MirroredStrategy and aggregation is set, output
+        must be a `PerDevice` value.
+        The aggregation method is also recorded in a dictionary
+        `_last_step_outputs_aggregations` for later interpreting of the
+        outputs as already reduced or not.
+
+    """
+    if distribution_strategy_context.get_cross_tower_context():
+      self._last_step_outputs_aggregations[name] = aggregation
+      if aggregation is variables_lib.VariableAggregation.NONE:
+        self._last_step_outputs[name] = output
+      else:
+        distribution = distribution_strategy_context.get_distribution_strategy()
+        self._last_step_outputs[name] = distribution.reduce(
+            aggregation, output, destinations="/device:CPU:0")
+    else:
+      assert aggregation is not variables_lib.VariableAggregation.NONE
+      def merge_fn(distribution, value):
+        self._last_step_outputs[name] = distribution.reduce(
+            aggregation, value, destinations="/device:CPU:0")
+        # Setting this inside the `merge_fn` because all towers share the same
+        # context object, so it's more robust to set it only once (even if all
+        # the towers are trying to set the same value).
+        self._last_step_outputs_aggregations[name] = aggregation
+
+      distribution_strategy_context.get_tower_context().merge_call(
+          merge_fn, output)
+
   @property
   def non_tensor_outputs(self):
-    """Return the non tensor outputs."""
+    """A dictionary consisting of any non tensor outputs to be captured."""
     return self._non_tensor_outputs
 
-  @non_tensor_outputs.setter
-  def non_tensor_outputs(self, outputs):
-    """Set any non tensor outputs."""
-    self._non_tensor_outputs = outputs
-
-  def _verify_structure_shapes_types(self, left, right):
-    """Verify that the structure, shapes and types of left are same as right."""
-    nest.assert_same_structure(left, right)
-    flat_left = nest.flatten(left)
-    flat_right = nest.flatten(right)
-    assert len(flat_left) == len(flat_right), (
-        "Length of left {} and right {} should be same.".
-        format(len(flat_left), len(flat_right)))
-
-    for o, i in zip(flat_left, flat_right):
-      # TODO(priyag): Add checks for other types like IndexedSlices.
-      if isinstance(o, ops.Tensor):
-        assert isinstance(i, ops.Tensor)
-        assert o.shape == i.shape, (
-            "Shape {} of left {} doesn't match shape {} of right {}.".
-            format(o.shape, o, i.shape, i))
-        assert o.dtype == i.dtype, (
-            "Dtype {} of left {} doesn't match dtype {} of right {}.".
-            format(o.dtype, o, i.dtype, i))
+  def set_non_tensor_output(self, name, output):
+    """Set `output` with `name` to be captured as a non tensor output."""
+    if distribution_strategy_context.get_cross_tower_context():
+      self._non_tensor_outputs[name] = output
+    else:
+      def merge_fn(distribution, value):
+        # NOTE(priyag): For non tensor outputs, we simply return all the values
+        # in a list as aggregation doesn't make sense on non tensors.
+        self._non_tensor_outputs[name] = distribution.unwrap(value)
+      distribution_strategy_context.get_tower_context().merge_call(
+          merge_fn, output)
 
 
 def value_container(val):
diff --git a/tensorflow/contrib/distributions/BUILD b/tensorflow/contrib/distributions/BUILD
index ad00d17..a8d0d49 100644
--- a/tensorflow/contrib/distributions/BUILD
+++ b/tensorflow/contrib/distributions/BUILD
@@ -124,7 +124,7 @@
 
 cuda_py_test(
     name = "conditional_distribution_test",
-    size = "small",
+    size = "medium",
     srcs = [
         "python/kernel_tests/conditional_distribution_test.py",
         "python/kernel_tests/distribution_test.py",
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/matrix_inverse_tril_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/matrix_inverse_tril_test.py
index 85d604e..49a9afe 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/matrix_inverse_tril_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/matrix_inverse_tril_test.py
@@ -29,6 +29,17 @@
 class MatrixInverseTriLBijectorTest(test.TestCase):
   """Tests the correctness of the Y = inv(tril) transformation."""
 
+  # The inverse of 0 is undefined; since the numbers above the main
+  # diagonal must be zero, we zero out these numbers after running inverse.
+  # See: https://github.com/numpy/numpy/issues/11445
+  def _inv(self, x):
+    y = np.linalg.inv(x)
+    # triu_indices only works on 2d arrays, so we need to iterate over
+    # all the 2d arrays in an x-dimensional array.
+    for idx in np.ndindex(y.shape[0:-2]):
+      y[idx][np.triu_indices(y[idx].shape[-1], 1)] = 0
+    return y
+
   @test_util.run_in_graph_and_eager_modes
   def testComputesCorrectValues(self):
     inv = bijectors.MatrixInverseTriL(validate_args=True)
@@ -98,7 +109,7 @@
                      [2., 3.]]],
                    [[[4., 0.],
                      [5., -6.]]]], dtype=np.float32)
-    x_inv_ = np.linalg.inv(x_)
+    x_inv_ = self._inv(x_)
     expected_fldj_ = -4. * np.sum(
         np.log(np.abs(np.diagonal(x_, axis1=-2, axis2=-1))), axis=-1)
 
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/deterministic_test.py b/tensorflow/contrib/distributions/python/kernel_tests/deterministic_test.py
index 90910f3..200310b 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/deterministic_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/deterministic_test.py
@@ -173,6 +173,13 @@
         self.assertAllClose(
             np.zeros(sample_shape_ + (2,)).astype(np.float32), sample_)
 
+  def testEntropy(self):
+    loc = np.array([-0.1, -3.2, 7.])
+    deterministic = deterministic_lib.Deterministic(loc=loc)
+    with self.test_session() as sess:
+      entropy_ = sess.run(deterministic.entropy())
+      self.assertAllEqual(np.zeros(3), entropy_)
+
 
 class VectorDeterministicTest(test.TestCase):
 
@@ -290,6 +297,13 @@
         self.assertAllClose(
             np.zeros(sample_shape_ + (2, 1)).astype(np.float32), sample_)
 
+  def testEntropy(self):
+    loc = np.array([[8.3, 1.2, 3.3], [-0.1, -3.2, 7.]])
+    deterministic = deterministic_lib.VectorDeterministic(loc=loc)
+    with self.test_session() as sess:
+      entropy_ = sess.run(deterministic.entropy())
+      self.assertAllEqual(np.zeros(2), entropy_)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distributions/python/ops/deterministic.py b/tensorflow/contrib/distributions/python/ops/deterministic.py
index ad853ee..affc64a 100644
--- a/tensorflow/contrib/distributions/python/ops/deterministic.py
+++ b/tensorflow/contrib/distributions/python/ops/deterministic.py
@@ -152,6 +152,9 @@
     """Relative tolerance for comparing points to `self.loc`."""
     return self._rtol
 
+  def _entropy(self):
+    return array_ops.zeros(self.batch_shape_tensor(), dtype=self.dtype)
+
   def _mean(self):
     return array_ops.identity(self.loc)
 
diff --git a/tensorflow/contrib/distributions/python/ops/sample_stats.py b/tensorflow/contrib/distributions/python/ops/sample_stats.py
index f5aaa5c..aa680a9 100644
--- a/tensorflow/contrib/distributions/python/ops/sample_stats.py
+++ b/tensorflow/contrib/distributions/python/ops/sample_stats.py
@@ -134,7 +134,7 @@
     x_len = util.prefer_static_shape(x_rotated)[-1]
 
     # TODO(langmore) Investigate whether this zero padding helps or hurts.  At
-    # the moment is is necessary so that all FFT implementations work.
+    # the moment is necessary so that all FFT implementations work.
     # Zero pad to the next power of 2 greater than 2 * x_len, which equals
     # 2**(ceil(Log_2(2 * x_len))).  Note: Log_2(X) = Log_e(X) / Log_e(2).
     x_len_float64 = math_ops.cast(x_len, np.float64)
@@ -198,7 +198,7 @@
     # Recall R[m] is a sum of N / 2 - m nonzero terms x[n] Conj(x[n - m]).  The
     # other terms were zeros arising only due to zero padding.
     # `denominator = (N / 2 - m)` (defined below) is the proper term to
-    # divide by by to make this an unbiased estimate of the expectation
+    # divide by to make this an unbiased estimate of the expectation
     # E[X[n] Conj(X[n - m])].
     x_len = math_ops.cast(x_len, dtype.real_dtype)
     max_lags = math_ops.cast(max_lags, dtype.real_dtype)
diff --git a/tensorflow/contrib/eager/python/BUILD b/tensorflow/contrib/eager/python/BUILD
index 0cc764d..f793363 100644
--- a/tensorflow/contrib/eager/python/BUILD
+++ b/tensorflow/contrib/eager/python/BUILD
@@ -199,7 +199,7 @@
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/eager:context",
-        "//tensorflow/python/estimator:util",
+        "//tensorflow/python/estimator:estimator_py",
     ],
 )
 
@@ -223,3 +223,17 @@
         "//tensorflow/python/eager:test",
     ],
 )
+
+py_test(
+    name = "remote_test",
+    srcs = ["remote_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/eager/python:tfe",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python/eager:function",
+    ],
+)
diff --git a/tensorflow/contrib/eager/python/datasets.py b/tensorflow/contrib/eager/python/datasets.py
index 16844e0..135095a 100644
--- a/tensorflow/contrib/eager/python/datasets.py
+++ b/tensorflow/contrib/eager/python/datasets.py
@@ -28,7 +28,7 @@
   """An iterator producing tf.Tensor objects from a tf.data.Dataset.
 
   NOTE: Unlike the iterator created by the
-  @{tf.data.Dataset.make_one_shot_iterator} method, this class enables
+  `tf.data.Dataset.make_one_shot_iterator` method, this class enables
   additional experimental functionality, such as prefetching to the GPU.
   """
 
diff --git a/tensorflow/contrib/eager/python/examples/generative_examples/cvae.ipynb b/tensorflow/contrib/eager/python/examples/generative_examples/cvae.ipynb
new file mode 100644
index 0000000..ca27a85
--- /dev/null
+++ b/tensorflow/contrib/eager/python/examples/generative_examples/cvae.ipynb
@@ -0,0 +1,649 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "0TD5ZrvEMbhZ"
+      },
+      "source": [
+        "##### Copyright 2018 The TensorFlow Authors.\n",
+        "\n",
+        "Licensed under the Apache License, Version 2.0 (the \"License\").\n",
+        "\n",
+        "# Convolutional VAE: An example with tf.keras and eager\n",
+        "\n",
+        "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\u003ctd\u003e\n",
+        "\u003ca target=\"_blank\"  href=\"https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/generative_examples/cvae.ipynb\"\u003e\n",
+        "    \u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e  \n",
+        "\u003c/td\u003e\u003ctd\u003e\n",
+        "\u003ca target=\"_blank\"  href=\"https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples/generative_examples/cvae.ipynb\"\u003e\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\u003c/table\u003e"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "ITZuApL56Mny"
+      },
+      "source": [
+        "![evolution of output during training](https://tensorflow.org/images/autoencoders/cvae.gif)\n",
+        "\n",
+        "This notebook demonstrates how to generate images of handwritten digits using [tf.keras](https://www.tensorflow.org/programmers_guide/keras) and [eager execution](https://www.tensorflow.org/programmers_guide/eager) by training a Variational Autoencoder (VAE, [[1]](https://arxiv.org/abs/1312.6114), [[2]](https://arxiv.org/abs/1401.4082)).\n",
+        "\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        },
+        "colab_type": "code",
+        "id": "P-JuIu2N_SQf"
+      },
+      "outputs": [],
+      "source": [
+        "# to generate gifs\n",
+        "!pip install imageio"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "e1_Y75QXJS6h"
+      },
+      "source": [
+        "## Import TensorFlow and enable Eager execution"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        },
+        "colab_type": "code",
+        "id": "YfIk2es3hJEd"
+      },
+      "outputs": [],
+      "source": [
+        "from __future__ import absolute_import, division, print_function\n",
+        "\n",
+        "# Import TensorFlow \u003e= 1.9 and enable eager execution\n",
+        "import tensorflow as tf\n",
+        "tfe = tf.contrib.eager\n",
+        "tf.enable_eager_execution()\n",
+        "\n",
+        "import os\n",
+        "import time\n",
+        "import numpy as np\n",
+        "import glob\n",
+        "import matplotlib.pyplot as plt\n",
+        "import PIL\n",
+        "import imageio\n",
+        "from IPython import display"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "iYn4MdZnKCey"
+      },
+      "source": [
+        "## Load the MNIST dataset\n",
+        "Each MNIST image is originally a vector of 784 integers, each of which is between 0-255 and represents the intensity of a pixel. We model each pixel with a Bernoulli distribution in our model, and we statically binarize the dataset."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        },
+        "colab_type": "code",
+        "id": "a4fYMGxGhrna"
+      },
+      "outputs": [],
+      "source": [
+        "(train_images, _), (test_images, _) = tf.keras.datasets.mnist.load_data()"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        },
+        "colab_type": "code",
+        "id": "NFC2ghIdiZYE"
+      },
+      "outputs": [],
+      "source": [
+        "train_images = train_images.reshape(train_images.shape[0], 28, 28, 1).astype('float32')\n",
+        "test_images = test_images.reshape(test_images.shape[0], 28, 28, 1).astype('float32')\n",
+        "\n",
+        "# Normalizing the images to the range of [0., 1.]\n",
+        "train_images /= 255.\n",
+        "test_images /= 255.\n",
+        "\n",
+        "# Binarization\n",
+        "train_images[train_images \u003e= .5] = 1.\n",
+        "train_images[train_images \u003c .5] = 0.\n",
+        "test_images[test_images \u003e= .5] = 1.\n",
+        "test_images[test_images \u003c .5] = 0."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        },
+        "colab_type": "code",
+        "id": "S4PIDhoDLbsZ"
+      },
+      "outputs": [],
+      "source": [
+        "TRAIN_BUF = 60000\n",
+        "BATCH_SIZE = 100\n",
+        "\n",
+        "TEST_BUF = 10000"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "PIGN6ouoQxt3"
+      },
+      "source": [
+        "## Use *tf.data* to create batches and shuffle the dataset"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        },
+        "colab_type": "code",
+        "id": "-yKCCQOoJ7cn"
+      },
+      "outputs": [],
+      "source": [
+        "train_dataset = tf.data.Dataset.from_tensor_slices(train_images).shuffle(TRAIN_BUF).batch(BATCH_SIZE)\n",
+        "test_dataset = tf.data.Dataset.from_tensor_slices(test_images).shuffle(TEST_BUF).batch(BATCH_SIZE)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "THY-sZMiQ4UV"
+      },
+      "source": [
+        "## Wire up the generative and inference network with *tf.keras.Sequential*\n",
+        "\n",
+        "In our VAE example, we use two small ConvNets for the generative and inference network. Since these neural nets are small, we use `tf.keras.Sequential` to simplify our code. Let $x$ and $z$ denote the observation and latent variable respectively in the following descriptions. \n",
+        "\n",
+        "### Generative Network\n",
+        "This defines the generative model which takes a latent encoding as input, and outputs the parameters for a conditional distribution of the observation, i.e. $p(x|z)$. Additionally, we use a unit Gaussian prior $p(z)$ for the latent variable.\n",
+        "\n",
+        "### Inference Network\n",
+        "This defines an approximate posterior distribution $q(z|x)$, which takes as input an observation and outputs a set of parameters for the conditional distribution of the latent representation. In this example, we simply model this distribution as a diagonal Gaussian. In this case, the inference network outputs the mean and log-variance parameters of a factorized Gaussian (log-variance instead of the variance directly is for numerical stability).\n",
+        "\n",
+        "### Reparameterization Trick\n",
+        "During optimization, we can sample from $q(z|x)$ by first sampling from a unit Gaussian, and then multiplying by the standard deviation and adding the mean. This ensures the gradients could pass through the sample to the inference network parameters.\n",
+        "\n",
+        "### Network architecture\n",
+        "For the inference network, we use two convolutional layers followed by a fully-connected layer. In the generative network, we mirror this architecture by using a fully-connected layer followed by three convolution transpose layers (a.k.a. deconvolutional layers in some contexts). Note, it's common practice to avoid using batch normalization when training VAEs, since the additional stochasticity due to using mini-batches may aggravate instability on top of the stochasticity from sampling."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        },
+        "colab_type": "code",
+        "id": "VGLbvBEmjK0a"
+      },
+      "outputs": [],
+      "source": [
+        "class CVAE(tf.keras.Model):\n",
+        "  def __init__(self, latent_dim):\n",
+        "    super(CVAE, self).__init__()\n",
+        "    self.latent_dim = latent_dim\n",
+        "    self.inference_net = tf.keras.Sequential(\n",
+        "      [\n",
+        "          tf.keras.layers.InputLayer(input_shape=(28, 28, 1)),\n",
+        "          tf.keras.layers.Conv2D(\n",
+        "              filters=32, kernel_size=3, strides=(2, 2), activation=tf.nn.relu),\n",
+        "          tf.keras.layers.Conv2D(\n",
+        "              filters=64, kernel_size=3, strides=(2, 2), activation=tf.nn.relu),\n",
+        "          tf.keras.layers.Flatten(),\n",
+        "          # No activation\n",
+        "          tf.keras.layers.Dense(latent_dim + latent_dim),\n",
+        "      ]\n",
+        "    )\n",
+        "\n",
+        "    self.generative_net = tf.keras.Sequential(\n",
+        "        [\n",
+        "          tf.keras.layers.InputLayer(input_shape=(latent_dim,)),\n",
+        "          tf.keras.layers.Dense(units=7*7*32, activation=tf.nn.relu),\n",
+        "          tf.keras.layers.Reshape(target_shape=(7, 7, 32)),\n",
+        "          tf.keras.layers.Conv2DTranspose(\n",
+        "              filters=64,\n",
+        "              kernel_size=3,\n",
+        "              strides=(2, 2),\n",
+        "              padding=\"SAME\",\n",
+        "              activation=tf.nn.relu),\n",
+        "          tf.keras.layers.Conv2DTranspose(\n",
+        "              filters=32,\n",
+        "              kernel_size=3,\n",
+        "              strides=(2, 2),\n",
+        "              padding=\"SAME\",\n",
+        "              activation=tf.nn.relu),\n",
+        "          # No activation\n",
+        "          tf.keras.layers.Conv2DTranspose(\n",
+        "              filters=1, kernel_size=3, strides=(1, 1), padding=\"SAME\"),\n",
+        "        ]\n",
+        "    )\n",
+        "\n",
+        "  def sample(self, eps=None):\n",
+        "    if eps is None:\n",
+        "      eps = tf.random_normal(shape=(100, self.latent_dim))\n",
+        "    return self.decode(eps, apply_sigmoid=True)\n",
+        "\n",
+        "  def encode(self, x):\n",
+        "    mean, logvar = tf.split(self.inference_net(x), num_or_size_splits=2, axis=1)\n",
+        "    return mean, logvar\n",
+        "\n",
+        "  def reparameterize(self, mean, logvar):\n",
+        "    eps = tf.random_normal(shape=mean.shape)\n",
+        "    return eps * tf.exp(logvar * .5) + mean\n",
+        "\n",
+        "  def decode(self, z, apply_sigmoid=False):\n",
+        "    logits = self.generative_net(z)\n",
+        "    if apply_sigmoid:\n",
+        "      probs = tf.sigmoid(logits)\n",
+        "      return probs\n",
+        "\n",
+        "    return logits"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "0FMYgY_mPfTi"
+      },
+      "source": [
+        "## Define the loss function and the optimizer\n",
+        "\n",
+        "VAEs train by maximizing the evidence lower bound (ELBO) on the marginal log-likelihood:\n",
+        "\n",
+        "$$\\log p(x) \\ge \\text{ELBO} = \\mathbb{E}_{q(z|x)}\\left[\\log \\frac{p(x, z)}{q(z|x)}\\right].$$\n",
+        "\n",
+        "In practice, we optimize the single sample Monte Carlo estimate of this expectation:\n",
+        "\n",
+        "$$\\log p(x| z) + \\log p(z) - \\log q(z|x),$$\n",
+        "where $z$ is sampled from $q(z|x)$.\n",
+        "\n",
+        "**Note**: we could also analytically compute the KL term, but here we incorporate all three terms in the Monte Carlo estimator for simplicity."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        },
+        "colab_type": "code",
+        "id": "iWCn_PVdEJZ7"
+      },
+      "outputs": [],
+      "source": [
+        "def log_normal_pdf(sample, mean, logvar, raxis=1):\n",
+        "  log2pi = tf.log(2. * np.pi)\n",
+        "  return tf.reduce_sum(\n",
+        "      -.5 * ((sample - mean) ** 2. * tf.exp(-logvar) + logvar + log2pi),\n",
+        "      axis=raxis)\n",
+        "\n",
+        "def compute_loss(model, x):\n",
+        "  mean, logvar = model.encode(x)\n",
+        "  z = model.reparameterize(mean, logvar)\n",
+        "  x_logit = model.decode(z)\n",
+        "\n",
+        "  cross_ent = tf.nn.sigmoid_cross_entropy_with_logits(logits=x_logit, labels=x)\n",
+        "  logpx_z = -tf.reduce_sum(cross_ent, axis=[1, 2, 3])\n",
+        "  logpz = log_normal_pdf(z, 0., 0.)\n",
+        "  logqz_x = log_normal_pdf(z, mean, logvar)\n",
+        "  return -tf.reduce_mean(logpx_z + logpz - logqz_x)\n",
+        "\n",
+        "def compute_gradients(model, x):\n",
+        "  with tf.GradientTape() as tape:\n",
+        "    loss = compute_loss(model, x)\n",
+        "  return tape.gradient(loss, model.trainable_variables), loss\n",
+        "\n",
+        "optimizer = tf.train.AdamOptimizer(1e-4)\n",
+        "def apply_gradients(optimizer, gradients, variables, global_step=None):\n",
+        "  optimizer.apply_gradients(zip(gradients, variables), global_step=global_step)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "Rw1fkAczTQYh"
+      },
+      "source": [
+        "## Training\n",
+        "\n",
+        "* We start by iterating over the dataset\n",
+        "* During each iteration, we pass the image to the encoder to obtain a set of mean and log-variance parameters of the approximate posterior $q(z|x)$\n",
+        "* We then apply the *reparameterization trick* to sample from $q(z|x)$\n",
+        "* Finally, we pass the reparameterized samples to the decoder to obtain the logits of the generative distribution $p(x|z)$\n",
+        "* **Note:** Since we use the dataset loaded by keras with 60k datapoints in the training set and 10k datapoints in the test set, our resulting ELBO on the test set is slightly higher than reported results in the literature which uses dynamic binarization of Larochelle's MNIST.\n",
+        "\n",
+        "## Generate Images\n",
+        "\n",
+        "* After training, it is time to generate some images\n",
+        "* We start by sampling a set of latent vectors from the unit Gaussian prior distribution $p(z)$\n",
+        "* The generator will then convert the latent sample $z$ to logits of the observation, giving a distribution $p(x|z)$\n",
+        "* Here we plot the probabilities of Bernoulli distributions\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        },
+        "colab_type": "code",
+        "id": "NS2GWywBbAWo"
+      },
+      "outputs": [],
+      "source": [
+        "epochs = 100\n",
+        "latent_dim = 50\n",
+        "num_examples_to_generate = 16\n",
+        "\n",
+        "# keeping the random vector constant for generation (prediction) so\n",
+        "# it will be easier to see the improvement.\n",
+        "random_vector_for_generation = tf.random_normal(\n",
+        "    shape=[num_examples_to_generate, latent_dim])\n",
+        "model = CVAE(latent_dim)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        },
+        "colab_type": "code",
+        "id": "RmdVsmvhPxyy"
+      },
+      "outputs": [],
+      "source": [
+        "def generate_and_save_images(model, epoch, test_input):\n",
+        "  predictions = model.sample(test_input)\n",
+        "  fig = plt.figure(figsize=(4,4))\n",
+        "\n",
+        "  for i in range(predictions.shape[0]):\n",
+        "      plt.subplot(4, 4, i+1)\n",
+        "      plt.imshow(predictions[i, :, :, 0], cmap='gray')\n",
+        "      plt.axis('off')\n",
+        "\n",
+        "  # tight_layout minimizes the overlap between 2 sub-plots\n",
+        "  plt.savefig('image_at_epoch_{:04d}.png'.format(epoch))\n",
+        "  plt.show()"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        },
+        "colab_type": "code",
+        "id": "2M7LmLtGEMQJ"
+      },
+      "outputs": [],
+      "source": [
+        "generate_and_save_images(model, 0, random_vector_for_generation)\n",
+        "\n",
+        "for epoch in range(1, epochs + 1):\n",
+        "  start_time = time.time()\n",
+        "  for train_x in train_dataset:\n",
+        "    gradients, loss = compute_gradients(model, train_x)\n",
+        "    apply_gradients(optimizer, gradients, model.trainable_variables)\n",
+        "  end_time = time.time()\n",
+        "\n",
+        "  if epoch % 1 == 0:\n",
+        "    loss = tfe.metrics.Mean()\n",
+        "    for test_x in test_dataset.make_one_shot_iterator():\n",
+        "      loss(compute_loss(model, test_x))\n",
+        "    elbo = -loss.result()\n",
+        "    display.clear_output(wait=False)\n",
+        "    print('Epoch: {}, Test set ELBO: {}, '\n",
+        "          'time elapse for current epoch {}'.format(epoch,\n",
+        "                                                    elbo,\n",
+        "                                                    end_time - start_time))\n",
+        "    generate_and_save_images(\n",
+        "        model, epoch, random_vector_for_generation)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "P4M_vIbUi7c0"
+      },
+      "source": [
+        "### Display an image using the epoch number"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        },
+        "colab_type": "code",
+        "id": "WfO5wCdclHGL"
+      },
+      "outputs": [],
+      "source": [
+        "def display_image(epoch_no):\n",
+        "  return PIL.Image.open('image_at_epoch_{:04d}.png'.format(epoch_no))"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        },
+        "colab_type": "code",
+        "id": "5x3q9_Oe5q0A"
+      },
+      "outputs": [],
+      "source": [
+        "display_image(epochs)  # Display images"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "NywiH3nL8guF"
+      },
+      "source": [
+        "### Generate a GIF of all the saved images."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        },
+        "colab_type": "code",
+        "id": "IGKQgENQ8lEI"
+      },
+      "outputs": [],
+      "source": [
+        "with imageio.get_writer('cvae.gif', mode='I') as writer:\n",
+        "  filenames = glob.glob('image*.png')\n",
+        "  filenames = sorted(filenames)\n",
+        "  last = -1\n",
+        "  for i,filename in enumerate(filenames):\n",
+        "    frame = 2*(i**0.5)\n",
+        "    if round(frame) \u003e round(last):\n",
+        "      last = frame\n",
+        "    else:\n",
+        "      continue\n",
+        "    image = imageio.imread(filename)\n",
+        "    writer.append_data(image)\n",
+        "  image = imageio.imread(filename)\n",
+        "  writer.append_data(image)\n",
+        "    \n",
+        "# this is a hack to display the gif inside the notebook\n",
+        "os.system('cp cvae.gif cvae.gif.png')"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        },
+        "colab_type": "code",
+        "id": "uV0yiKpzNP1b"
+      },
+      "outputs": [],
+      "source": [
+        "display.Image(filename=\"cvae.gif.png\")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "yQXO_dlXkKsT"
+      },
+      "source": [
+        "To download the animation from Colab, uncomment the code below:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        },
+        "colab_type": "code",
+        "id": "4fSJS3m5HLFM"
+      },
+      "outputs": [],
+      "source": [
+        "#from google.colab import files\n",
+        "#files.download('cvae.gif')"
+      ]
+    }
+  ],
+  "metadata": {
+    "accelerator": "GPU",
+    "colab": {
+      "collapsed_sections": [],
+      "default_view": {},
+      "name": "cvae.ipynb",
+      "private_outputs": true,
+      "provenance": [
+        {
+          "file_id": "1eb0NOTQapkYs3X0v-zL1x5_LFKgDISnp",
+          "timestamp": 1527173385672
+        }
+      ],
+      "toc_visible": true,
+      "version": "0.3.2",
+      "views": {}
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
diff --git a/tensorflow/contrib/eager/python/examples/generative_examples/dcgan.ipynb b/tensorflow/contrib/eager/python/examples/generative_examples/dcgan.ipynb
index 44ff43a..5621d6a 100644
--- a/tensorflow/contrib/eager/python/examples/generative_examples/dcgan.ipynb
+++ b/tensorflow/contrib/eager/python/examples/generative_examples/dcgan.ipynb
@@ -40,12 +40,7 @@
       "cell_type": "code",
       "execution_count": 0,
       "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
+        "colab": {},
         "colab_type": "code",
         "id": "u_2z-B3piVsw"
       },
@@ -69,12 +64,7 @@
       "cell_type": "code",
       "execution_count": 0,
       "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
+        "colab": {},
         "colab_type": "code",
         "id": "YfIk2es3hJEd"
       },
@@ -82,7 +72,7 @@
       "source": [
         "from __future__ import absolute_import, division, print_function\n",
         "\n",
-        "# Import TensorFlow \u003e= 1.9 and enable eager execution\n",
+        "# Import TensorFlow \u003e= 1.10 and enable eager execution\n",
         "import tensorflow as tf\n",
         "tf.enable_eager_execution()\n",
         "\n",
@@ -112,12 +102,7 @@
       "cell_type": "code",
       "execution_count": 0,
       "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
+        "colab": {},
         "colab_type": "code",
         "id": "a4fYMGxGhrna"
       },
@@ -130,12 +115,7 @@
       "cell_type": "code",
       "execution_count": 0,
       "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
+        "colab": {},
         "colab_type": "code",
         "id": "NFC2ghIdiZYE"
       },
@@ -150,12 +130,7 @@
       "cell_type": "code",
       "execution_count": 0,
       "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
+        "colab": {},
         "colab_type": "code",
         "id": "S4PIDhoDLbsZ"
       },
@@ -179,12 +154,7 @@
       "cell_type": "code",
       "execution_count": 0,
       "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
+        "colab": {},
         "colab_type": "code",
         "id": "-yKCCQOoJ7cn"
       },
@@ -217,12 +187,7 @@
       "cell_type": "code",
       "execution_count": 0,
       "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
+        "colab": {},
         "colab_type": "code",
         "id": "VGLbvBEmjK0a"
       },
@@ -265,12 +230,7 @@
       "cell_type": "code",
       "execution_count": 0,
       "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
+        "colab": {},
         "colab_type": "code",
         "id": "bkOfJxk5j5Hi"
       },
@@ -299,12 +259,7 @@
       "cell_type": "code",
       "execution_count": 0,
       "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
+        "colab": {},
         "colab_type": "code",
         "id": "gDkA05NE6QMs"
       },
@@ -318,12 +273,7 @@
       "cell_type": "code",
       "execution_count": 0,
       "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
+        "colab": {},
         "colab_type": "code",
         "id": "k1HpMSLImuRi"
       },
@@ -360,12 +310,7 @@
       "cell_type": "code",
       "execution_count": 0,
       "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
+        "colab": {},
         "colab_type": "code",
         "id": "wkMNfBWlT-PV"
       },
@@ -388,12 +333,7 @@
       "cell_type": "code",
       "execution_count": 0,
       "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
+        "colab": {},
         "colab_type": "code",
         "id": "90BIcCKcDMxz"
       },
@@ -407,12 +347,7 @@
       "cell_type": "code",
       "execution_count": 0,
       "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
+        "colab": {},
         "colab_type": "code",
         "id": "iWCn_PVdEJZ7"
       },
@@ -426,6 +361,34 @@
       "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
+        "id": "mWtinsGDPJlV"
+      },
+      "source": [
+        "## Checkpoints (Object-based saving)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "CA1w-7s2POEy"
+      },
+      "outputs": [],
+      "source": [
+        "checkpoint_dir = './training_checkpoints'\n",
+        "checkpoint_prefix = os.path.join(checkpoint_dir, \"ckpt\")\n",
+        "checkpoint = tf.train.Checkpoint(generator_optimizer=generator_optimizer,\n",
+        "                                 discriminator_optimizer=discriminator_optimizer,\n",
+        "                                 generator=generator,\n",
+        "                                 discriminator=discriminator)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
         "id": "Rw1fkAczTQYh"
       },
       "source": [
@@ -449,12 +412,7 @@
       "cell_type": "code",
       "execution_count": 0,
       "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
+        "colab": {},
         "colab_type": "code",
         "id": "NS2GWywBbAWo"
       },
@@ -462,7 +420,7 @@
       "source": [
         "EPOCHS = 150\n",
         "noise_dim = 100\n",
-        "num_examples_to_generate = 100\n",
+        "num_examples_to_generate = 16\n",
         "\n",
         "# keeping the random vector constant for generation (prediction) so\n",
         "# it will be easier to see the improvement of the gan.\n",
@@ -474,12 +432,7 @@
       "cell_type": "code",
       "execution_count": 0,
       "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
+        "colab": {},
         "colab_type": "code",
         "id": "RmdVsmvhPxyy"
       },
@@ -490,15 +443,13 @@
         "  # don't want to train the batchnorm layer when doing inference.\n",
         "  predictions = model(test_input, training=False)\n",
         "\n",
-        "  fig = plt.figure(figsize=(10,10))\n",
+        "  fig = plt.figure(figsize=(4,4))\n",
         "  \n",
         "  for i in range(predictions.shape[0]):\n",
-        "      plt.subplot(10, 10, i+1)\n",
+        "      plt.subplot(4, 4, i+1)\n",
         "      plt.imshow(predictions[i, :, :, 0] * 127.5 + 127.5, cmap='gray')\n",
         "      plt.axis('off')\n",
         "        \n",
-        "  # tight_layout minimizes the overlap between 2 sub-plots\n",
-        "  plt.tight_layout()\n",
         "  plt.savefig('image_at_epoch_{:04d}.png'.format(epoch))\n",
         "  plt.show()"
       ]
@@ -507,12 +458,7 @@
       "cell_type": "code",
       "execution_count": 0,
       "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
+        "colab": {},
         "colab_type": "code",
         "id": "2M7LmLtGEMQJ"
       },
@@ -542,15 +488,20 @@
         "      discriminator_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.variables))\n",
         "\n",
         "      \n",
-        "    if epoch % 10 == 0:\n",
+        "    if epoch % 1 == 0:\n",
         "      display.clear_output(wait=True)\n",
         "      generate_and_save_images(generator,\n",
         "                               epoch + 1,\n",
         "                               random_vector_for_generation)\n",
-        "\n",
+        "    \n",
+        "    # saving (checkpoint) the model every 15 epochs\n",
+        "    if (epoch + 1) % 15 == 0:\n",
+        "      checkpoint.save(file_prefix = checkpoint_prefix)\n",
+        "    \n",
         "    print ('Time taken for epoch {} is {} sec'.format(epoch + 1,\n",
         "                                                      time.time()-start))\n",
         "  # generating after the final epoch\n",
+        "  display.clear_output(wait=True)\n",
         "  generate_and_save_images(generator,\n",
         "                           epochs,\n",
         "                           random_vector_for_generation)"
@@ -560,12 +511,7 @@
       "cell_type": "code",
       "execution_count": 0,
       "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
+        "colab": {},
         "colab_type": "code",
         "id": "Ly3UN0SLLY2l"
       },
@@ -578,43 +524,55 @@
       "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
-        "id": "P4M_vIbUi7c0"
+        "id": "rfM4YcPVPkNO"
       },
       "source": [
-        "# Display an image using the epoch number"
+        "## Restore the latest checkpoint"
       ]
     },
     {
       "cell_type": "code",
       "execution_count": 0,
       "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
+        "colab": {},
+        "colab_type": "code",
+        "id": "XhXsd0srPo8c"
+      },
+      "outputs": [],
+      "source": [
+        "# restoring the latest checkpoint in checkpoint_dir\n",
+        "checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "P4M_vIbUi7c0"
+      },
+      "source": [
+        "## Display an image using the epoch number"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
         "colab_type": "code",
         "id": "WfO5wCdclHGL"
       },
       "outputs": [],
       "source": [
         "def display_image(epoch_no):\n",
-        "  plt.figure(figsize=(15,15))\n",
-        "  plt.imshow(np.array(PIL.Image.open('image_at_epoch_{:04d}.png'.format(epoch_no))))\n",
-        "  plt.axis('off')"
+        "  return PIL.Image.open('image_at_epoch_{:04d}.png'.format(epoch_no))"
       ]
     },
     {
       "cell_type": "code",
       "execution_count": 0,
       "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
+        "colab": {},
         "colab_type": "code",
         "id": "5x3q9_Oe5q0A"
       },
@@ -647,12 +605,7 @@
       "cell_type": "code",
       "execution_count": 0,
       "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
+        "colab": {},
         "colab_type": "code",
         "id": "IGKQgENQ8lEI"
       },
@@ -661,23 +614,27 @@
         "with imageio.get_writer('dcgan.gif', mode='I') as writer:\n",
         "  filenames = glob.glob('image*.png')\n",
         "  filenames = sorted(filenames)\n",
-        "  for filename in filenames:\n",
+        "  last = -1\n",
+        "  for i,filename in enumerate(filenames):\n",
+        "    frame = 2*(i**0.5)\n",
+        "    if round(frame) \u003e round(last):\n",
+        "      last = frame\n",
+        "    else:\n",
+        "      continue\n",
         "    image = imageio.imread(filename)\n",
         "    writer.append_data(image)\n",
-        "  # this is a hack to display the gif inside the notebook\n",
-        "  os.system('mv dcgan.gif dcgan.gif.png')"
+        "  image = imageio.imread(filename)\n",
+        "  writer.append_data(image)\n",
+        "    \n",
+        "# this is a hack to display the gif inside the notebook\n",
+        "os.system('cp dcgan.gif dcgan.gif.png')"
       ]
     },
     {
       "cell_type": "code",
       "execution_count": 0,
       "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
+        "colab": {},
         "colab_type": "code",
         "id": "uV0yiKpzNP1b"
       },
@@ -687,21 +644,27 @@
       ]
     },
     {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "6EEG-wePkmJQ"
+      },
+      "source": [
+        "To download the animation from Colab, uncomment the code below:"
+      ]
+    },
+    {
       "cell_type": "code",
       "execution_count": 0,
       "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
+        "colab": {},
         "colab_type": "code",
         "id": "4UJjSnIMOzOJ"
       },
       "outputs": [],
       "source": [
-        ""
+        "#from google.colab import files\n",
+        "#files.download('dcgan.gif')"
       ]
     }
   ],
@@ -709,7 +672,6 @@
     "accelerator": "GPU",
     "colab": {
       "collapsed_sections": [],
-      "default_view": {},
       "name": "dcgan.ipynb",
       "private_outputs": true,
       "provenance": [
@@ -719,8 +681,7 @@
         }
       ],
       "toc_visible": true,
-      "version": "0.3.2",
-      "views": {}
+      "version": "0.3.2"
     },
     "kernelspec": {
       "display_name": "Python 3",
diff --git a/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb b/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb
index b173f85..0270979 100644
--- a/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb
+++ b/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb
@@ -96,12 +96,7 @@
       "cell_type": "code",
       "execution_count": 0,
       "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
+        "colab": {},
         "colab_type": "code",
         "id": "wZ6LOM12wKGH"
       },
@@ -124,24 +119,20 @@
       "cell_type": "code",
       "execution_count": 0,
       "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
+        "colab": {},
         "colab_type": "code",
         "id": "yG_n40gFzf9s"
       },
       "outputs": [],
       "source": [
-        "# Import TensorFlow \u003e= 1.9 and enable eager execution\n",
+        "# Import TensorFlow \u003e= 1.10 and enable eager execution\n",
         "import tensorflow as tf\n",
         "\n",
         "# Note: Once you enable eager execution, it cannot be disabled. \n",
         "tf.enable_eager_execution()\n",
         "\n",
         "import numpy as np\n",
+        "import os\n",
         "import re\n",
         "import random\n",
         "import unidecode\n",
@@ -165,12 +156,7 @@
       "cell_type": "code",
       "execution_count": 0,
       "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
+        "colab": {},
         "colab_type": "code",
         "id": "pD_55cOxLkAb"
       },
@@ -194,12 +180,7 @@
       "cell_type": "code",
       "execution_count": 0,
       "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
+        "colab": {},
         "colab_type": "code",
         "id": "-E5JvY3wzf94"
       },
@@ -224,12 +205,7 @@
       "cell_type": "code",
       "execution_count": 0,
       "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
+        "colab": {},
         "colab_type": "code",
         "id": "IalZLbvOzf-F"
       },
@@ -247,12 +223,7 @@
       "cell_type": "code",
       "execution_count": 0,
       "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
+        "colab": {},
         "colab_type": "code",
         "id": "1v_qUYfAzf-I"
       },
@@ -302,12 +273,7 @@
       "cell_type": "code",
       "execution_count": 0,
       "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
+        "colab": {},
         "colab_type": "code",
         "id": "0UHJDA39zf-O"
       },
@@ -341,19 +307,14 @@
       "cell_type": "code",
       "execution_count": 0,
       "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
+        "colab": {},
         "colab_type": "code",
         "id": "p2pGotuNzf-S"
       },
       "outputs": [],
       "source": [
         "dataset = tf.data.Dataset.from_tensor_slices((input_text, target_text)).shuffle(BUFFER_SIZE)\n",
-        "dataset = dataset.apply(tf.contrib.data.batch_and_drop_remainder(BATCH_SIZE))"
+        "dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)"
       ]
     },
     {
@@ -376,12 +337,7 @@
       "cell_type": "code",
       "execution_count": 0,
       "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
+        "colab": {},
         "colab_type": "code",
         "id": "P3KTiiInzf-a"
       },
@@ -445,12 +401,7 @@
       "cell_type": "code",
       "execution_count": 0,
       "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
+        "colab": {},
         "colab_type": "code",
         "id": "7t2XrzEOzf-e"
       },
@@ -463,12 +414,7 @@
       "cell_type": "code",
       "execution_count": 0,
       "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
+        "colab": {},
         "colab_type": "code",
         "id": "dkjWIATszf-h"
       },
@@ -485,6 +431,32 @@
       "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
+        "id": "3K6s6F79P7za"
+      },
+      "source": [
+        "## Checkpoints (Object-based saving)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "oAGisDdfP9rL"
+      },
+      "outputs": [],
+      "source": [
+        "checkpoint_dir = './training_checkpoints'\n",
+        "checkpoint_prefix = os.path.join(checkpoint_dir, \"ckpt\")\n",
+        "checkpoint = tf.train.Checkpoint(optimizer=optimizer,\n",
+        "                                 model=model)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
         "id": "lPrP0XMUzf-p"
       },
       "source": [
@@ -514,12 +486,7 @@
       "cell_type": "code",
       "execution_count": 0,
       "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
+        "colab": {},
         "colab_type": "code",
         "id": "d4tSNwymzf-q"
       },
@@ -527,7 +494,7 @@
       "source": [
         "# Training step\n",
         "\n",
-        "EPOCHS = 30\n",
+        "EPOCHS = 20\n",
         "\n",
         "for epoch in range(EPOCHS):\n",
         "    start = time.time()\n",
@@ -547,13 +514,16 @@
         "              loss = loss_function(target, predictions)\n",
         "              \n",
         "          grads = tape.gradient(loss, model.variables)\n",
-        "          optimizer.apply_gradients(zip(grads, model.variables), global_step=tf.train.get_or_create_global_step())\n",
+        "          optimizer.apply_gradients(zip(grads, model.variables))\n",
         "\n",
         "          if batch % 100 == 0:\n",
         "              print ('Epoch {} Batch {} Loss {:.4f}'.format(epoch+1,\n",
         "                                                            batch,\n",
         "                                                            loss))\n",
-        "    \n",
+        "    # saving (checkpoint) the model every 5 epochs\n",
+        "    if (epoch + 1) % 5 == 0:\n",
+        "      checkpoint.save(file_prefix = checkpoint_prefix)\n",
+        "\n",
         "    print ('Epoch {} Loss {:.4f}'.format(epoch+1, loss))\n",
         "    print('Time taken for 1 epoch {} sec\\n'.format(time.time() - start))"
       ]
@@ -562,6 +532,30 @@
       "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
+        "id": "01AR9vpNQMFF"
+      },
+      "source": [
+        "## Restore the latest checkpoint"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "tyvpYomYQQkF"
+      },
+      "outputs": [],
+      "source": [
+        "# restoring the latest checkpoint in checkpoint_dir\n",
+        "checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
         "id": "DjGz1tDkzf-u"
       },
       "source": [
@@ -584,12 +578,7 @@
       "cell_type": "code",
       "execution_count": 0,
       "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
+        "colab": {},
         "colab_type": "code",
         "id": "WvuwZBX5Ogfd"
       },
@@ -651,12 +640,7 @@
       "cell_type": "code",
       "execution_count": 0,
       "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
+        "colab": {},
         "colab_type": "code",
         "id": "gtEd86sX5cB2"
       },
@@ -670,13 +654,11 @@
     "accelerator": "GPU",
     "colab": {
       "collapsed_sections": [],
-      "default_view": {},
       "name": "text_generation.ipynb",
       "private_outputs": true,
       "provenance": [],
       "toc_visible": true,
-      "version": "0.3.2",
-      "views": {}
+      "version": "0.3.2"
     },
     "kernelspec": {
       "display_name": "Python 3",
diff --git a/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb b/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb
index 1ab1b71..08d8364 100644
--- a/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb
+++ b/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb
@@ -1,39 +1,11 @@
 {
-  "nbformat": 4,
-  "nbformat_minor": 0,
-  "metadata": {
-    "colab": {
-      "name": "nmt_with_attention.ipynb",
-      "version": "0.3.2",
-      "views": {},
-      "default_view": {},
-      "provenance": [
-        {
-          "file_id": "1C4fpM7_7IL8ZzF7Gc5abywqQjeQNS2-U",
-          "timestamp": 1527858391290
-        },
-        {
-          "file_id": "1pExo6aUuw0S6MISFWoinfJv0Ftm9V4qv",
-          "timestamp": 1527776041613
-        }
-      ],
-      "private_outputs": true,
-      "collapsed_sections": [],
-      "toc_visible": true
-    },
-    "kernelspec": {
-      "name": "python3",
-      "display_name": "Python 3"
-    },
-    "accelerator": "GPU"
-  },
   "cells": [
     {
-      "metadata": {
-        "id": "AOpGoE2T-YXS",
-        "colab_type": "text"
-      },
       "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "AOpGoE2T-YXS"
+      },
       "source": [
         "##### Copyright 2018 The TensorFlow Authors.\n",
         "\n",
@@ -41,19 +13,19 @@
         "\n",
         "# Neural Machine Translation with Attention\n",
         "\n",
-        "<table class=\"tfo-notebook-buttons\" align=\"left\"><td>\n",
-        "<a target=\"_blank\"  href=\"https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb\">\n",
-        "    <img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>  \n",
-        "</td><td>\n",
-        "<a target=\"_blank\"  href=\"https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb\"><img width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a></td></table>"
+        "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\u003ctd\u003e\n",
+        "\u003ca target=\"_blank\"  href=\"https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb\"\u003e\n",
+        "    \u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e  \n",
+        "\u003c/td\u003e\u003ctd\u003e\n",
+        "\u003ca target=\"_blank\"  href=\"https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb\"\u003e\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\u003c/table\u003e"
       ]
     },
     {
-      "metadata": {
-        "id": "CiwtNgENbx2g",
-        "colab_type": "text"
-      },
       "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "CiwtNgENbx2g"
+      },
       "source": [
         "This notebook trains a sequence to sequence (seq2seq) model for Spanish to English translation using [tf.keras](https://www.tensorflow.org/programmers_guide/keras) and [eager execution](https://www.tensorflow.org/programmers_guide/eager). This is an advanced example that assumes some knowledge of sequence to sequence models.\n",
         "\n",
@@ -61,27 +33,24 @@
         "\n",
         "The translation quality is reasonable for a toy example, but the generated attention plot is perhaps more interesting. This shows which parts of the input sentence has the model's attention while translating:\n",
         "\n",
-        "<img src=\"https://tensorflow.org/images/spanish-english.png\" alt=\"spanish-english attention plot\">\n",
+        "\u003cimg src=\"https://tensorflow.org/images/spanish-english.png\" alt=\"spanish-english attention plot\"\u003e\n",
         "\n",
         "Note: This example takes approximately 10 mintues to run on a single P100 GPU."
       ]
     },
     {
-      "metadata": {
-        "id": "tnxXKDjq3jEL",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
       "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "tnxXKDjq3jEL"
+      },
+      "outputs": [],
       "source": [
         "from __future__ import absolute_import, division, print_function\n",
         "\n",
-        "# Import TensorFlow >= 1.9 and enable eager execution\n",
+        "# Import TensorFlow \u003e= 1.10 and enable eager execution\n",
         "import tensorflow as tf\n",
         "\n",
         "tf.enable_eager_execution()\n",
@@ -96,16 +65,14 @@
         "import time\n",
         "\n",
         "print(tf.__version__)"
-      ],
-      "execution_count": 0,
-      "outputs": []
+      ]
     },
     {
-      "metadata": {
-        "id": "wfodePkj3jEa",
-        "colab_type": "text"
-      },
       "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "wfodePkj3jEa"
+      },
       "source": [
         "## Download and prepare the dataset\n",
         "\n",
@@ -124,17 +91,14 @@
       ]
     },
     {
-      "metadata": {
-        "id": "kRVATYOgJs1b",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
       "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "kRVATYOgJs1b"
+      },
+      "outputs": [],
       "source": [
         "# Download the file\n",
         "path_to_zip = tf.keras.utils.get_file(\n",
@@ -142,22 +106,17 @@
         "    extract=True)\n",
         "\n",
         "path_to_file = os.path.dirname(path_to_zip)+\"/spa-eng/spa.txt\""
-      ],
-      "execution_count": 0,
-      "outputs": []
+      ]
     },
     {
-      "metadata": {
-        "id": "rd0jw-eC3jEh",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
       "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "rd0jw-eC3jEh"
+      },
+      "outputs": [],
       "source": [
         "# Converts the unicode file to ascii\n",
         "def unicode_to_ascii(s):\n",
@@ -169,7 +128,7 @@
         "    w = unicode_to_ascii(w.lower().strip())\n",
         "    \n",
         "    # creating a space between a word and the punctuation following it\n",
-        "    # eg: \"he is a boy.\" => \"he is a boy .\" \n",
+        "    # eg: \"he is a boy.\" =\u003e \"he is a boy .\" \n",
         "    # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation\n",
         "    w = re.sub(r\"([?.!,¿])\", r\" \\1 \", w)\n",
         "    w = re.sub(r'[\" \"]+', \" \", w)\n",
@@ -181,24 +140,19 @@
         "    \n",
         "    # adding a start and an end token to the sentence\n",
         "    # so that the model know when to start and stop predicting.\n",
-        "    w = '<start> ' + w + ' <end>'\n",
+        "    w = '\u003cstart\u003e ' + w + ' \u003cend\u003e'\n",
         "    return w"
-      ],
-      "execution_count": 0,
-      "outputs": []
+      ]
     },
     {
-      "metadata": {
-        "id": "OHn4Dct23jEm",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
       "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "OHn4Dct23jEm"
+      },
+      "outputs": [],
       "source": [
         "# 1. Remove the accents\n",
         "# 2. Clean the sentences\n",
@@ -209,25 +163,20 @@
         "    word_pairs = [[preprocess_sentence(w) for w in l.split('\\t')]  for l in lines[:num_examples]]\n",
         "    \n",
         "    return word_pairs"
-      ],
-      "execution_count": 0,
-      "outputs": []
+      ]
     },
     {
-      "metadata": {
-        "id": "9xbqO7Iie9bb",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
       "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "9xbqO7Iie9bb"
+      },
+      "outputs": [],
       "source": [
-        "# This class creates a word -> index mapping (e.g,. \"dad\" -> 5) and vice-versa \n",
-        "# (e.g., 5 -> \"dad\") for each language,\n",
+        "# This class creates a word -\u003e index mapping (e.g,. \"dad\" -\u003e 5) and vice-versa \n",
+        "# (e.g., 5 -\u003e \"dad\") for each language,\n",
         "class LanguageIndex():\n",
         "  def __init__(self, lang):\n",
         "    self.lang = lang\n",
@@ -243,28 +192,23 @@
         "    \n",
         "    self.vocab = sorted(self.vocab)\n",
         "    \n",
-        "    self.word2idx['<pad>'] = 0\n",
+        "    self.word2idx['\u003cpad\u003e'] = 0\n",
         "    for index, word in enumerate(self.vocab):\n",
         "      self.word2idx[word] = index + 1\n",
         "    \n",
         "    for word, index in self.word2idx.items():\n",
         "      self.idx2word[index] = word"
-      ],
-      "execution_count": 0,
-      "outputs": []
+      ]
     },
     {
-      "metadata": {
-        "id": "eAY9k49G3jE_",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
       "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "eAY9k49G3jE_"
+      },
+      "outputs": [],
       "source": [
         "def max_length(tensor):\n",
         "    return max(len(t) for t in tensor)\n",
@@ -300,86 +244,71 @@
         "                                                                  padding='post')\n",
         "    \n",
         "    return input_tensor, target_tensor, inp_lang, targ_lang, max_length_inp, max_length_tar"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "id": "GOi42V79Ydlr",
-        "colab_type": "text"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "### Limit the size of the dataset to experiment faster (optional)\n",
-        "\n",
-        "Training on the complete dataset of >100,000 sentences will take a long time. To train faster, we can limit the size of the dataset to 30,000 sentences (of course, translation quality degrades with less data):"
       ]
     },
     {
+      "cell_type": "markdown",
       "metadata": {
-        "id": "cnxC7q-j3jFD",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
+        "colab_type": "text",
+        "id": "GOi42V79Ydlr"
       },
+      "source": [
+        "### Limit the size of the dataset to experiment faster (optional)\n",
+        "\n",
+        "Training on the complete dataset of \u003e100,000 sentences will take a long time. To train faster, we can limit the size of the dataset to 30,000 sentences (of course, translation quality degrades with less data):"
+      ]
+    },
+    {
       "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "cnxC7q-j3jFD"
+      },
+      "outputs": [],
       "source": [
         "# Try experimenting with the size of that dataset\n",
         "num_examples = 30000\n",
         "input_tensor, target_tensor, inp_lang, targ_lang, max_length_inp, max_length_targ = load_dataset(path_to_file, num_examples)"
-      ],
-      "execution_count": 0,
-      "outputs": []
+      ]
     },
     {
-      "metadata": {
-        "id": "4QILQkOs3jFG",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
       "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "4QILQkOs3jFG"
+      },
+      "outputs": [],
       "source": [
         "# Creating training and validation sets using an 80-20 split\n",
         "input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)\n",
         "\n",
         "# Show length\n",
         "len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val)"
-      ],
-      "execution_count": 0,
-      "outputs": []
+      ]
     },
     {
-      "metadata": {
-        "id": "rgCLkfv5uO3d",
-        "colab_type": "text"
-      },
       "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "rgCLkfv5uO3d"
+      },
       "source": [
         "### Create a tf.data dataset"
       ]
     },
     {
-      "metadata": {
-        "id": "TqHsArVZ3jFS",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
       "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "TqHsArVZ3jFS"
+      },
+      "outputs": [],
       "source": [
         "BUFFER_SIZE = len(input_tensor_train)\n",
         "BATCH_SIZE = 64\n",
@@ -390,30 +319,28 @@
         "vocab_tar_size = len(targ_lang.word2idx)\n",
         "\n",
         "dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)\n",
-        "dataset = dataset.apply(tf.contrib.data.batch_and_drop_remainder(BATCH_SIZE))"
-      ],
-      "execution_count": 0,
-      "outputs": []
+        "dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)"
+      ]
     },
     {
-      "metadata": {
-        "id": "TNfHIF71ulLu",
-        "colab_type": "text"
-      },
       "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "TNfHIF71ulLu"
+      },
       "source": [
         "## Write the encoder and decoder model\n",
         "\n",
         "Here, we'll implement an encoder-decoder model with attention which you can read about in the TensorFlow [Neural Machine Translation (seq2seq) tutorial](https://www.tensorflow.org/tutorials/seq2seq). This example uses a more recent set of APIs. This notebook implements the [attention equations](https://www.tensorflow.org/tutorials/seq2seq#background_on_the_attention_mechanism) from the seq2seq tutorial. The following diagram shows that each input words is assigned a weight by the attention mechanism which is then used by the decoder to predict the next word in the sentence.\n",
         "\n",
-        "<img src=\"https://www.tensorflow.org/images/seq2seq/attention_mechanism.jpg\" width=\"500\" alt=\"attention mechanism\">\n",
+        "\u003cimg src=\"https://www.tensorflow.org/images/seq2seq/attention_mechanism.jpg\" width=\"500\" alt=\"attention mechanism\"\u003e\n",
         "\n",
         "The input is put through an encoder model which gives us the encoder output of shape *(batch_size, max_length, hidden_size)* and the encoder hidden state of shape *(batch_size, hidden_size)*. \n",
         "\n",
         "Here are the equations that are implemented:\n",
         "\n",
-        "<img src=\"https://www.tensorflow.org/images/seq2seq/attention_equation_0.jpg\" alt=\"attention equation 0\" width=\"800\">\n",
-        "<img src=\"https://www.tensorflow.org/images/seq2seq/attention_equation_1.jpg\" alt=\"attention equation 1\" width=\"800\">\n",
+        "\u003cimg src=\"https://www.tensorflow.org/images/seq2seq/attention_equation_0.jpg\" alt=\"attention equation 0\" width=\"800\"\u003e\n",
+        "\u003cimg src=\"https://www.tensorflow.org/images/seq2seq/attention_equation_1.jpg\" alt=\"attention equation 1\" width=\"800\"\u003e\n",
         "\n",
         "We're using *Bahdanau attention*. Lets decide on notation before writing the simplified form:\n",
         "\n",
@@ -435,17 +362,14 @@
       ]
     },
     {
-      "metadata": {
-        "id": "avyJ_4VIUoHb",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
       "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "avyJ_4VIUoHb"
+      },
+      "outputs": [],
       "source": [
         "def gru(units):\n",
         "  # If you have a GPU, we recommend using CuDNNGRU(provides a 3x speedup than GRU)\n",
@@ -461,22 +385,17 @@
         "                               return_state=True, \n",
         "                               recurrent_activation='sigmoid', \n",
         "                               recurrent_initializer='glorot_uniform')"
-      ],
-      "execution_count": 0,
-      "outputs": []
+      ]
     },
     {
-      "metadata": {
-        "id": "nZ2rI24i3jFg",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
       "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "nZ2rI24i3jFg"
+      },
+      "outputs": [],
       "source": [
         "class Encoder(tf.keras.Model):\n",
         "    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):\n",
@@ -493,22 +412,17 @@
         "    \n",
         "    def initialize_hidden_state(self):\n",
         "        return tf.zeros((self.batch_sz, self.enc_units))"
-      ],
-      "execution_count": 0,
-      "outputs": []
+      ]
     },
     {
-      "metadata": {
-        "id": "yJ_B3mhW3jFk",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
       "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "yJ_B3mhW3jFk"
+      },
+      "outputs": [],
       "source": [
         "class Decoder(tf.keras.Model):\n",
         "    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):\n",
@@ -562,51 +476,41 @@
         "        \n",
         "    def initialize_hidden_state(self):\n",
         "        return tf.zeros((self.batch_sz, self.dec_units))"
-      ],
-      "execution_count": 0,
-      "outputs": []
+      ]
     },
     {
-      "metadata": {
-        "id": "P5UY8wko3jFp",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
       "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "P5UY8wko3jFp"
+      },
+      "outputs": [],
       "source": [
         "encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)\n",
         "decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)"
-      ],
-      "execution_count": 0,
-      "outputs": []
+      ]
     },
     {
-      "metadata": {
-        "id": "_ch_71VbIRfK",
-        "colab_type": "text"
-      },
       "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "_ch_71VbIRfK"
+      },
       "source": [
         "## Define the optimizer and the loss function"
       ]
     },
     {
-      "metadata": {
-        "id": "WmTHr5iV3jFr",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
       "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "WmTHr5iV3jFr"
+      },
+      "outputs": [],
       "source": [
         "optimizer = tf.train.AdamOptimizer()\n",
         "\n",
@@ -615,16 +519,41 @@
         "  mask = 1 - np.equal(real, 0)\n",
         "  loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=real, logits=pred) * mask\n",
         "  return tf.reduce_mean(loss_)"
-      ],
-      "execution_count": 0,
-      "outputs": []
+      ]
     },
     {
-      "metadata": {
-        "id": "hpObfY22IddU",
-        "colab_type": "text"
-      },
       "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "DMVWzzsfNl4e"
+      },
+      "source": [
+        "## Checkpoints (Object-based saving)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "Zj8bXQTgNwrF"
+      },
+      "outputs": [],
+      "source": [
+        "checkpoint_dir = './training_checkpoints'\n",
+        "checkpoint_prefix = os.path.join(checkpoint_dir, \"ckpt\")\n",
+        "checkpoint = tf.train.Checkpoint(optimizer=optimizer,\n",
+        "                                 encoder=encoder,\n",
+        "                                 decoder=decoder)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "hpObfY22IddU"
+      },
       "source": [
         "## Training\n",
         "\n",
@@ -638,17 +567,14 @@
       ]
     },
     {
-      "metadata": {
-        "id": "ddefjBMa3jF0",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
       "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "ddefjBMa3jF0"
+      },
+      "outputs": [],
       "source": [
         "EPOCHS = 10\n",
         "\n",
@@ -666,7 +592,7 @@
         "            \n",
         "            dec_hidden = enc_hidden\n",
         "            \n",
-        "            dec_input = tf.expand_dims([targ_lang.word2idx['<start>']] * BATCH_SIZE, 1)       \n",
+        "            dec_input = tf.expand_dims([targ_lang.word2idx['\u003cstart\u003e']] * BATCH_SIZE, 1)       \n",
         "            \n",
         "            # Teacher forcing - feeding the target as the next input\n",
         "            for t in range(1, targ.shape[1]):\n",
@@ -686,26 +612,27 @@
         "        \n",
         "        gradients = tape.gradient(loss, variables)\n",
         "        \n",
-        "        optimizer.apply_gradients(zip(gradients, variables), tf.train.get_or_create_global_step())\n",
+        "        optimizer.apply_gradients(zip(gradients, variables))\n",
         "        \n",
         "        if batch % 100 == 0:\n",
         "            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,\n",
         "                                                         batch,\n",
         "                                                         batch_loss.numpy()))\n",
+        "    # saving (checkpoint) the model every 2 epochs\n",
+        "    if (epoch + 1) % 2 == 0:\n",
+        "      checkpoint.save(file_prefix = checkpoint_prefix)\n",
         "    \n",
         "    print('Epoch {} Loss {:.4f}'.format(epoch + 1,\n",
         "                                        total_loss / N_BATCH))\n",
         "    print('Time taken for 1 epoch {} sec\\n'.format(time.time() - start))"
-      ],
-      "execution_count": 0,
-      "outputs": []
+      ]
     },
     {
-      "metadata": {
-        "id": "mU3Ce8M6I3rz",
-        "colab_type": "text"
-      },
       "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "mU3Ce8M6I3rz"
+      },
       "source": [
         "## Translate\n",
         "\n",
@@ -717,17 +644,14 @@
       ]
     },
     {
-      "metadata": {
-        "id": "EbQpyYs13jF_",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
       "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "EbQpyYs13jF_"
+      },
+      "outputs": [],
       "source": [
         "def evaluate(sentence, encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ):\n",
         "    attention_plot = np.zeros((max_length_targ, max_length_inp))\n",
@@ -744,7 +668,7 @@
         "    enc_out, enc_hidden = encoder(inputs, hidden)\n",
         "\n",
         "    dec_hidden = enc_hidden\n",
-        "    dec_input = tf.expand_dims([targ_lang.word2idx['<start>']], 0)\n",
+        "    dec_input = tf.expand_dims([targ_lang.word2idx['\u003cstart\u003e']], 0)\n",
         "\n",
         "    for t in range(max_length_targ):\n",
         "        predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)\n",
@@ -757,29 +681,24 @@
         "\n",
         "        result += targ_lang.idx2word[predicted_id] + ' '\n",
         "\n",
-        "        if targ_lang.idx2word[predicted_id] == '<end>':\n",
+        "        if targ_lang.idx2word[predicted_id] == '\u003cend\u003e':\n",
         "            return result, sentence, attention_plot\n",
         "        \n",
         "        # the predicted ID is fed back into the model\n",
         "        dec_input = tf.expand_dims([predicted_id], 0)\n",
         "\n",
         "    return result, sentence, attention_plot"
-      ],
-      "execution_count": 0,
-      "outputs": []
+      ]
     },
     {
-      "metadata": {
-        "id": "s5hQWlbN3jGF",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
       "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "s5hQWlbN3jGF"
+      },
+      "outputs": [],
       "source": [
         "# function for plotting the attention weights\n",
         "def plot_attention(attention, sentence, predicted_sentence):\n",
@@ -793,22 +712,17 @@
         "    ax.set_yticklabels([''] + predicted_sentence, fontdict=fontdict)\n",
         "\n",
         "    plt.show()"
-      ],
-      "execution_count": 0,
-      "outputs": []
+      ]
     },
     {
-      "metadata": {
-        "id": "sl9zUHzg3jGI",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
       "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "sl9zUHzg3jGI"
+      },
+      "outputs": [],
       "source": [
         "def translate(sentence, encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ):\n",
         "    result, sentence, attention_plot = evaluate(sentence, encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)\n",
@@ -818,89 +732,91 @@
         "    \n",
         "    attention_plot = attention_plot[:len(result.split(' ')), :len(sentence.split(' '))]\n",
         "    plot_attention(attention_plot, sentence.split(' '), result.split(' '))"
-      ],
-      "execution_count": 0,
-      "outputs": []
+      ]
     },
     {
+      "cell_type": "markdown",
       "metadata": {
-        "id": "WrAM0FDomq3E",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
+        "colab_type": "text",
+        "id": "n250XbnjOaqP"
       },
+      "source": [
+        "## Restore the latest checkpoint and test"
+      ]
+    },
+    {
       "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "UJpT9D5_OgP6"
+      },
+      "outputs": [],
+      "source": [
+        "# restoring the latest checkpoint in checkpoint_dir\n",
+        "checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "WrAM0FDomq3E"
+      },
+      "outputs": [],
       "source": [
         "translate('hace mucho frio aqui.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)"
-      ],
-      "execution_count": 0,
-      "outputs": []
+      ]
     },
     {
-      "metadata": {
-        "id": "zSx2iM36EZQZ",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
       "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "zSx2iM36EZQZ"
+      },
+      "outputs": [],
       "source": [
         "translate('esta es mi vida.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)"
-      ],
-      "execution_count": 0,
-      "outputs": []
+      ]
     },
     {
-      "metadata": {
-        "id": "A3LLCx3ZE0Ls",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
       "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "A3LLCx3ZE0Ls"
+      },
+      "outputs": [],
       "source": [
         "translate('¿todavia estan en casa?', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)"
-      ],
-      "execution_count": 0,
-      "outputs": []
+      ]
     },
     {
-      "metadata": {
-        "id": "DUQVLVqUE1YW",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
       "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "DUQVLVqUE1YW"
+      },
+      "outputs": [],
       "source": [
         "# wrong translation\n",
         "translate('trata de averiguarlo.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)"
-      ],
-      "execution_count": 0,
-      "outputs": []
+      ]
     },
     {
-      "metadata": {
-        "id": "RTe5P5ioMJwN",
-        "colab_type": "text"
-      },
       "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "RTe5P5ioMJwN"
+      },
       "source": [
         "## Next steps\n",
         "\n",
@@ -908,5 +824,31 @@
         "* Experiment with training on a larger dataset, or using more epochs\n"
       ]
     }
-  ]
+  ],
+  "metadata": {
+    "accelerator": "GPU",
+    "colab": {
+      "collapsed_sections": [],
+      "name": "nmt_with_attention.ipynb",
+      "private_outputs": true,
+      "provenance": [
+        {
+          "file_id": "1C4fpM7_7IL8ZzF7Gc5abywqQjeQNS2-U",
+          "timestamp": 1527858391290
+        },
+        {
+          "file_id": "1pExo6aUuw0S6MISFWoinfJv0Ftm9V4qv",
+          "timestamp": 1527776041613
+        }
+      ],
+      "toc_visible": true,
+      "version": "0.3.2"
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
 }
diff --git a/tensorflow/contrib/eager/python/examples/pix2pix/pix2pix_eager.ipynb b/tensorflow/contrib/eager/python/examples/pix2pix/pix2pix_eager.ipynb
new file mode 100644
index 0000000..ee25d25
--- /dev/null
+++ b/tensorflow/contrib/eager/python/examples/pix2pix/pix2pix_eager.ipynb
@@ -0,0 +1,810 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "0TD5ZrvEMbhZ"
+      },
+      "source": [
+        "##### Copyright 2018 The TensorFlow Authors.\n",
+        "\n",
+        "Licensed under the Apache License, Version 2.0 (the \"License\").\n",
+        "\n",
+        "# Pix2Pix: An example with tf.keras and eager\n",
+        "\n",
+        "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\u003ctd\u003e\n",
+        "\u003ca target=\"_blank\"  href=\"https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/pix2pix/pix2pix_eager.ipynb\"\u003e\n",
+        "    \u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e  \n",
+        "\u003c/td\u003e\u003ctd\u003e\n",
+        "\u003ca target=\"_blank\"  href=\"https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples/pix2pix/pix2pix_eager.ipynb\"\u003e\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\u003c/table\u003e"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "ITZuApL56Mny"
+      },
+      "source": [
+        "This notebook demonstrates image-to-image translation using conditional GANs, as described in [Image-to-Image Translation with Conditional Adversarial Networks](https://arxiv.org/abs/1611.07004). Using this technique we can colorize black and white photos, convert Google Maps to Google Earth, etc. Here, we convert building facades to real buildings. We use [tf.keras](https://www.tensorflow.org/programmers_guide/keras) and [eager execution](https://www.tensorflow.org/programmers_guide/eager) to achieve this.\n",
+        "\n",
+        "In this example, we will use the [CMP Facade Database](http://cmp.felk.cvut.cz/~tylecr1/facade/), helpfully provided by the [Center for Machine Perception](http://cmp.felk.cvut.cz/) at the [Czech Technical University in Prague](https://www.cvut.cz/). To keep our example short, we will use a preprocessed [copy](https://people.eecs.berkeley.edu/~tinghuiz/projects/pix2pix/datasets/) of this dataset, created by the authors of the [paper](https://arxiv.org/abs/1611.07004) above.\n",
+        "\n",
+        "Each epoch takes around 58 seconds on a single P100 GPU.\n",
+        "\n",
+        "Below is the output generated after training the model for 200 epochs.\n",
+        "\n",
+        "\n",
+        "![sample output_1](https://www.tensorflow.org/images/gan/pix2pix_1.png)\n",
+        "![sample output_2](https://www.tensorflow.org/images/gan/pix2pix_2.png)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "e1_Y75QXJS6h"
+      },
+      "source": [
+        "## Import TensorFlow and enable eager execution"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "YfIk2es3hJEd"
+      },
+      "outputs": [],
+      "source": [
+        "# Import TensorFlow \u003e= 1.10 and enable eager execution\n",
+        "import tensorflow as tf\n",
+        "tf.enable_eager_execution()\n",
+        "\n",
+        "import os\n",
+        "import time\n",
+        "import numpy as np\n",
+        "import matplotlib.pyplot as plt\n",
+        "import PIL\n",
+        "from IPython.display import clear_output"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "iYn4MdZnKCey"
+      },
+      "source": [
+        "## Load the dataset\n",
+        "\n",
+        "You can download this dataset and similar datasets from [here](https://people.eecs.berkeley.edu/~tinghuiz/projects/pix2pix/datasets). As mentioned in the [paper](https://arxiv.org/abs/1611.07004), we apply random jittering and mirroring to the training dataset.\n",
+        "* In random jittering, the image is resized to `286 x 286` and then randomly cropped to `256 x 256`\n",
+        "* In random mirroring, the image is randomly flipped horizontally, i.e., left to right."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "Kn-k8kTXuAlv"
+      },
+      "outputs": [],
+      "source": [
+        "path_to_zip = tf.keras.utils.get_file('facades.tar.gz',\n",
+        "                                      cache_subdir=os.path.abspath('.'),\n",
+        "                                      origin='https://people.eecs.berkeley.edu/~tinghuiz/projects/pix2pix/datasets/facades.tar.gz', \n",
+        "                                      extract=True)\n",
+        "\n",
+        "PATH = os.path.join(os.path.dirname(path_to_zip), 'facades/')"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "2CbTEt448b4R"
+      },
+      "outputs": [],
+      "source": [
+        "BUFFER_SIZE = 400\n",
+        "BATCH_SIZE = 1\n",
+        "IMG_WIDTH = 256\n",
+        "IMG_HEIGHT = 256"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "tyaP4hLJ8b4W"
+      },
+      "outputs": [],
+      "source": [
+        "def load_image(image_file, is_train):\n",
+        "  image = tf.read_file(image_file)\n",
+        "  image = tf.image.decode_jpeg(image)\n",
+        "\n",
+        "  w = tf.shape(image)[1]\n",
+        "\n",
+        "  w = w // 2\n",
+        "  real_image = image[:, :w, :]\n",
+        "  input_image = image[:, w:, :]\n",
+        "\n",
+        "  input_image = tf.cast(input_image, tf.float32)\n",
+        "  real_image = tf.cast(real_image, tf.float32)\n",
+        "\n",
+        "  if is_train:\n",
+        "    # random jittering\n",
+        "    \n",
+        "    # resizing to 286 x 286 x 3\n",
+        "    # method = 2 indicates using \"ResizeMethod.NEAREST_NEIGHBOR\"\n",
+        "    input_image = tf.image.resize_images(input_image, [286, 286], \n",
+        "                                         align_corners=True, method=2)\n",
+        "    real_image = tf.image.resize_images(real_image, [286, 286], \n",
+        "                                        align_corners=True, method=2)\n",
+        "    \n",
+        "    # randomly cropping to 256 x 256 x 3\n",
+        "    stacked_image = tf.stack([input_image, real_image], axis=0)\n",
+        "    cropped_image = tf.random_crop(stacked_image, size=[2, IMG_HEIGHT, IMG_WIDTH, 3])\n",
+        "    input_image, real_image = cropped_image[0], cropped_image[1]\n",
+        "\n",
+        "    if np.random.random() \u003e 0.5:\n",
+        "      # random mirroring\n",
+        "      input_image = tf.image.flip_left_right(input_image)\n",
+        "      real_image = tf.image.flip_left_right(real_image)\n",
+        "  else:\n",
+        "    input_image = tf.image.resize_images(input_image, size=[IMG_HEIGHT, IMG_WIDTH], \n",
+        "                                         align_corners=True, method=2)\n",
+        "    real_image = tf.image.resize_images(real_image, size=[IMG_HEIGHT, IMG_WIDTH], \n",
+        "                                        align_corners=True, method=2)\n",
+        "  \n",
+        "  # normalizing the images to [-1, 1]\n",
+        "  input_image = (input_image / 127.5) - 1\n",
+        "  real_image = (real_image / 127.5) - 1\n",
+        "\n",
+        "  return input_image, real_image"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "PIGN6ouoQxt3"
+      },
+      "source": [
+        "## Use tf.data to create batches, map (to apply preprocessing), and shuffle the dataset"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "SQHmYSmk8b4b"
+      },
+      "outputs": [],
+      "source": [
+        "train_dataset = tf.data.Dataset.list_files(PATH+'train/*.jpg')\n",
+        "train_dataset = train_dataset.shuffle(BUFFER_SIZE)\n",
+        "train_dataset = train_dataset.map(lambda x: load_image(x, True))\n",
+        "train_dataset = train_dataset.batch(1)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "MS9J0yA58b4g"
+      },
+      "outputs": [],
+      "source": [
+        "test_dataset = tf.data.Dataset.list_files(PATH+'test/*.jpg')\n",
+        "test_dataset = test_dataset.map(lambda x: load_image(x, False))\n",
+        "test_dataset = test_dataset.batch(1)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "THY-sZMiQ4UV"
+      },
+      "source": [
+        "## Write the generator and discriminator models\n",
+        "\n",
+        "* **Generator** \n",
+        "  * The architecture of generator is a modified U-Net.\n",
+        "  * Each block in the encoder is (Conv -\u003e Batchnorm -\u003e Leaky ReLU)\n",
+        "  * Each block in the decoder is (Transposed Conv -\u003e Batchnorm -\u003e Dropout(applied to the first 3 blocks) -\u003e ReLU)\n",
+        "  * There are skip connections between the encoder and decoder (as in U-Net).\n",
+        "  \n",
+        "* **Discriminator**\n",
+        "  * The Discriminator is a PatchGAN.\n",
+        "  * Each block in the discriminator is (Conv -\u003e BatchNorm -\u003e Leaky ReLU)\n",
+        "  * The shape of the output after the last layer is (batch_size, 30, 30, 1)\n",
+        "  * Each 30x30 patch of the output classifies a 70x70 portion of the input image (such an architecture is called a PatchGAN).\n",
+        "  * Discriminator receives 2 inputs.\n",
+        "    * Input image and the target image, which it should classify as real.\n",
+        "    * Input image and the generated image (output of generator), which it should classify as fake. \n",
+        "    * We concatenate these 2 inputs together in the code (`tf.concat([inp, tar], axis=-1)`)\n",
+        "\n",
+        "* Shape of the input travelling through the generator and the discriminator is in the comments in the code.\n",
+        "\n",
+        "To learn more about the architecture and the hyperparameters you can refer to the [paper](https://arxiv.org/abs/1611.07004).\n",
+        "    "
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "tqqvWxlw8b4l"
+      },
+      "outputs": [],
+      "source": [
+        "OUTPUT_CHANNELS = 3"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "lFPI4Nu-8b4q"
+      },
+      "outputs": [],
+      "source": [
+        "class Downsample(tf.keras.Model):\n",
+        "    \n",
+        "  def __init__(self, filters, size, apply_batchnorm=True):\n",
+        "    super(Downsample, self).__init__()\n",
+        "    self.apply_batchnorm = apply_batchnorm\n",
+        "    initializer = tf.random_normal_initializer(0., 0.02)\n",
+        "\n",
+        "    self.conv1 = tf.keras.layers.Conv2D(filters, \n",
+        "                                        (size, size), \n",
+        "                                        strides=2, \n",
+        "                                        padding='same',\n",
+        "                                        kernel_initializer=initializer,\n",
+        "                                        use_bias=False)\n",
+        "    if self.apply_batchnorm:\n",
+        "        self.batchnorm = tf.keras.layers.BatchNormalization()\n",
+        "  \n",
+        "  def call(self, x, training):\n",
+        "    x = self.conv1(x)\n",
+        "    if self.apply_batchnorm:\n",
+        "        x = self.batchnorm(x, training=training)\n",
+        "    x = tf.nn.leaky_relu(x)\n",
+        "    return x \n",
+        "\n",
+        "\n",
+        "class Upsample(tf.keras.Model):\n",
+        "    \n",
+        "  def __init__(self, filters, size, apply_dropout=False):\n",
+        "    super(Upsample, self).__init__()\n",
+        "    self.apply_dropout = apply_dropout\n",
+        "    initializer = tf.random_normal_initializer(0., 0.02)\n",
+        "\n",
+        "    self.up_conv = tf.keras.layers.Conv2DTranspose(filters, \n",
+        "                                                   (size, size), \n",
+        "                                                   strides=2, \n",
+        "                                                   padding='same',\n",
+        "                                                   kernel_initializer=initializer,\n",
+        "                                                   use_bias=False)\n",
+        "    self.batchnorm = tf.keras.layers.BatchNormalization()\n",
+        "    if self.apply_dropout:\n",
+        "        self.dropout = tf.keras.layers.Dropout(0.5)\n",
+        "\n",
+        "  def call(self, x1, x2, training):\n",
+        "    x = self.up_conv(x1)\n",
+        "    x = self.batchnorm(x, training=training)\n",
+        "    if self.apply_dropout:\n",
+        "        x = self.dropout(x, training=training)\n",
+        "    x = tf.nn.relu(x)\n",
+        "    x = tf.concat([x, x2], axis=-1)\n",
+        "    return x\n",
+        "\n",
+        "\n",
+        "class Generator(tf.keras.Model):\n",
+        "    \n",
+        "  def __init__(self):\n",
+        "    super(Generator, self).__init__()\n",
+        "    initializer = tf.random_normal_initializer(0., 0.02)\n",
+        "    \n",
+        "    self.down1 = Downsample(64, 4, apply_batchnorm=False)\n",
+        "    self.down2 = Downsample(128, 4)\n",
+        "    self.down3 = Downsample(256, 4)\n",
+        "    self.down4 = Downsample(512, 4)\n",
+        "    self.down5 = Downsample(512, 4)\n",
+        "    self.down6 = Downsample(512, 4)\n",
+        "    self.down7 = Downsample(512, 4)\n",
+        "    self.down8 = Downsample(512, 4)\n",
+        "\n",
+        "    self.up1 = Upsample(512, 4, apply_dropout=True)\n",
+        "    self.up2 = Upsample(512, 4, apply_dropout=True)\n",
+        "    self.up3 = Upsample(512, 4, apply_dropout=True)\n",
+        "    self.up4 = Upsample(512, 4)\n",
+        "    self.up5 = Upsample(256, 4)\n",
+        "    self.up6 = Upsample(128, 4)\n",
+        "    self.up7 = Upsample(64, 4)\n",
+        "\n",
+        "    self.last = tf.keras.layers.Conv2DTranspose(OUTPUT_CHANNELS, \n",
+        "                                                (4, 4), \n",
+        "                                                strides=2, \n",
+        "                                                padding='same',\n",
+        "                                                kernel_initializer=initializer)\n",
+        "  \n",
+        "  @tf.contrib.eager.defun\n",
+        "  def call(self, x, training):\n",
+        "    # x shape == (bs, 256, 256, 3)    \n",
+        "    x1 = self.down1(x, training=training) # (bs, 128, 128, 64)\n",
+        "    x2 = self.down2(x1, training=training) # (bs, 64, 64, 128)\n",
+        "    x3 = self.down3(x2, training=training) # (bs, 32, 32, 256)\n",
+        "    x4 = self.down4(x3, training=training) # (bs, 16, 16, 512)\n",
+        "    x5 = self.down5(x4, training=training) # (bs, 8, 8, 512)\n",
+        "    x6 = self.down6(x5, training=training) # (bs, 4, 4, 512)\n",
+        "    x7 = self.down7(x6, training=training) # (bs, 2, 2, 512)\n",
+        "    x8 = self.down8(x7, training=training) # (bs, 1, 1, 512)\n",
+        "\n",
+        "    x9 = self.up1(x8, x7, training=training) # (bs, 2, 2, 1024)\n",
+        "    x10 = self.up2(x9, x6, training=training) # (bs, 4, 4, 1024)\n",
+        "    x11 = self.up3(x10, x5, training=training) # (bs, 8, 8, 1024)\n",
+        "    x12 = self.up4(x11, x4, training=training) # (bs, 16, 16, 1024)\n",
+        "    x13 = self.up5(x12, x3, training=training) # (bs, 32, 32, 512)\n",
+        "    x14 = self.up6(x13, x2, training=training) # (bs, 64, 64, 256)\n",
+        "    x15 = self.up7(x14, x1, training=training) # (bs, 128, 128, 128)\n",
+        "\n",
+        "    x16 = self.last(x15) # (bs, 256, 256, 3)\n",
+        "    x16 = tf.nn.tanh(x16)\n",
+        "\n",
+        "    return x16"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "ll6aNeQx8b4v"
+      },
+      "outputs": [],
+      "source": [
+        "class DiscDownsample(tf.keras.Model):\n",
+        "    \n",
+        "  def __init__(self, filters, size, apply_batchnorm=True):\n",
+        "    super(DiscDownsample, self).__init__()\n",
+        "    self.apply_batchnorm = apply_batchnorm\n",
+        "    initializer = tf.random_normal_initializer(0., 0.02)\n",
+        "\n",
+        "    self.conv1 = tf.keras.layers.Conv2D(filters, \n",
+        "                                        (size, size), \n",
+        "                                        strides=2, \n",
+        "                                        padding='same',\n",
+        "                                        kernel_initializer=initializer,\n",
+        "                                        use_bias=False)\n",
+        "    if self.apply_batchnorm:\n",
+        "        self.batchnorm = tf.keras.layers.BatchNormalization()\n",
+        "  \n",
+        "  def call(self, x, training):\n",
+        "    x = self.conv1(x)\n",
+        "    if self.apply_batchnorm:\n",
+        "        x = self.batchnorm(x, training=training)\n",
+        "    x = tf.nn.leaky_relu(x)\n",
+        "    return x \n",
+        "\n",
+        "class Discriminator(tf.keras.Model):\n",
+        "    \n",
+        "  def __init__(self):\n",
+        "    super(Discriminator, self).__init__()\n",
+        "    initializer = tf.random_normal_initializer(0., 0.02)\n",
+        "    \n",
+        "    self.down1 = DiscDownsample(64, 4, False)\n",
+        "    self.down2 = DiscDownsample(128, 4)\n",
+        "    self.down3 = DiscDownsample(256, 4)\n",
+        "    \n",
+        "    # we are zero padding here with 1 because we need our shape to \n",
+        "    # go from (batch_size, 32, 32, 256) to (batch_size, 31, 31, 512)\n",
+        "    self.zero_pad1 = tf.keras.layers.ZeroPadding2D()\n",
+        "    self.conv = tf.keras.layers.Conv2D(512, \n",
+        "                                       (4, 4), \n",
+        "                                       strides=1, \n",
+        "                                       kernel_initializer=initializer, \n",
+        "                                       use_bias=False)\n",
+        "    self.batchnorm1 = tf.keras.layers.BatchNormalization()\n",
+        "    \n",
+        "    # shape change from (batch_size, 31, 31, 512) to (batch_size, 30, 30, 1)\n",
+        "    self.zero_pad2 = tf.keras.layers.ZeroPadding2D()\n",
+        "    self.last = tf.keras.layers.Conv2D(1, \n",
+        "                                       (4, 4), \n",
+        "                                       strides=1,\n",
+        "                                       kernel_initializer=initializer)\n",
+        "  \n",
+        "  @tf.contrib.eager.defun\n",
+        "  def call(self, inp, tar, training):\n",
+        "    # concatenating the input and the target\n",
+        "    x = tf.concat([inp, tar], axis=-1) # (bs, 256, 256, channels*2)\n",
+        "    x = self.down1(x, training=training) # (bs, 128, 128, 64)\n",
+        "    x = self.down2(x, training=training) # (bs, 64, 64, 128)\n",
+        "    x = self.down3(x, training=training) # (bs, 32, 32, 256)\n",
+        "\n",
+        "    x = self.zero_pad1(x) # (bs, 34, 34, 256)\n",
+        "    x = self.conv(x)      # (bs, 31, 31, 512)\n",
+        "    x = self.batchnorm1(x, training=training)\n",
+        "    x = tf.nn.leaky_relu(x)\n",
+        "    \n",
+        "    x = self.zero_pad2(x) # (bs, 33, 33, 512)\n",
+        "    # don't add a sigmoid activation here since\n",
+        "    # the loss function expects raw logits.\n",
+        "    x = self.last(x)      # (bs, 30, 30, 1)\n",
+        "\n",
+        "    return x"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "gDkA05NE6QMs"
+      },
+      "outputs": [],
+      "source": [
+        "# The call function of Generator and Discriminator have been decorated\n",
+        "# with tf.contrib.eager.defun()\n",
+        "# We get a performance speedup if defun is used (~25 seconds per epoch)\n",
+        "generator = Generator()\n",
+        "discriminator = Discriminator()"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "0FMYgY_mPfTi"
+      },
+      "source": [
+        "## Define the loss functions and the optimizer\n",
+        "\n",
+        "* **Discriminator loss**\n",
+        "  * The discriminator loss function takes 2 inputs; **real images, generated images**\n",
+        "  * real_loss is a sigmoid cross entropy loss of the **real images** and an **array of ones (since these are the real images)**\n",
+        "  * generated_loss is a sigmoid cross entropy loss of the **generated images** and an **array of zeros (since these are the fake images)**\n",
+        "  * Then the total_loss is the sum of real_loss and the generated_loss\n",
+        "  \n",
+        "* **Generator loss**\n",
+        "  * It is a sigmoid cross entropy loss of the generated images and an **array of ones**.\n",
+        "  * The [paper](https://arxiv.org/abs/1611.07004) also includes L1 loss which is MAE (mean absolute error) between the generated image and the target image.\n",
+        "  * This allows the generated image to become structurally similar to the target image.\n",
+        "  * The formula to calculate the total generator loss = gan_loss + LAMBDA * l1_loss, where LAMBDA = 100. This value was decided by the authors of the [paper](https://arxiv.org/abs/1611.07004)."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "cyhxTuvJyIHV"
+      },
+      "outputs": [],
+      "source": [
+        "LAMBDA = 100"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "wkMNfBWlT-PV"
+      },
+      "outputs": [],
+      "source": [
+        "def discriminator_loss(disc_real_output, disc_generated_output):\n",
+        "  real_loss = tf.losses.sigmoid_cross_entropy(multi_class_labels = tf.ones_like(disc_real_output), \n",
+        "                                              logits = disc_real_output)\n",
+        "  generated_loss = tf.losses.sigmoid_cross_entropy(multi_class_labels = tf.zeros_like(disc_generated_output), \n",
+        "                                                   logits = disc_generated_output)\n",
+        "\n",
+        "  total_disc_loss = real_loss + generated_loss\n",
+        "\n",
+        "  return total_disc_loss"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "90BIcCKcDMxz"
+      },
+      "outputs": [],
+      "source": [
+        "def generator_loss(disc_generated_output, gen_output, target):\n",
+        "  gan_loss = tf.losses.sigmoid_cross_entropy(multi_class_labels = tf.ones_like(disc_generated_output),\n",
+        "                                             logits = disc_generated_output) \n",
+        "  # mean absolute error\n",
+        "  l1_loss = tf.reduce_mean(tf.abs(target - gen_output))\n",
+        "\n",
+        "  total_gen_loss = gan_loss + (LAMBDA * l1_loss)\n",
+        "\n",
+        "  return total_gen_loss"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "iWCn_PVdEJZ7"
+      },
+      "outputs": [],
+      "source": [
+        "generator_optimizer = tf.train.AdamOptimizer(2e-4, beta1=0.5)\n",
+        "discriminator_optimizer = tf.train.AdamOptimizer(2e-4, beta1=0.5)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "aKUZnDiqQrAh"
+      },
+      "source": [
+        "## Checkpoints (Object-based saving)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "WJnftd5sQsv6"
+      },
+      "outputs": [],
+      "source": [
+        "checkpoint_dir = './training_checkpoints'\n",
+        "checkpoint_prefix = os.path.join(checkpoint_dir, \"ckpt\")\n",
+        "checkpoint = tf.train.Checkpoint(generator_optimizer=generator_optimizer,\n",
+        "                                 discriminator_optimizer=discriminator_optimizer,\n",
+        "                                 generator=generator,\n",
+        "                                 discriminator=discriminator)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "Rw1fkAczTQYh"
+      },
+      "source": [
+        "## Training\n",
+        "\n",
+        "* We start by iterating over the dataset\n",
+        "* The generator gets the input image and we get a generated output.\n",
+        "* The discriminator receives the input_image and the generated image as the first input. The second input is the input_image and the target_image.\n",
+        "* Next, we calculate the generator and the discriminator loss.\n",
+        "* Then, we calculate the gradients of loss with respect to both the generator and the discriminator variables (inputs) and apply those to the optimizer.\n",
+        "\n",
+        "## Generate Images\n",
+        "\n",
+        "* After training, it's time to generate some images!\n",
+        "* We pass images from the test dataset to the generator.\n",
+        "* The generator will then translate the input image into the output we expect.\n",
+        "* Last step is to plot the predictions and **voila!**"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "NS2GWywBbAWo"
+      },
+      "outputs": [],
+      "source": [
+        "EPOCHS = 200"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "RmdVsmvhPxyy"
+      },
+      "outputs": [],
+      "source": [
+        "def generate_images(model, test_input, tar):\n",
+        "  # the training=True is intentional here since\n",
+        "  # we want the batch statistics while running the model\n",
+        "  # on the test dataset. If we use training=False, we will get \n",
+        "  # the accumulated statistics learned from the training dataset\n",
+        "  # (which we don't want)\n",
+        "  prediction = model(test_input, training=True)\n",
+        "  plt.figure(figsize=(15,15))\n",
+        "\n",
+        "  display_list = [test_input[0], tar[0], prediction[0]]\n",
+        "  title = ['Input Image', 'Ground Truth', 'Predicted Image']\n",
+        "\n",
+        "  for i in range(3):\n",
+        "    plt.subplot(1, 3, i+1)\n",
+        "    plt.title(title[i])\n",
+        "    # getting the pixel values between [0, 1] to plot it.\n",
+        "    plt.imshow(display_list[i] * 0.5 + 0.5)\n",
+        "    plt.axis('off')\n",
+        "  plt.show()"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "2M7LmLtGEMQJ"
+      },
+      "outputs": [],
+      "source": [
+        "def train(dataset, epochs):  \n",
+        "  for epoch in range(epochs):\n",
+        "    start = time.time()\n",
+        "\n",
+        "    for input_image, target in dataset:\n",
+        "\n",
+        "      with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:\n",
+        "        gen_output = generator(input_image, training=True)\n",
+        "\n",
+        "        disc_real_output = discriminator(input_image, target, training=True)\n",
+        "        disc_generated_output = discriminator(input_image, gen_output, training=True)\n",
+        "\n",
+        "        gen_loss = generator_loss(disc_generated_output, gen_output, target)\n",
+        "        disc_loss = discriminator_loss(disc_real_output, disc_generated_output)\n",
+        "\n",
+        "      generator_gradients = gen_tape.gradient(gen_loss, \n",
+        "                                              generator.variables)\n",
+        "      discriminator_gradients = disc_tape.gradient(disc_loss, \n",
+        "                                                   discriminator.variables)\n",
+        "\n",
+        "      generator_optimizer.apply_gradients(zip(generator_gradients, \n",
+        "                                              generator.variables))\n",
+        "      discriminator_optimizer.apply_gradients(zip(discriminator_gradients, \n",
+        "                                                  discriminator.variables))\n",
+        "\n",
+        "    if epoch % 1 == 0:\n",
+        "        clear_output(wait=True)\n",
+        "        for inp, tar in test_dataset.take(1):\n",
+        "          generate_images(generator, inp, tar)\n",
+        "          \n",
+        "    # saving (checkpoint) the model every 20 epochs\n",
+        "    if (epoch + 1) % 20 == 0:\n",
+        "      checkpoint.save(file_prefix = checkpoint_prefix)\n",
+        "\n",
+        "    print ('Time taken for epoch {} is {} sec\\n'.format(epoch + 1,\n",
+        "                                                        time.time()-start))"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "a1zZmKmvOH85"
+      },
+      "outputs": [],
+      "source": [
+        "train(train_dataset, EPOCHS)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "kz80bY3aQ1VZ"
+      },
+      "source": [
+        "## Restore the latest checkpoint and test"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "4t4x69adQ5xb"
+      },
+      "outputs": [],
+      "source": [
+        "# restoring the latest checkpoint in checkpoint_dir\n",
+        "checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "1RGysMU_BZhx"
+      },
+      "source": [
+        "## Testing on the entire test dataset"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "KUgSnmy2nqSP"
+      },
+      "outputs": [],
+      "source": [
+        "# Run the trained model on the entire test dataset\n",
+        "for inp, tar in test_dataset:\n",
+        "  generate_images(generator, inp, tar)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "3AJXOByaZVOf"
+      },
+      "outputs": [],
+      "source": [
+        ""
+      ]
+    }
+  ],
+  "metadata": {
+    "accelerator": "GPU",
+    "colab": {
+      "collapsed_sections": [],
+      "name": "pix2pix_eager.ipynb",
+      "private_outputs": true,
+      "provenance": [
+        {
+          "file_id": "1eb0NOTQapkYs3X0v-zL1x5_LFKgDISnp",
+          "timestamp": 1527173385672
+        }
+      ],
+      "toc_visible": true,
+      "version": "0.3.2"
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
diff --git a/tensorflow/contrib/eager/python/examples/revnet/README.md b/tensorflow/contrib/eager/python/examples/revnet/README.md
index 2875d0f..822d86e 100644
--- a/tensorflow/contrib/eager/python/examples/revnet/README.md
+++ b/tensorflow/contrib/eager/python/examples/revnet/README.md
@@ -1,6 +1,6 @@
 # RevNet with TensorFlow eager execution
 
-This folder contains a TensorFlow eager implementation of the [Reversible Residual Network](https://arxiv.org/pdf/1707.04585.pdf) adapted from the released implementation by the authors. The presented implementation can be ran both in eager and graph mode. The code is considerably simplified with `tf.GradientTape`. Moreover, we reduce the step of reconstructing the outputs. This saves us from using `tf.stop_gradient` and makes the model run faster.
+This folder contains a TensorFlow eager implementation of the [Reversible Residual Network](https://arxiv.org/pdf/1707.04585.pdf) adapted from the released implementation by the authors. The presented implementation can be run with both eager and graph execution. The code is considerably simplified with `tf.GradientTape`. Moreover, we remove a redundant forward pass present in the implementation by the authors. This saves us from using `tf.stop_gradient` and makes the model run faster.
 
 ##  Content
 
@@ -16,7 +16,7 @@
 - `resnet_preprocessing.py`, `imagenet_input.py`: Boilerplate to read ImageNet data from TFRecords.
 
 ## Train on CIFAR-10/CIFAR-100
-- Make sure you have installed TensorFlow 1.9+ or the latest `tf-nightly`
+- Make sure you have installed TensorFlow 1.10+ or the latest `tf-nightly`
 or `tf-nightly-gpu` pip package in order to access the eager execution feature.
 
 - First run
@@ -41,11 +41,13 @@
   - `config`: RevNet configuration.
   - `use_defun`: Use `tfe.defun` to boost performance.
 
-- To train a model with estimators in graph-mode, run
+- To train a model with estimators in graph execution, run
 
 ```bash
 python main_estimator.py --data_dir ${PWD}/cifar
 ```
+To ensure our code works properly when using the Keras model in an estimator,
+`tf-nightly` or `tf-nightly-gpu` is highly recommended as of August 2018.
 
 - Optional arguments for `main.py` include
   - `model_dir`: Directory to store eventfiles and checkpoints.
@@ -54,13 +56,19 @@
   - `export`: Export the model for serving if True.
 
 ## Speed up with `tfe.defun`
-Even though the speed difference between pure eager execution and graph-mode execution is noticeable,
-the difference between fully "defunned" model training and graph-mode
+To ensure that `tf.contrib.eager.defun` in our code works properly with all
+part of the model during training, the latest `tf-nightly` or `tf-nightly-gpu`
+is highly recommended as of August 2018.
+
+Even though the speed difference between pure eager execution and graph execution is noticeable,
+the difference between fully "defunned" model training and graph
 training is negligible.
 
 ## Train on ImageNet with Cloud TPUs
-The standard way to train models on Cloud TPUs is via TPU estimators and graph-mode
+The standard way to train models on Cloud TPUs is via TPU estimators and graph
 execution. Models built with the `tf.keras` API are fully compatible with TPU estimators.
+To ensure our code works properly in this setting,
+`tf-nightly` or `tf-nightly-gpu` is highly recommended as of August 2018.
 
 ### Setup a Google Cloud project
 
@@ -96,7 +104,8 @@
 ```
 
 ## Performance
-- With the current implementation, RevNet-38 achieves >92% on CIFAR-10 and >71% on CIFAR-100.
+- RevNet-38 achieves >92% and >71% accuracy on CIFAR-10 and CIFAR-100 respectively.
+- RevNet-56 achieves <26% top-1 error rate on ImageNet.
 
 ## Reference
 The Reversible Residual Network: Backpropagation Without Storing Activations.
diff --git a/tensorflow/contrib/eager/python/examples/revnet/blocks_test.py b/tensorflow/contrib/eager/python/examples/revnet/blocks_test.py
index fda9020..9ff6b60 100644
--- a/tensorflow/contrib/eager/python/examples/revnet/blocks_test.py
+++ b/tensorflow/contrib/eager/python/examples/revnet/blocks_test.py
@@ -188,6 +188,40 @@
       self._check_grad_angle(dx_true, dx)
       self._check_grad_angle(dw_true, dw)
 
+  def test_backward_grads_with_nativepy(self):
+    if not tf.test.is_gpu_available():
+      self.skipTest("GPU not available")
+
+    input_shape = (128, 8, 8)
+    data_shape = (16,) + input_shape
+    x = tf.random_normal(shape=data_shape, dtype=tf.float64)
+    dy = tf.random_normal(shape=data_shape, dtype=tf.float64)
+    dy1, dy2 = tf.split(dy, num_or_size_splits=2, axis=1)
+    block = blocks.RevBlock(
+        n_res=3,
+        filters=128,
+        strides=(1, 1),
+        input_shape=input_shape,
+        fused=False,
+        dtype=tf.float64)
+    with tf.GradientTape() as tape:
+      tape.watch(x)
+      x1, x2 = tf.split(x, num_or_size_splits=2, axis=1)
+      y1, y2 = block((x1, x2), training=True)
+      y = tf.concat((y1, y2), axis=1)
+
+    # Compute true grads
+    dx_true = tape.gradient(y, x, output_gradients=dy)
+
+    # Compute grads from reconstruction
+    (dx1, dx2), _ = block.backward_grads(
+        x=(x1, x2), y=(y1, y2), dy=(dy1, dy2), training=True)
+    dx = tf.concat((dx1, dx2), axis=1)
+
+    thres = 1e-5
+    diff_abs = tf.reshape(abs(dx - dx_true), [-1])
+    assert all(diff_abs < thres)
+
 
 class _ResidualTest(tf.test.TestCase):
 
diff --git a/tensorflow/contrib/eager/python/remote_test.py b/tensorflow/contrib/eager/python/remote_test.py
new file mode 100644
index 0000000..76f48ee
--- /dev/null
+++ b/tensorflow/contrib/eager/python/remote_test.py
@@ -0,0 +1,178 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for remote eager execution."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+import os
+
+import numpy as np
+
+from tensorflow.core.protobuf import cluster_pb2
+from tensorflow.core.protobuf import tensorflow_server_pb2
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
+from tensorflow.python.eager import function
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.platform import test
+from tensorflow.python.training import server_lib
+
+JOB_NAME = "remote_device"
+ALT_JOB_NAME = "alt_remote_device"
+
+
+def run_sync_and_async(f):
+  """Execute all test methods in the given class in sync and async modes."""
+
+  @functools.wraps(f)
+  def decorator(self, *args, **kwargs):
+    with context.execution_mode(context.ASYNC):
+      f(self, *args, **kwargs)
+
+    with context.execution_mode(context.SYNC):
+      f(self, *args, **kwargs)
+
+  return decorator
+
+
+def get_server_def(job_name, local_server_port, remote_server_addresses,
+                   task_index):
+  """Returns a server def with a single job + multiple tasks."""
+  cluster_def = cluster_pb2.ClusterDef()
+  job_def = cluster_def.job.add()
+  job_def.name = job_name
+  job_def.tasks[0] = "localhost:%d" % local_server_port
+
+  for i, remote_server_address in enumerate(remote_server_addresses, start=1):
+    job_def.tasks[i] = remote_server_address
+
+  server_def = tensorflow_server_pb2.ServerDef(
+      cluster=cluster_def,
+      job_name=job_name,
+      task_index=task_index,
+      protocol="grpc")
+
+  return server_def
+
+
+class RemoteExecutionTest(test.TestCase):
+
+  def __init__(self, methodName="runTest"):  # pylint: disable=invalid-name
+    super(RemoteExecutionTest, self).__init__(methodName)
+    self._cached_server1 = server_lib.Server.create_local_server()
+    self._cached_server2 = server_lib.Server.create_local_server()
+
+    os.environ["TF_EAGER_REMOTE_USE_SEND_TENSOR_RPC"] = "1"
+
+    self._cached_server1_target = self._cached_server1.target[len("grpc://"):]
+    self._cached_server2_target = self._cached_server2.target[len("grpc://"):]
+
+    # Start the local server.
+    context.set_server_def(
+        server_def=get_server_def(
+            JOB_NAME,
+            local_server_port=0,
+            remote_server_addresses=[
+                self._cached_server1_target, self._cached_server2_target
+            ],
+            task_index=0))
+
+  @run_sync_and_async
+  def testDefunMatmul(self):
+    """Basic remote eager execution with defun."""
+
+    mm_defun = function.defun(math_ops.matmul)
+    with ops.device("job:%s/replica:0/task:1/device:CPU:0" % JOB_NAME):
+      x1 = array_ops.ones([2, 2])
+    with ops.device("job:%s/replica:0/task:2/device:CPU:0" % JOB_NAME):
+      x2 = array_ops.ones([2, 2])
+      y = mm_defun(x1, x2)
+    np.testing.assert_array_equal([[2, 2], [2, 2]], y.numpy())
+
+  @run_sync_and_async
+  def testSimpleMatmul(self):
+    """Basic remote eager execution."""
+
+    with ops.device("job:%s/replica:0/task:1/device:CPU:0" % JOB_NAME):
+      x1 = array_ops.ones([2, 2])
+    with ops.device("job:%s/replica:0/task:2/device:CPU:0" % JOB_NAME):
+      x2 = array_ops.ones([2, 2])
+      y = math_ops.matmul(x1, x2)
+    np.testing.assert_array_equal([[2, 2], [2, 2]], y.numpy())
+
+  @run_sync_and_async
+  def testSimpleWeightRead(self):
+    """Basic remote eager weight read."""
+
+    with ops.device("job:%s/replica:0/task:1/device:CPU:0" % JOB_NAME):
+      w = resource_variable_ops.ResourceVariable([[2.0]])
+      loss = w * w
+    np.testing.assert_array_equal([[4.0]], loss.numpy())
+
+  @run_sync_and_async
+  def testTapeWeightRead(self):
+    """Remote eager weight read in a tape."""
+
+    with ops.device("job:%s/replica:0/task:1/device:CPU:0" % JOB_NAME):
+      w = resource_variable_ops.ResourceVariable([[3.0]])
+      with backprop.GradientTape() as tape:
+        loss = w * w
+
+      grad = tape.gradient(loss, w)
+    np.testing.assert_array_equal([[9.0]], loss.numpy())
+    np.testing.assert_array_equal([[6.0]], grad.numpy())
+
+  @run_sync_and_async
+  def testServerDefChanged(self):
+    """Update server def, and run ops on new cluster."""
+    context.set_server_def(
+        server_def=get_server_def(
+            ALT_JOB_NAME,
+            local_server_port=0,
+            remote_server_addresses=[
+                self._cached_server1_target, self._cached_server2_target
+            ],
+            task_index=0))
+
+    with ops.device("job:%s/replica:0/task:1/device:CPU:0" % ALT_JOB_NAME):
+      x1 = array_ops.ones([2, 2])
+    y = math_ops.matmul(x1, x1)
+    np.testing.assert_array_equal([[2, 2], [2, 2]], y.numpy())
+
+    # Set the server def back to JOB_NAME
+    context.set_server_def(
+        server_def=get_server_def(
+            JOB_NAME,
+            local_server_port=0,
+            remote_server_addresses=[
+                self._cached_server1_target, self._cached_server2_target
+            ],
+            task_index=0))
+
+    with ops.device("job:%s/replica:0/task:1/device:CPU:0" % JOB_NAME):
+      x1 = array_ops.ones([2, 2])
+    y = math_ops.matmul(x1, x1)
+    np.testing.assert_array_equal([[2, 2], [2, 2]], y.numpy())
+
+
+if __name__ == "__main__":
+  ops.enable_eager_execution()
+  test.main()
diff --git a/tensorflow/contrib/eager/python/saver.py b/tensorflow/contrib/eager/python/saver.py
index d709308..f9c7163 100644
--- a/tensorflow/contrib/eager/python/saver.py
+++ b/tensorflow/contrib/eager/python/saver.py
@@ -161,7 +161,7 @@
     Args:
       file_prefix: Path prefix where parameters were previously saved.
         Typically obtained from a previous `save()` call, or from
-        @{tf.train.latest_checkpoint}.
+        `tf.train.latest_checkpoint`.
     """
     with ops.device("/device:CPU:0"):
       self._saver.restore(None, file_prefix)
diff --git a/tensorflow/contrib/eager/python/tfe.py b/tensorflow/contrib/eager/python/tfe.py
index de11d00..4dfd083 100644
--- a/tensorflow/contrib/eager/python/tfe.py
+++ b/tensorflow/contrib/eager/python/tfe.py
@@ -16,7 +16,7 @@
 
 EXPERIMENTAL: APIs here are unstable and likely to change without notice.
 
-To use, at program startup, call `tfe.enable_eager_execution()`.
+To use, at program startup, call `tf.enable_eager_execution()`.
 
 @@metrics
 
@@ -67,6 +67,7 @@
 @@execution_mode
 @@async_wait
 @@async_clear_error
+@@set_server_def
 
 @@run_test_in_graph_and_eager_modes
 @@run_all_tests_in_graph_and_eager_modes
@@ -110,6 +111,7 @@
 from tensorflow.python.eager.context import SYNC
 from tensorflow.python.eager.context import ASYNC
 from tensorflow.python.eager.context import num_gpus
+from tensorflow.python.eager.context import set_server_def
 from tensorflow.python.eager.execution_callbacks import add_execution_callback
 from tensorflow.python.eager.execution_callbacks import clear_execution_callbacks
 from tensorflow.python.eager.execution_callbacks import inf_callback
diff --git a/tensorflow/contrib/estimator/BUILD b/tensorflow/contrib/estimator/BUILD
index 349f48f..77f62df 100644
--- a/tensorflow/contrib/estimator/BUILD
+++ b/tensorflow/contrib/estimator/BUILD
@@ -20,6 +20,7 @@
         ":dnn_linear_combined",
         ":early_stopping",
         ":export",
+        ":exporter",
         ":extenders",
         ":head",
         ":hooks",
@@ -220,6 +221,33 @@
 )
 
 py_library(
+    name = "exporter",
+    srcs = [
+        "python/estimator/exporter.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:summary",
+        "//tensorflow/python/estimator:exporter",
+    ],
+)
+
+py_test(
+    name = "exporter_test",
+    size = "medium",
+    srcs = ["python/estimator/exporter_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":exporter",
+        "//tensorflow/python:platform",
+        "//tensorflow/python/estimator",
+        "//tensorflow/python/estimator:exporter",
+    ],
+)
+
+py_library(
     name = "head",
     srcs = [
         "python/estimator/head.py",
@@ -487,6 +515,9 @@
     size = "medium",
     srcs = ["python/estimator/saved_model_estimator_test.py"],
     srcs_version = "PY2AND3",
+    tags = [
+        "notsan",
+    ],
     deps = [
         ":export",
         ":saved_model_estimator",
diff --git a/tensorflow/contrib/estimator/__init__.py b/tensorflow/contrib/estimator/__init__.py
index e1453ae..258860f 100644
--- a/tensorflow/contrib/estimator/__init__.py
+++ b/tensorflow/contrib/estimator/__init__.py
@@ -45,6 +45,7 @@
     'clip_gradients_by_norm',
     'forward_features',
     'InMemoryEvaluatorHook',
+    'make_stop_at_checkpoint_step_hook',
     'logistic_regression_head',
     'multi_class_head',
     'multi_head',
diff --git a/tensorflow/contrib/estimator/python/estimator/dnn_linear_combined.py b/tensorflow/contrib/estimator/python/estimator/dnn_linear_combined.py
index 2eef60c..724bc2c 100644
--- a/tensorflow/contrib/estimator/python/estimator/dnn_linear_combined.py
+++ b/tensorflow/contrib/estimator/python/estimator/dnn_linear_combined.py
@@ -147,7 +147,7 @@
         if a categorical column is multivalent.  One of "mean", "sqrtn", and
         "sum" -- these are effectively different ways to do example-level
         normalization, which can be useful for bag-of-words features.  For more
-        details, see @{tf.feature_column.linear_model$linear_model}.
+        details, see `tf.feature_column.linear_model`.
 
     Raises:
       ValueError: If both linear_feature_columns and dnn_features_columns are
diff --git a/tensorflow/contrib/estimator/python/estimator/export.py b/tensorflow/contrib/estimator/python/estimator/export.py
index 03cf6f1..b0deb9b 100644
--- a/tensorflow/contrib/estimator/python/estimator/export.py
+++ b/tensorflow/contrib/estimator/python/estimator/export.py
@@ -31,8 +31,8 @@
   # pylint: disable=line-too-long
   """Exports a single train/eval/predict graph as a SavedModel.
 
-  For a detailed guide, see
-  @{$saved_model#using_savedmodel_with_estimators$Using SavedModel with Estimators}.
+  For a detailed guide, see [Using SavedModel with Estimators](
+  https://tensorflow.org/guide/saved_model#using_savedmodel_with_estimators).
 
   Sample usage:
   ```python
diff --git a/tensorflow/contrib/estimator/python/estimator/exporter.py b/tensorflow/contrib/estimator/python/estimator/exporter.py
new file mode 100644
index 0000000..09d7440
--- /dev/null
+++ b/tensorflow/contrib/estimator/python/estimator/exporter.py
@@ -0,0 +1,280 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Implements StepsExporter to export the model in user specified steps."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.python.estimator import exporter
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import tf_logging
+from tensorflow.python.summary import summary_iterator
+
+DEFAULT_GLOBAL_STEP_KEY = ops.GraphKeys.GLOBAL_STEP
+
+
+class StepsExporter(exporter.Exporter):
+  """This class exports the model in user specified steps.
+
+  This class exports the model at the steps given by the `steps_to_keep`
+  argument. Each number in the list is treated as a lower bound for model
+  exports, to handle the case when evaluation is performed at different steps.
+
+  Consider this example:
+
+  ```
+  steps_to_keep = [1, 2, 3, 6, 7, 10, 12, 25]
+  ```
+
+  The model is evaluated at step increments of 5: `[5, 10, 15, 20, 25, 30]`.
+  The `StepsExporter` will export the model when it has reached steps
+  `[5, 10, 15, 25]`.
+
+  This example illustrates the two cases when the model is exported:
+
+  1. Model is evaluated on a step defined in the list `steps_to_keep`.
+
+     In the example, the model is exported on step `10` and `25`.
+
+  2. Model is evaluated on a step not defined in the list `steps_to_keep`, but
+     is still exported because a step in `steps_to_keep` was missed.
+
+     In the example, when the model reaches step `5`, the model is exported even
+     though `steps_to_keep` does not contain `5`. Step `5` is exported to make
+     up for step `3`, which was missed. Steps `1` and `2` in `steps_to_keep` are
+     skipped completely (e.g. say the model is evaluated at step `6`. It will
+     **not** be exported to make up for step `2`).
+
+  Using the `steps_to_keep` list as a lower bound allows users to define
+  approximate step boundaries for exporting their models, and avoid frustrating
+  off-by-one calculation errors.
+
+  Sample Use Cases:
+    There are specific points during the training when having a saved version of
+    the model would be useful. One example is at the end of each training phase
+    when the set of frozen weights is changed.
+    Another good use case is saving the model at the end of each epoch for
+    visualization or retraining.
+  """
+
+  def __init__(self,
+               steps_to_keep,
+               name='steps_exporter',
+               serving_input_receiver_fn=None,
+               event_file_pattern='eval/*.tfevents.*',
+               assets_extra=None,
+               as_text=False):
+    """Create a `StepsExporter` to use with `tf.estimator.EvalSpec`.
+
+    Example of creating a StepsExporter for training and evaluation:
+
+    ```python
+    categorical_feature_a = categorical_column_with_hash_bucket(...)
+    categorical_feature_b = categorical_column_with_hash_bucket(...)
+
+    categorical_feature_a_emb = embedding_column(
+        categorical_column=categorical_feature_a, ...)
+    categorical_feature_b_emb = embedding_column(
+        categorical_column=categorical_feature_b, ...)
+
+    estimator = tf.estimator.DNNClassifier(
+        feature_columns=[categorical_feature_a_emb, categorical_feature_b_emb],
+        hidden_units=[1024, 512, 256])
+
+    # Input pipeline for train and evaluate.
+    def train_input_fn(): # returns x, y
+      # please shuffle the data.
+      pass
+    def eval_input_fn(): # returns x, y
+      pass
+
+    exporter = tf.contrib.estimator.exporter.StepsExporter(
+        name="steps_exporter",
+        serving_input_receiver_fn=serving_input_receiver_fn,
+        event_file_pattern='eval/*.tfevents.*',
+        steps_to_keep=[...])
+
+    train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, max_steps=1000)
+
+    eval_spec = [tf.estimator.EvalSpec(
+      input_fn=eval_input_fn,
+      steps=1,
+      exporters=exporter,
+      start_delay_secs=0,
+      throttle_secs=5)]
+
+    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
+
+    # Models will be exported to estimator.model_dir in timestamped directories,
+    # which can be used for serving, analysis with TFMA, or directly loaded in.
+    # For example:
+    export_dir = os.path.join(estimator.model_dir,
+                              <timestamped directory name>)
+
+    with ops.Graph().as_default() as graph:
+      with session.Session(graph=graph) as sess:
+        tf.saved_model.loader.load(
+            sess, [tf.saved_model.tag_constants.SERVING], export_dir)
+
+    ```
+
+    Args:
+      steps_to_keep: Non-empty list of positive integers containing
+        the step numbers at which the model should be exported. All the exports
+        will be kept, so there is no garbage collection.
+      name: Unique name of this `Exporter` that is going to be used in the
+        export path.
+      serving_input_receiver_fn: A function that takes no arguments and returns
+        a `ServingInputReceiver`.
+      event_file_pattern: Event file name pattern relative to model_dir. If
+        None, the exporter will not be preemption-safe. To be preemption-safe,
+        event_file_pattern must be specified.
+      assets_extra: An optional dict specifying how to populate the assets.extra
+        directory within the exported SavedModel.  Each key should give the
+        destination path (including the filename) relative to the assets.extra
+        directory.  The corresponding value gives the full path of the source
+        file to be copied.  For example, the simple case of copying a single
+        file without renaming it is specified as `{'my_asset_file.txt':
+        '/path/to/my_asset_file.txt'}`.
+      as_text: Whether to write the SavedModel proto in text format. Defaults to
+        `False`.
+
+    Raises:
+      ValueError: If any argument is invalid.
+    """
+    # pylint: disable=protected-access
+    self._saved_model_exporter = exporter._SavedModelExporter(
+        name, serving_input_receiver_fn, assets_extra, as_text)
+    # pylint: enable=protected-access
+
+    self._event_file_pattern = event_file_pattern
+    self._model_dir = None
+
+    self._input_steps_to_keep = steps_to_keep
+    steps_to_keep = [step for step in steps_to_keep if isinstance(step, int)]
+    steps_to_keep = [step for step in steps_to_keep if step > 0]
+    if not steps_to_keep:
+      raise ValueError(
+          '`steps_to_keep` list must have at least one positive integer')
+    elif self._input_steps_to_keep != steps_to_keep:
+      tf_logging.warn('Changed `steps_to_keep`, by omitting non-integer or'
+                      ' less than 1 elements, to [%s]',
+                      ', '.join(str(step) for step in steps_to_keep))
+    self._steps_to_keep = sorted(steps_to_keep)
+    self._steps_kept = []
+
+  @property
+  def name(self):
+    return self._saved_model_exporter.name
+
+  def export(self, estimator, export_path, checkpoint_path, eval_result,
+             is_the_final_export):
+    """Exports the given Estimator to a specific format.
+
+    Args:
+      estimator: A `tf.estimator.Estimator` instance to export.
+      export_path: A string containing a directory where to write the export.
+      checkpoint_path: The checkpoint path to export.
+      eval_result: The output of Estimator.evaluate on this checkpoint.
+      is_the_final_export: This boolean is True when this is an export in the
+        end of training. It is False for the intermediate exports during the
+        training. When passing Exporter to tf.estimator.train_and_evaluate
+        is_the_final_export is always False if TrainSpec.max_steps is None.
+
+    Returns:
+      The string path to the exported directory or None if export is skipped.
+
+    Raises:
+      ValueError: If `eval_result` is None or doesn't have
+        `ops.GraphKeys.GLOBAL_STEP` as a key.
+    """
+    export_result = None
+
+    if not eval_result or DEFAULT_GLOBAL_STEP_KEY not in eval_result:
+      raise ValueError(
+          '`eval_result` is empty, or does not have global step. This'
+          ' should never happen as Estimator always sets the global step in '
+          '`eval_result`. Please file a bug report. Got eval_result: %s'
+          % str(eval_result))
+
+    if self._model_dir != estimator.model_dir and self._event_file_pattern:
+      tf_logging.info('Loads the steps that the model was already evaluated at,'
+                      'from event files')
+      self._model_dir = estimator.model_dir
+      full_event_file_pattern = os.path.join(self._model_dir,
+                                             self._event_file_pattern)
+      self._steps_kept = self._get_kept_steps(full_event_file_pattern)
+
+      if self._steps_kept:
+        self._steps_kept = sorted(self._steps_kept)
+        self._steps_to_keep = [step for step in self._steps_to_keep if
+                               step > self._steps_kept[-1]]
+    # It is assumed that the model is exported at any evaluated step 'n' if
+    # there is any `steps_missed` lower than 'n'. As a result, all the steps in
+    # `_steps_to_keep` lower than the last evaluated step will be removed.
+    steps_missed = [step for step in self._steps_to_keep
+                    if step <= eval_result[DEFAULT_GLOBAL_STEP_KEY]]
+
+    if steps_missed:
+      # update the `_steps_to_keep` list by omitting all steps smaller than the
+      # current global step which are missed to be exported
+      export_result = self._saved_model_exporter.export(estimator, export_path,
+                                                        checkpoint_path,
+                                                        eval_result,
+                                                        is_the_final_export)
+      self._steps_to_keep = [step for step in self._steps_to_keep if step
+                             not in steps_missed]
+      # contains all the steps in which export has happened.
+      self._steps_kept.append(eval_result[DEFAULT_GLOBAL_STEP_KEY])
+      # Show warning for all the missed steps except the last one
+      if steps_missed[:-1]:
+        tf_logging.warn('Missed steps [%s] for exporting, as no evaluation'
+                        ' took place at them.', ', '.join(str(step) for step in
+                                                          steps_missed[:-1]))
+      # Log model export if the last missed step is the same as the current step
+      if steps_missed[-1] == eval_result[DEFAULT_GLOBAL_STEP_KEY]:
+        tf_logging.info('Performing model export at step %d.',
+                        eval_result[DEFAULT_GLOBAL_STEP_KEY])
+      # Show warning for exporting model at another step instead of the user
+      #   specified one
+      else:
+        tf_logging.warn('Performing model export at step %d instead of %d, as'
+                        ' no evaluation took place at step %d.',
+                        eval_result[DEFAULT_GLOBAL_STEP_KEY], steps_missed[-1],
+                        steps_missed[-1])
+    return export_result
+
+  def _get_kept_steps(self, event_files):
+    """Get the steps that the model was evaluated at, from event files.
+
+    Args:
+      event_files: Absolute pattern of event files.
+
+    Returns:
+      steps_kept: A list of steps in which the model was evaluated.
+    """
+    if not event_files:
+      return None
+
+    steps_kept = []
+    for event_file in gfile.Glob(os.path.join(event_files)):
+      for event in summary_iterator.summary_iterator(event_file):
+        if event.step not in steps_kept:
+          steps_kept.append(event.step)
+    return steps_kept
diff --git a/tensorflow/contrib/estimator/python/estimator/exporter_test.py b/tensorflow/contrib/estimator/python/estimator/exporter_test.py
new file mode 100644
index 0000000..0d009b9
--- /dev/null
+++ b/tensorflow/contrib/estimator/python/estimator/exporter_test.py
@@ -0,0 +1,206 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `StepsExporter`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import shutil
+import tempfile
+
+from tensorflow.contrib.estimator.python.estimator import exporter as exporter_lib
+from tensorflow.python.estimator import estimator as estimator_lib
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import test
+
+
+class StepsExporterTest(test.TestCase):
+
+  def test_error_out_if_steps_to_keep_has_no_positive_integers(self):
+
+    def _serving_input_receiver_fn():
+      pass
+
+    with self.assertRaisesRegexp(ValueError, "positive integer"):
+      exporter = exporter_lib.StepsExporter(
+          name="specified_steps_exporter",
+          serving_input_receiver_fn=_serving_input_receiver_fn,
+          steps_to_keep=[-1, 0, 1.1])
+      self.assertEqual("specified_steps_exporter", exporter.name)
+
+  def test_steps_exporter(self):
+
+    def _serving_input_receiver_fn():
+      pass
+
+    export_dir_base = tempfile.mkdtemp()
+    gfile.MkDir(export_dir_base)
+    gfile.MkDir(export_dir_base + "/export")
+    gfile.MkDir(export_dir_base + "/eval")
+
+    exporter = exporter_lib.StepsExporter(
+        name="steps_exporter",
+        serving_input_receiver_fn=_serving_input_receiver_fn,
+        assets_extra={"from/path": "to/path"},
+        as_text=False,
+        steps_to_keep=[1])
+    estimator = test.mock.Mock(spec=estimator_lib.Estimator)
+    estimator.export_savedmodel.return_value = "export_result_path"
+    estimator.model_dir = export_dir_base
+
+    export_result = exporter.export(estimator, export_dir_base,
+                                    "checkpoint_path", {"global_step": 1},
+                                    False)
+
+    self.assertEqual("export_result_path", export_result)
+    estimator.export_savedmodel.assert_called_with(
+        export_dir_base,
+        _serving_input_receiver_fn,
+        assets_extra={"from/path": "to/path"},
+        as_text=False,
+        checkpoint_path="checkpoint_path",
+        strip_default_attrs=True)
+
+    shutil.rmtree(export_dir_base, ignore_errors=True)
+
+  def test_steps_exporter_with_preemption(self):
+
+    def _serving_input_receiver_fn():
+      pass
+
+    export_dir_base = tempfile.mkdtemp()
+    gfile.MkDir(export_dir_base)
+    gfile.MkDir(export_dir_base + "/export")
+    gfile.MkDir(export_dir_base + "/eval")
+
+    eval_dir_base = os.path.join(export_dir_base, "eval_continuous")
+    estimator_lib._write_dict_to_summary(eval_dir_base, {}, 1)
+    estimator_lib._write_dict_to_summary(eval_dir_base, {}, 2)
+
+    exporter = exporter_lib.StepsExporter(
+        name="steps_exporter",
+        serving_input_receiver_fn=_serving_input_receiver_fn,
+        event_file_pattern="eval_continuous/*.tfevents.*",
+        assets_extra={"from/path": "to/path"},
+        as_text=False,
+        steps_to_keep=[1, 2, 6, 8])
+
+    estimator = test.mock.Mock(spec=estimator_lib.Estimator)
+    estimator.model_dir = export_dir_base
+    estimator.export_savedmodel.return_value = "export_result_path"
+
+    export_result = exporter.export(estimator, export_dir_base,
+                                    "checkpoint_path", {"global_step": 3},
+                                    False)
+    self.assertEqual(None, export_result)
+
+    export_result = exporter.export(estimator, export_dir_base,
+                                    "checkpoint_path", {"global_step": 6},
+                                    False)
+    self.assertEqual("export_result_path", export_result)
+
+    export_result = exporter.export(estimator, export_dir_base,
+                                    "checkpoint_path", {"global_step": 7},
+                                    False)
+    self.assertEqual(None, export_result)
+
+    shutil.rmtree(export_dir_base, ignore_errors=True)
+
+  def test_specified_step_is_saved(self):
+
+    def _serving_input_receiver_fn():
+      pass
+
+    export_dir_base = tempfile.mkdtemp()
+    gfile.MkDir(export_dir_base)
+    gfile.MkDir(export_dir_base + "/export")
+    gfile.MkDir(export_dir_base + "/eval")
+
+    exporter = exporter_lib.StepsExporter(
+        name="steps_exporter",
+        serving_input_receiver_fn=_serving_input_receiver_fn,
+        assets_extra={"from/path": "to/path"},
+        as_text=False,
+        steps_to_keep=[1, 5, 8, 10, 11])
+    estimator = test.mock.Mock(spec=estimator_lib.Estimator)
+    estimator.export_savedmodel.return_value = "export_result_path"
+    estimator.model_dir = export_dir_base
+
+    export_result = exporter.export(estimator, export_dir_base,
+                                    "checkpoint_path", {"global_step": 1},
+                                    False)
+
+    self.assertTrue(estimator.export_savedmodel.called)
+    self.assertEqual("export_result_path", export_result)
+
+    export_result = exporter.export(estimator, export_dir_base,
+                                    "checkpoint_path", {"global_step": 2},
+                                    False)
+    self.assertEqual(None, export_result)
+
+    export_result = exporter.export(estimator, export_dir_base,
+                                    "checkpoint_path", {"global_step": 5},
+                                    False)
+    self.assertTrue(estimator.export_savedmodel.called)
+    self.assertEqual("export_result_path", export_result)
+
+    export_result = exporter.export(estimator, export_dir_base,
+                                    "checkpoint_path", {"global_step": 10},
+                                    False)
+    self.assertTrue(estimator.export_savedmodel.called)
+    self.assertEqual("export_result_path", export_result)
+
+    export_result = exporter.export(estimator, export_dir_base,
+                                    "checkpoint_path", {"global_step": 15},
+                                    False)
+    self.assertTrue(estimator.export_savedmodel.called)
+    self.assertEqual("export_result_path", export_result)
+
+    export_result = exporter.export(estimator, export_dir_base,
+                                    "checkpoint_path", {"global_step": 20},
+                                    False)
+    self.assertEqual(None, export_result)
+
+    shutil.rmtree(export_dir_base, ignore_errors=True)
+
+  def test_steps_exporter_with_no_global_step_key(self):
+
+    def _serving_input_receiver_fn():
+      pass
+
+    export_dir_base = tempfile.mkdtemp()
+    gfile.MkDir(export_dir_base)
+    gfile.MkDir(export_dir_base + "/export")
+    gfile.MkDir(export_dir_base + "/eval")
+
+    exporter = exporter_lib.StepsExporter(
+        name="steps_exporter",
+        serving_input_receiver_fn=_serving_input_receiver_fn,
+        assets_extra={"from/path": "to/path"},
+        as_text=False,
+        steps_to_keep=[1])
+    estimator = test.mock.Mock(spec=estimator_lib.Estimator)
+    estimator.export_savedmodel.return_value = "export_result_path"
+    estimator.model_dir = export_dir_base
+
+    with self.assertRaisesRegexp(ValueError, "does not have global step"):
+      exporter.export(estimator, export_dir_base, "checkpoint_path", {}, False)
+
+    shutil.rmtree(export_dir_base, ignore_errors=True)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/estimator/python/estimator/extenders.py b/tensorflow/contrib/estimator/python/estimator/extenders.py
index bf08be0..26449b4 100644
--- a/tensorflow/contrib/estimator/python/estimator/extenders.py
+++ b/tensorflow/contrib/estimator/python/estimator/extenders.py
@@ -34,7 +34,7 @@
 
 
 def add_metrics(estimator, metric_fn):
-  """Creates a new @{tf.estimator.Estimator} which has given metrics.
+  """Creates a new `tf.estimator.Estimator` which has given metrics.
 
   Example:
 
@@ -61,7 +61,7 @@
   ```
 
   Args:
-    estimator: A @{tf.estimator.Estimator} object.
+    estimator: A `tf.estimator.Estimator` object.
     metric_fn: A function which should obey the following signature:
       - Args: can only have following four arguments in any order:
         * predictions: Predictions `Tensor` or dict of `Tensor` created by given
@@ -79,7 +79,7 @@
          function, namely a `(metric_tensor, update_op)` tuple.
 
   Returns:
-      A new @{tf.estimator.Estimator} which has a union of original metrics with
+      A new `tf.estimator.Estimator` which has a union of original metrics with
         given ones.
   """
   _verify_metric_fn_args(metric_fn)
@@ -165,14 +165,14 @@
   ```
 
   Args:
-    estimator: A @{tf.estimator.Estimator} object.
+    estimator: A `tf.estimator.Estimator` object.
     keys: a `string` or a `list` of `string`. If it is `None`, all of the
       `features` in `dict` is forwarded to the `predictions`. If it is a
       `string`, only given key is forwarded. If it is a `list` of strings, all
       the given `keys` are forwarded.
 
   Returns:
-      A new @{tf.estimator.Estimator} which forwards features to predictions.
+      A new `tf.estimator.Estimator` which forwards features to predictions.
 
   Raises:
     ValueError:
diff --git a/tensorflow/contrib/estimator/python/estimator/hooks.py b/tensorflow/contrib/estimator/python/estimator/hooks.py
index caadafd..66c46e6 100644
--- a/tensorflow/contrib/estimator/python/estimator/hooks.py
+++ b/tensorflow/contrib/estimator/python/estimator/hooks.py
@@ -19,6 +19,7 @@
 from __future__ import print_function
 
 import os
+import time
 
 from tensorflow.python.estimator import estimator as estimator_lib
 from tensorflow.python.framework import ops
@@ -26,6 +27,7 @@
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.training import training
+from tensorflow.python.training import training_util
 
 
 # pylint: disable=protected-access
@@ -72,8 +74,9 @@
       estimator: A `tf.estimator.Estimator` instance to call evaluate.
       input_fn:  Equivalent to the `input_fn` arg to `estimator.evaluate`. A
         function that constructs the input data for evaluation.
-        See @{$premade_estimators#create_input_functions} for more
-        information. The function should construct and return one of
+        See [Creating input functions](
+        https://tensorflow.org/guide/premade_estimators#create_input_functions)
+        for more information. The function should construct and return one of
         the following:
 
           * A 'tf.data.Dataset' object: Outputs of `Dataset` object must be a
@@ -210,4 +213,72 @@
     self._evaluate(session)
 
 
+class _StopAtCheckpointStepHook(training.SessionRunHook):
+  """Hook that requests stop at a specified step based on checkpoint.
+
+  Note: We recommend using `make_stop_at_checkpoint_step_hook` to get the
+  proper hook.
+  """
+
+  def __init__(self, model_dir, last_step,
+               wait_after_file_check_secs=30):
+    """Initializes a `StopAtCheckpointStepHook`.
+
+    This hook requests stop after a last step has been reached. It checks latest
+    checkpoint to verify last step is written on disk or not.
+
+    Args:
+      model_dir: Directory to read global step from latest checkpoint.
+      last_step: Step after which to stop.
+      wait_after_file_check_secs: Reading the same file by many workers may
+        create I/O issues. To throttle that, we wait the given number of
+        seconds after each read of the file.
+
+    Raises:
+      ValueError: If one of the arguments is invalid.
+    """
+    if last_step is None:
+      raise ValueError('last_step must be specified.')
+    if model_dir is None:
+      raise ValueError('model_dir must be specified.')
+
+    self._model_dir = model_dir
+    self._last_step = last_step
+    self._wait_after_file_check_secs = wait_after_file_check_secs
+
+  def begin(self):
+    self._global_step_tensor = training_util._get_or_create_global_step_read()  # pylint: disable=protected-access
+    if self._global_step_tensor is None:
+      raise RuntimeError(
+          'Global step should be created to use StopAtCheckpointStepHook.')
+
+  def before_run(self, run_context):  # pylint: disable=unused-argument
+    return training.SessionRunArgs(self._global_step_tensor)
+
+  def after_run(self, run_context, run_values):
+    global_step = run_values.results + 1
+    if global_step >= self._last_step:
+      # Check latest global step in the checkpoint to ensure that the targeted
+      # last step is written on disk.
+
+      step = estimator_lib._load_global_step_from_checkpoint_dir(
+          self._model_dir)
+      if step >= self._last_step:
+        run_context.request_stop()
+      else:
+        time.sleep(self._wait_after_file_check_secs)
+
+
+def make_stop_at_checkpoint_step_hook(estimator,
+                                      last_step,
+                                      wait_after_file_check_secs=30):
+  """Creates a proper StopAtCheckpointStepHook based on chief status."""
+
+  if estimator.config.is_chief:
+    return training.StopAtStepHook(last_step=last_step)
+  return _StopAtCheckpointStepHook(
+      model_dir=estimator.model_dir,
+      last_step=last_step,
+      wait_after_file_check_secs=wait_after_file_check_secs)
+
 # pylint: enable=protected-access
diff --git a/tensorflow/contrib/estimator/python/estimator/hooks_test.py b/tensorflow/contrib/estimator/python/estimator/hooks_test.py
index ee88d5e..c6c6cad 100644
--- a/tensorflow/contrib/estimator/python/estimator/hooks_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/hooks_test.py
@@ -21,8 +21,11 @@
 import glob
 import json
 import os
+import tempfile
+import time
 
 from tensorflow.contrib.estimator.python.estimator import hooks as hooks_lib
+from tensorflow.python.client import session as tf_session
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.estimator import estimator_lib
 from tensorflow.python.estimator import run_config as run_config_lib
@@ -316,5 +319,85 @@
       estimator.train(input_fn, hooks=[evaluator])
 
 
+class StopAtCheckpointStepHookTest(test.TestCase):
+
+  def test_do_not_stop_if_checkpoint_is_not_there(self):
+    with ops.Graph().as_default():
+      step = training.create_global_step()
+      assign_ten = step.assign(10)
+      no_op = control_flow_ops.no_op()
+      hook = hooks_lib._StopAtCheckpointStepHook(
+          model_dir=tempfile.mkdtemp(), last_step=10)
+      with training.SingularMonitoredSession(hooks=[hook]) as mon_sess:
+        mon_sess.raw_session().run(assign_ten)
+        with test.mock.patch.object(time, 'sleep') as mock_sleep:
+          mon_sess.run(no_op)
+          self.assertTrue(mock_sleep.called)
+        self.assertFalse(mon_sess.should_stop())
+
+  def test_do_not_stop_if_checkpoint_step_is_smaller(self):
+    model_dir = tempfile.mkdtemp()
+    with ops.Graph().as_default():
+      step = training.create_global_step()
+      assign_nine = step.assign(9)
+      assign_ten = step.assign(10)
+      no_op = control_flow_ops.no_op()
+      hook = hooks_lib._StopAtCheckpointStepHook(
+          model_dir=model_dir, last_step=10)
+      with tf_session.Session() as sess:
+        sess.run(assign_nine)
+        training.Saver().save(sess, os.path.join(model_dir, 'model.ckpt'))
+      with training.SingularMonitoredSession(hooks=[hook]) as mon_sess:
+        mon_sess.raw_session().run(assign_ten)
+        with test.mock.patch.object(time, 'sleep') as mock_sleep:
+          mon_sess.run(no_op)
+          self.assertTrue(mock_sleep.called)
+        self.assertFalse(mon_sess.should_stop())
+
+  def test_stop_if_checkpoint_step_is_laststep(self):
+    model_dir = tempfile.mkdtemp()
+    with ops.Graph().as_default():
+      step = training.create_global_step()
+      assign_ten = step.assign(10)
+      no_op = control_flow_ops.no_op()
+      hook = hooks_lib._StopAtCheckpointStepHook(
+          model_dir=model_dir, last_step=10)
+      with tf_session.Session() as sess:
+        sess.run(assign_ten)
+        training.Saver().save(sess, os.path.join(model_dir, 'model.ckpt'))
+      with training.SingularMonitoredSession(hooks=[hook]) as mon_sess:
+        mon_sess.raw_session().run(assign_ten)
+        with test.mock.patch.object(time, 'sleep') as mock_sleep:
+          mon_sess.run(no_op)
+          self.assertFalse(mock_sleep.called)
+        self.assertTrue(mon_sess.should_stop())
+
+  def test_creates_regular_stop_at_step_hook_for_chief(self):
+    # by default an estimator is in chief mode
+    dnn = estimator_lib.DNNClassifier(
+        feature_columns=[feature_column_lib.numeric_column('x')],
+        hidden_units=[3, 1])
+    hook = hooks_lib.make_stop_at_checkpoint_step_hook(dnn, 300)
+    self.assertIsInstance(hook, training.StopAtStepHook)
+    self.assertEqual(300, hook._last_step)
+
+  def test_creates_checkpoint_hook_for_workers(self):
+
+    class FakeWorkerConfig(estimator_lib.RunConfig):
+
+      @property
+      def is_chief(self):
+        return False
+
+    dnn = estimator_lib.DNNClassifier(
+        feature_columns=[feature_column_lib.numeric_column('x')],
+        hidden_units=[3, 1],
+        config=FakeWorkerConfig())
+    hook = hooks_lib.make_stop_at_checkpoint_step_hook(dnn, 300)
+    self.assertIsInstance(hook, hooks_lib._StopAtCheckpointStepHook)
+    self.assertEqual(300, hook._last_step)
+    self.assertEqual(dnn.model_dir, hook._model_dir)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/estimator/python/estimator/linear.py b/tensorflow/contrib/estimator/python/estimator/linear.py
index 62a37ab..2b68f24 100644
--- a/tensorflow/contrib/estimator/python/estimator/linear.py
+++ b/tensorflow/contrib/estimator/python/estimator/linear.py
@@ -121,7 +121,7 @@
         is multivalent.  One of "mean", "sqrtn", and "sum" -- these are
         effectively different ways to do example-level normalization, which can
         be useful for bag-of-words features. for more details, see
-        @{tf.feature_column.linear_model$linear_model}.
+        `tf.feature_column.linear_model`.
     """
     def _model_fn(features, labels, mode, config):
       return linear_lib._linear_model_fn(  # pylint: disable=protected-access
diff --git a/tensorflow/contrib/factorization/BUILD b/tensorflow/contrib/factorization/BUILD
index effec42..9e1f14f 100644
--- a/tensorflow/contrib/factorization/BUILD
+++ b/tensorflow/contrib/factorization/BUILD
@@ -65,7 +65,7 @@
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
         "//tensorflow/python/estimator",
-        "//tensorflow/python/estimator:model_fn",
+        "//tensorflow/python/estimator:estimator_py",
         "//tensorflow/python/feature_column:feature_column_py",
         "//third_party/py/numpy",
     ],
@@ -242,7 +242,7 @@
         "//tensorflow/python:platform_benchmark",
         "//tensorflow/python:random_ops",
         "//tensorflow/python:training",
-        "//tensorflow/python/estimator:run_config",
+        "//tensorflow/python/estimator:estimator_py",
         "//tensorflow/python/feature_column:feature_column_py",
         "//third_party/py/numpy",
     ],
diff --git a/tensorflow/contrib/factorization/python/ops/kmeans.py b/tensorflow/contrib/factorization/python/ops/kmeans.py
index 9ffdd3b..f384d76 100644
--- a/tensorflow/contrib/factorization/python/ops/kmeans.py
+++ b/tensorflow/contrib/factorization/python/ops/kmeans.py
@@ -158,12 +158,12 @@
     return either `features` or, equivalently, `(features, None)`.
 
     Args:
-      features: The input points. See @{tf.estimator.Estimator}.
-      mode: See @{tf.estimator.Estimator}.
-      config: See @{tf.estimator.Estimator}.
+      features: The input points. See `tf.estimator.Estimator`.
+      mode: See `tf.estimator.Estimator`.
+      config: See `tf.estimator.Estimator`.
 
     Returns:
-      A @{tf.estimator.EstimatorSpec} (see @{tf.estimator.Estimator}) specifying
+      A `tf.estimator.EstimatorSpec` (see `tf.estimator.Estimator`) specifying
       this behavior:
         * `train_op`: Execute one mini-batch or full-batch run of Lloyd's
              algorithm.
@@ -188,7 +188,6 @@
     #   center.
     # is_initialized: scalar indicating whether the initial cluster centers
     #   have been chosen; see init_op.
-    # cluster_centers_var: a Variable containing the cluster centers.
     # init_op: an op to choose the initial cluster centers. A single worker
     #   repeatedly executes init_op until is_initialized becomes True.
     # training_op: an op that runs an iteration of training, either an entire
@@ -394,7 +393,7 @@
       relative_tolerance: A relative tolerance of change in the loss between
         iterations. Stops learning if the loss changes less than this amount.
         This may not work correctly if `use_mini_batch=True`.
-      config: See @{tf.estimator.Estimator}.
+      config: See `tf.estimator.Estimator`.
       feature_columns: An optionable iterable containing all the feature columns
         used by the model. All items in the set should be feature column
         instances that can be passed to `tf.feature_column.input_layer`. If this
@@ -431,7 +430,7 @@
     """Finds the index of the closest cluster center to each input point.
 
     Args:
-      input_fn: Input points. See @{tf.estimator.Estimator.predict}.
+      input_fn: Input points. See `tf.estimator.Estimator.predict`.
 
     Yields:
       The index of the closest cluster center for each input point.
@@ -447,7 +446,7 @@
     which returns the negative sum.
 
     Args:
-      input_fn: Input points. See @{tf.estimator.Estimator.evaluate}. Only one
+      input_fn: Input points. See `tf.estimator.Estimator.evaluate`. Only one
           batch is retrieved.
 
     Returns:
@@ -465,7 +464,7 @@
     sklearn function returns the Euclidean distance.
 
     Args:
-      input_fn: Input points. See @{tf.estimator.Estimator.predict}.
+      input_fn: Input points. See `tf.estimator.Estimator.predict`.
 
     Yields:
       The distances from each input point to each cluster center.
diff --git a/tensorflow/contrib/ffmpeg/__init__.py b/tensorflow/contrib/ffmpeg/__init__.py
index 484ffee..3a756da 100644
--- a/tensorflow/contrib/ffmpeg/__init__.py
+++ b/tensorflow/contrib/ffmpeg/__init__.py
@@ -15,7 +15,7 @@
 # pylint: disable=g-short-docstring-punctuation
 """Working with audio using FFmpeg.
 
-See the @{$python/contrib.ffmpeg} guide.
+See the [FFMPEG](https://tensorflow.org/api_guides/python/contrib.ffmpeg) guide.
 
 @@decode_audio
 @@encode_audio
diff --git a/tensorflow/contrib/framework/__init__.py b/tensorflow/contrib/framework/__init__.py
index 918a7e2..95f5ba9 100644
--- a/tensorflow/contrib/framework/__init__.py
+++ b/tensorflow/contrib/framework/__init__.py
@@ -15,7 +15,9 @@
 
 """Framework utilities.
 
-See the @{$python/contrib.framework} guide.
+See the
+[Contrib Framework](https://tensorflow.org/api_guides/python/contrib.framework)
+guide.
 
 @@assert_same_float_dtype
 @@assert_scalar
@@ -100,6 +102,8 @@
 
 @@BoundedTensorSpec
 @@TensorSpec
+
+@@RecordInput
 """
 
 from __future__ import absolute_import
@@ -119,6 +123,7 @@
 from tensorflow.python.framework.smart_cond import smart_constant_value
 from tensorflow.python.framework.tensor_spec import BoundedTensorSpec
 from tensorflow.python.framework.tensor_spec import TensorSpec
+from tensorflow.python.ops.data_flow_ops import RecordInput
 from tensorflow.python.ops.init_ops import convolutional_delta_orthogonal
 from tensorflow.python.ops.init_ops import convolutional_orthogonal_1d
 from tensorflow.python.ops.init_ops import convolutional_orthogonal_2d
diff --git a/tensorflow/contrib/framework/python/ops/arg_scope.py b/tensorflow/contrib/framework/python/ops/arg_scope.py
index 5b15033..0a02e76 100644
--- a/tensorflow/contrib/framework/python/ops/arg_scope.py
+++ b/tensorflow/contrib/framework/python/ops/arg_scope.py
@@ -103,9 +103,8 @@
 
 
 def _add_op(op):
-  key = arg_scope_func_key(op)
-  if key not in _DECORATED_OPS:
-    _DECORATED_OPS[key] = _kwarg_names(op)
+  key_op = arg_scope_func_key(op)
+  _DECORATED_OPS[key_op] = _kwarg_names(op)
 
 
 @tf_contextlib.contextmanager
diff --git a/tensorflow/contrib/framework/python/ops/arg_scope_test.py b/tensorflow/contrib/framework/python/ops/arg_scope_test.py
index 4c3879d..bcafc1a 100644
--- a/tensorflow/contrib/framework/python/ops/arg_scope_test.py
+++ b/tensorflow/contrib/framework/python/ops/arg_scope_test.py
@@ -38,6 +38,12 @@
   """Some cool doc string."""
   return (args, a, b, c)
 
+@add_arg_scope
+def func4(x='x', y='y'):
+  if x:
+    pass
+  if y:
+    pass
 
 def _key_op(op):
   return getattr(op, '_key_op', str(op))
@@ -231,6 +237,15 @@
           self.assertTupleEqual(args, func2_args)
           self.assertDictEqual(kwargs, func2_kwargs)
 
+  def testAddArgScopeRaceCondition(self):
+    func4_kwargs = ('a', 'b', 'c', 'd', 'e', 'f', 'g', 'h')
+    for i in range(4):
+        # redefine the function with different args
+      @add_arg_scope
+      def func4(a=1, b=2, c=3, d=4, e=5, f=6, g=7, h=8):
+        pass
+      self.assertTupleEqual(arg_scoped_arguments(func4), func4_kwargs)
+
   def testDocString(self):
     self.assertEqual(func3.__doc__, 'Some cool doc string.')
 
diff --git a/tensorflow/contrib/framework/python/ops/critical_section_ops.py b/tensorflow/contrib/framework/python/ops/critical_section_ops.py
index 72835c3..71ab755 100644
--- a/tensorflow/contrib/framework/python/ops/critical_section_ops.py
+++ b/tensorflow/contrib/framework/python/ops/critical_section_ops.py
@@ -325,6 +325,8 @@
 
   def _is_self_handle(self, x):
     """Check if the tensor `x` is the same Mutex as `self._handle`."""
+    if isinstance(x, ops.EagerTensor):
+      return x is self._handle
     return (x.op.type == "MutexV2"
             # blank shared_name means the op will create a unique one.
             and x.op.get_attr("shared_name")
@@ -365,8 +367,7 @@
             "(CriticalSection: %s) requested exclusive resource access "
             "of this resource.  Did you mean to call execute with keyword "
             "argument exclusive_resource_access=False?" %
-            (list(resource_intersection), self._handle.name,
-             sg.op.name, sg.handle.name))
+            (list(resource_intersection), self._handle, sg, sg.handle))
 
   # TODO(ebrevdo): Re-enable once CriticalSection is in core.
 
diff --git a/tensorflow/contrib/framework/python/ops/script_ops.py b/tensorflow/contrib/framework/python/ops/script_ops.py
index 5d269fe..d5cb679 100644
--- a/tensorflow/contrib/framework/python/ops/script_ops.py
+++ b/tensorflow/contrib/framework/python/ops/script_ops.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 
-"""Script Language Operators. See the @{$python/script_ops} guide.
+"""Script Language Operators.
 
 @@py_func
 """
diff --git a/tensorflow/contrib/framework/python/ops/variables.py b/tensorflow/contrib/framework/python/ops/variables.py
index 322d5c3..a7acae8 100644
--- a/tensorflow/contrib/framework/python/ops/variables.py
+++ b/tensorflow/contrib/framework/python/ops/variables.py
@@ -241,13 +241,13 @@
     use_resource: If `True` use a ResourceVariable instead of a Variable.
     synchronization: Indicates when a distributed a variable will be
       aggregated. Accepted values are constants defined in the class
-      @{tf.VariableSynchronization}. By default the synchronization is set to
+      `tf.VariableSynchronization`. By default the synchronization is set to
       `AUTO` and the current `DistributionStrategy` chooses
       when to synchronize. If `synchronization` is set to `ON_READ`,
       `trainable` must not be set to `True`.
     aggregation: Indicates how a distributed variable will be aggregated.
       Accepted values are constants defined in the class
-      @{tf.VariableAggregation}.
+      `tf.VariableAggregation`.
 
   Returns:
     The created or existing variable.
@@ -320,13 +320,13 @@
     use_resource: If `True` use a ResourceVariable instead of a Variable.
     synchronization: Indicates when a distributed a variable will be
       aggregated. Accepted values are constants defined in the class
-      @{tf.VariableSynchronization}. By default the synchronization is set to
+      `tf.VariableSynchronization`. By default the synchronization is set to
       `AUTO` and the current `DistributionStrategy` chooses
       when to synchronize. If `synchronization` is set to `ON_READ`,
       `trainable` must not be set to `True`.
     aggregation: Indicates how a distributed variable will be aggregated.
       Accepted values are constants defined in the class
-      @{tf.VariableAggregation}.
+      `tf.VariableAggregation`.
 
   Returns:
     The created or existing variable.
diff --git a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.h b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.h
index 7534f57..869e899 100644
--- a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.h
+++ b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.h
@@ -13,8 +13,8 @@
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRDPARTY_TENSORFLOW_CONTRIB_KERNELS_FUSED_CONV2D_BIAS_ACTIVATION_OP_H_
-#define THIRDPARTY_TENSORFLOW_CONTRIB_KERNELS_FUSED_CONV2D_BIAS_ACTIVATION_OP_H_
+#ifndef TENSORFLOW_CONTRIB_FUSED_CONV_KERNELS_FUSED_CONV2D_BIAS_ACTIVATION_OP_H_
+#define TENSORFLOW_CONTRIB_FUSED_CONV_KERNELS_FUSED_CONV2D_BIAS_ACTIVATION_OP_H_
 
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/tensor_types.h"
@@ -62,4 +62,4 @@
 
 }  // namespace tensorflow
 
-#endif
+#endif  // TENSORFLOW_CONTRIB_FUSED_CONV_KERNELS_FUSED_CONV2D_BIAS_ACTIVATION_OP_H_
diff --git a/tensorflow/contrib/gan/BUILD b/tensorflow/contrib/gan/BUILD
index 053d4e3..9866fcc 100644
--- a/tensorflow/contrib/gan/BUILD
+++ b/tensorflow/contrib/gan/BUILD
@@ -424,9 +424,11 @@
         ":namedtuples",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:functional_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:summary",
         "//tensorflow/python:util",
+        "//tensorflow/python:variable_scope",
         "//tensorflow/python/ops/losses",
     ],
 )
@@ -459,8 +461,7 @@
         ":train",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:util",
-        "//tensorflow/python/estimator:head",
-        "//tensorflow/python/estimator:model_fn",
+        "//tensorflow/python/estimator:estimator_py",
     ],
 )
 
@@ -477,7 +478,7 @@
         "//tensorflow/python:math_ops",
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
-        "//tensorflow/python/estimator:model_fn",
+        "//tensorflow/python/estimator:estimator_py",
     ],
 )
 
@@ -497,8 +498,7 @@
         "//tensorflow/python:metrics",
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
-        "//tensorflow/python/estimator",
-        "//tensorflow/python/estimator:model_fn",
+        "//tensorflow/python/estimator:estimator_py",
     ],
 )
 
@@ -526,8 +526,7 @@
         "//tensorflow/python:training",
         "//tensorflow/python:training_util",
         "//tensorflow/python:variable_scope",
-        "//tensorflow/python/estimator:model_fn",
-        "//tensorflow/python/estimator:numpy_io",
+        "//tensorflow/python/estimator:estimator_py",
         "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
         "@six_archive//:six",
diff --git a/tensorflow/contrib/gan/python/eval/python/summaries_impl.py b/tensorflow/contrib/gan/python/eval/python/summaries_impl.py
index 508f487..f9995bb 100644
--- a/tensorflow/contrib/gan/python/eval/python/summaries_impl.py
+++ b/tensorflow/contrib/gan/python/eval/python/summaries_impl.py
@@ -22,7 +22,9 @@
 from tensorflow.contrib.gan.python.eval.python import eval_utils
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops.losses import util as loss_util
 from tensorflow.python.summary import summary
 
@@ -32,6 +34,7 @@
     'add_gan_model_summaries',
     'add_regularization_loss_summaries',
     'add_cyclegan_image_summaries',
+    'add_stargan_image_summaries'
 ]
 
 
@@ -179,6 +182,94 @@
       max_outputs=1)
 
 
+def add_stargan_image_summaries(stargan_model,
+                                num_images=2,
+                                display_diffs=False):
+  """Adds image summaries to see StarGAN image results.
+
+  If display_diffs is True, each image result has `2` rows and `num_domains + 1`
+  columns.
+  The first row looks like:
+    [original_image, transformed_to_domain_0, transformed_to_domain_1, ...]
+  The second row looks like:
+    [no_modification_baseline, transformed_to_domain_0-original_image, ...]
+  If display_diffs is False, only the first row is shown.
+
+  IMPORTANT:
+    Since the model does not originally transform the image to every domain,
+    we transform them on-the-fly within this function, in parallel.
+
+  Args:
+    stargan_model: A StarGANModel tuple.
+    num_images: The number of examples/images to be transformed and shown.
+    display_diffs: Also display the difference between generated and target.
+
+  Raises:
+    ValueError: If input_data is not images.
+    ValueError: If input_data_domain_label is not rank 2.
+    ValueError: If dimension 2 of input_data_domain_label is not fully defined.
+  """
+
+  _assert_is_image(stargan_model.input_data)
+  stargan_model.input_data_domain_label.shape.assert_has_rank(2)
+  stargan_model.input_data_domain_label.shape[1:].assert_is_fully_defined()
+
+  num_domains = stargan_model.input_data_domain_label.get_shape().as_list()[-1]
+
+  def _build_image(image):
+    """Helper function to create a result for each image on the fly."""
+
+    # Expand the first dimension as batch_size = 1.
+    images = array_ops.expand_dims(image, axis=0)
+
+    # Tile the image num_domains times, so we can get all transformed together.
+    images = array_ops.tile(images, [num_domains, 1, 1, 1])
+
+    # Create the targets to 0, 1, 2, ..., num_domains-1.
+    targets = array_ops.one_hot(list(range(num_domains)), num_domains)
+
+    with variable_scope.variable_scope(
+        stargan_model.generator_scope, reuse=True):
+
+      # Add the original image.
+      output_images_list = [image]
+
+      # Generate the image and add to the list.
+      gen_images = stargan_model.generator_fn(images, targets)
+      gen_images_list = array_ops.split(gen_images, num_domains)
+      gen_images_list = [
+          array_ops.squeeze(img, axis=0) for img in gen_images_list
+      ]
+      output_images_list.extend(gen_images_list)
+
+      # Display diffs.
+      if display_diffs:
+        diff_images = gen_images - images
+        diff_images_list = array_ops.split(diff_images, num_domains)
+        diff_images_list = [
+            array_ops.squeeze(img, axis=0) for img in diff_images_list
+        ]
+        output_images_list.append(array_ops.zeros_like(image))
+        output_images_list.extend(diff_images_list)
+
+      # Create the final image.
+      final_image = eval_utils.image_reshaper(
+          output_images_list, num_cols=num_domains + 1)
+
+    # Reduce the first rank.
+    return array_ops.squeeze(final_image, axis=0)
+
+  summary.image(
+      'stargan_image_generation',
+      functional_ops.map_fn(
+          _build_image,
+          stargan_model.input_data[:num_images],
+          parallel_iterations=num_images,
+          back_prop=False,
+          swap_memory=True),
+      max_outputs=num_images)
+
+
 def add_gan_model_summaries(gan_model):
   """Adds typical GANModel summaries.
 
diff --git a/tensorflow/contrib/gan/python/eval/python/summaries_test.py b/tensorflow/contrib/gan/python/eval/python/summaries_test.py
index 33d51bf..54a6f8d 100644
--- a/tensorflow/contrib/gan/python/eval/python/summaries_test.py
+++ b/tensorflow/contrib/gan/python/eval/python/summaries_test.py
@@ -18,7 +18,6 @@
 from __future__ import division
 from __future__ import print_function
 
-
 from tensorflow.contrib.gan.python import namedtuples
 from tensorflow.contrib.gan.python.eval.python import summaries_impl as summaries
 from tensorflow.python.framework import ops
@@ -37,6 +36,10 @@
   return variable_scope.get_variable('dummy_d', initializer=2.0) * inputs
 
 
+def stargan_generator_model(inputs, _):
+  return generator_model(inputs)
+
+
 def get_gan_model():
   # TODO(joelshor): Find a better way of creating a variable scope.
   with variable_scope.variable_scope('generator') as gen_scope:
@@ -57,6 +60,31 @@
       discriminator_fn=discriminator_model)
 
 
+def get_stargan_model():
+  """Similar to get_gan_model()."""
+  # TODO(joelshor): Find a better way of creating a variable scope.
+  with variable_scope.variable_scope('discriminator') as dis_scope:
+    pass
+  with variable_scope.variable_scope('generator') as gen_scope:
+    return namedtuples.StarGANModel(
+        input_data=array_ops.ones([1, 2, 2, 3]),
+        input_data_domain_label=array_ops.ones([1, 2]),
+        generated_data=stargan_generator_model(
+            array_ops.ones([1, 2, 2, 3]), None),
+        generated_data_domain_target=array_ops.ones([1, 2]),
+        reconstructed_data=array_ops.ones([1, 2, 2, 3]),
+        discriminator_input_data_source_predication=array_ops.ones([1]),
+        discriminator_generated_data_source_predication=array_ops.ones([1]),
+        discriminator_input_data_domain_predication=array_ops.ones([1, 2]),
+        discriminator_generated_data_domain_predication=array_ops.ones([1, 2]),
+        generator_variables=None,
+        generator_scope=gen_scope,
+        generator_fn=stargan_generator_model,
+        discriminator_variables=None,
+        discriminator_scope=dis_scope,
+        discriminator_fn=discriminator_model)
+
+
 def get_cyclegan_model():
   with variable_scope.variable_scope('x2y'):
     model_x2y = get_gan_model()
@@ -143,6 +171,16 @@
     with self.test_session(use_gpu=True):
       summary.merge_all().eval()
 
+  def test_add_image_comparison_summaries_for_stargan(self):
+
+    summaries.add_stargan_image_summaries(get_stargan_model())
+
+    self.assertEquals(1, len(ops.get_collection(ops.GraphKeys.SUMMARIES)))
+
+    with self.test_session(use_gpu=True) as sess:
+      sess.run(variables.global_variables_initializer())
+      summary.merge_all().eval()
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/gan/python/train.py b/tensorflow/contrib/gan/python/train.py
index 03f52d2..9e5aea1 100644
--- a/tensorflow/contrib/gan/python/train.py
+++ b/tensorflow/contrib/gan/python/train.py
@@ -52,7 +52,6 @@
 from tensorflow.python.training import sync_replicas_optimizer
 from tensorflow.python.training import training_util
 
-
 __all__ = [
     'gan_model',
     'infogan_model',
@@ -61,6 +60,7 @@
     'stargan_model',
     'gan_loss',
     'cyclegan_loss',
+    'stargan_loss',
     'gan_train_ops',
     'gan_train',
     'get_sequential_train_hooks',
@@ -646,8 +646,9 @@
         type(model))
 
   # Optionally create pooled model.
-  pooled_model = (_tensor_pool_adjusted_model(model, tensor_pool_fn) if
-                  tensor_pool_fn else model)
+  pooled_model = (
+      _tensor_pool_adjusted_model(model, tensor_pool_fn)
+      if tensor_pool_fn else model)
 
   # Create standard losses.
   gen_loss = generator_loss_fn(model, add_summaries=add_summaries)
@@ -665,9 +666,10 @@
   if _use_aux_loss(mutual_information_penalty_weight):
     gen_info_loss = tfgan_losses.mutual_information_penalty(
         model, add_summaries=add_summaries)
-    dis_info_loss = (gen_info_loss if tensor_pool_fn is None else
-                     tfgan_losses.mutual_information_penalty(
-                         pooled_model, add_summaries=add_summaries))
+    dis_info_loss = (
+        gen_info_loss
+        if tensor_pool_fn is None else tfgan_losses.mutual_information_penalty(
+            pooled_model, add_summaries=add_summaries))
     gen_loss += mutual_information_penalty_weight * gen_info_loss
     dis_loss += mutual_information_penalty_weight * dis_info_loss
   if _use_aux_loss(aux_cond_generator_weight):
diff --git a/tensorflow/contrib/graph_editor/__init__.py b/tensorflow/contrib/graph_editor/__init__.py
index 51b7f45..b2de2b9 100644
--- a/tensorflow/contrib/graph_editor/__init__.py
+++ b/tensorflow/contrib/graph_editor/__init__.py
@@ -14,7 +14,9 @@
 # ==============================================================================
 """TensorFlow Graph Editor.
 
-See the @{$python/contrib.graph_editor} guide.
+See the
+[Graph Editor](https://tensorflow.org/api_guides/python/contrib.graph_editor)
+guide.
 """
 
 from __future__ import absolute_import
diff --git a/tensorflow/contrib/graph_editor/transform.py b/tensorflow/contrib/graph_editor/transform.py
index 026a3d1..e79ccd8 100644
--- a/tensorflow/contrib/graph_editor/transform.py
+++ b/tensorflow/contrib/graph_editor/transform.py
@@ -129,7 +129,7 @@
       return None
 
 
-def copy_op_handler(info, op, new_inputs, copy_shape=True, nodedef_fn=None):
+def copy_op_handler(info, op, new_inputs, copy_shape=False, nodedef_fn=None):
   """Copy a `tf.Operation`.
 
   Args:
diff --git a/tensorflow/contrib/hadoop/BUILD b/tensorflow/contrib/hadoop/BUILD
new file mode 100644
index 0000000..ccad31e
--- /dev/null
+++ b/tensorflow/contrib/hadoop/BUILD
@@ -0,0 +1,117 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_custom_op_library",
+    "tf_custom_op_py_library",
+    "tf_gen_op_libs",
+    "tf_gen_op_wrapper_py",
+    "tf_kernel_library",
+    "tf_py_test",
+)
+
+filegroup(
+    name = "test_data",
+    srcs = glob(["python/kernel_tests/testdata/*"]),
+)
+
+py_library(
+    name = "hadoop",
+    srcs = ["__init__.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":dataset_ops",
+    ],
+)
+
+tf_custom_op_library(
+    name = "_dataset_ops.so",
+    srcs = ["ops/dataset_ops.cc"],
+    deps = [
+        ":dataset_kernels",
+    ],
+)
+
+tf_gen_op_libs(
+    op_lib_names = ["dataset_ops"],
+)
+
+cc_library(
+    name = "dataset_kernels",
+    srcs = ["kernels/hadoop_dataset_ops.cc"],
+    deps = [
+        "//tensorflow/core:framework_headers_lib",
+        "//third_party/eigen3",
+        "@protobuf_archive//:protobuf_headers",
+    ],
+    alwayslink = 1,
+)
+
+py_library(
+    name = "dataset_ops",
+    srcs = [
+        "python/ops/hadoop_dataset_ops.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":hadoop_op_loader",
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+    ],
+)
+
+tf_gen_op_wrapper_py(
+    name = "gen_dataset_ops",
+    out = "python/ops/gen_dataset_ops.py",
+    deps = ["//tensorflow/contrib/hadoop:dataset_ops_op_lib"],
+)
+
+tf_kernel_library(
+    name = "dataset_ops_kernels",
+    deps = [
+        ":dataset_kernels",
+        "//tensorflow/core:framework",
+    ],
+    alwayslink = 1,
+)
+
+tf_custom_op_py_library(
+    name = "hadoop_op_loader",
+    srcs = ["python/ops/hadoop_op_loader.py"],
+    dso = ["//tensorflow/contrib/hadoop:_dataset_ops.so"],
+    kernels = [
+        ":dataset_ops_kernels",
+        "//tensorflow/contrib/hadoop:dataset_ops_op_lib",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":gen_dataset_ops",
+        "//tensorflow/contrib/util:util_py",
+        "//tensorflow/python:platform",
+    ],
+)
+
+tf_py_test(
+    name = "hadoop_test",
+    srcs = ["python/kernel_tests/hadoop_test.py"],
+    additional_deps = [
+        ":hadoop",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+    ],
+    data = [
+        ":test_data",
+    ],
+    tags = [
+        "notap",
+    ],
+)
diff --git a/tensorflow/contrib/hadoop/__init__.py b/tensorflow/contrib/hadoop/__init__.py
new file mode 100644
index 0000000..abf8cd4
--- /dev/null
+++ b/tensorflow/contrib/hadoop/__init__.py
@@ -0,0 +1,32 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Sequence File Dataset.
+
+@@SequenceFileDataset
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.hadoop.python.ops.hadoop_dataset_ops import SequenceFileDataset
+
+from tensorflow.python.util.all_util import remove_undocumented
+
+_allowed_symbols = [
+    "SequenceFileDataset",
+]
+
+remove_undocumented(__name__)
diff --git a/tensorflow/contrib/hadoop/kernels/hadoop_dataset_ops.cc b/tensorflow/contrib/hadoop/kernels/hadoop_dataset_ops.cc
new file mode 100644
index 0000000..80b2d3e
--- /dev/null
+++ b/tensorflow/contrib/hadoop/kernels/hadoop_dataset_ops.cc
@@ -0,0 +1,340 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/lib/io/buffered_inputstream.h"
+#include "tensorflow/core/platform/file_system.h"
+
+namespace tensorflow {
+namespace {
+
+static const size_t kSyncMarkerSize = 16;
+static const size_t kSequenceFileBufferSize = 1024 * 1024;
+
+// Minimal reader for the Hadoop SequenceFile on-disk format. Supports only
+// uncompressed files whose key and value classes are both
+// `org.apache.hadoop.io.Text`.
+class SequenceFileReader {
+ public:
+  // Does not take ownership of `file`; the caller must keep it alive for the
+  // lifetime of this reader.
+  explicit SequenceFileReader(RandomAccessFile* file)
+      : input_stream_(
+            new io::BufferedInputStream(file, kSequenceFileBufferSize)) {}
+
+  // Parses the SequenceFile header: the "SEQ" magic plus version byte 6,
+  // key/value class names, compression flags, metadata pairs, and the sync
+  // marker. Must be called once before ReadRecord().
+  Status ReadHeader() {
+    string version;
+    TF_RETURN_IF_ERROR(input_stream_->ReadNBytes(4, &version));
+    // The fourth byte is the binary format version (the value 6), not the
+    // ASCII character '6'.
+    if (version.substr(0, 3) != "SEQ" || version[3] != 6) {
+      return errors::InvalidArgument(
+          "sequence file header must starts with `SEQ6`, received \"",
+          version.substr(0, 3), static_cast<int>(version[3]), "\"");
+    }
+    TF_RETURN_IF_ERROR(ReadString(&key_class_name_));
+    TF_RETURN_IF_ERROR(ReadString(&value_class_name_));
+
+    // At the moment we only support `org.apache.hadoop.io.Text` for key/value.
+    // TODO (yongtang): Add more class name support.
+    if (key_class_name_ != "org.apache.hadoop.io.Text" ||
+        value_class_name_ != "org.apache.hadoop.io.Text") {
+      return errors::Unimplemented("key/value of '", key_class_name_, "/",
+                                   value_class_name_,
+                                   "' is currently not supported");
+    }
+
+    string buffer;
+    TF_RETURN_IF_ERROR(input_stream_->ReadNBytes(2, &buffer));
+    compression_ = buffer[0];
+    block_compression_ = buffer[1];
+    if (compression_ || block_compression_) {
+      TF_RETURN_IF_ERROR(ReadString(&compression_codec_class_name_));
+    }
+
+    // At the moment no compression is supported.
+    // TODO (yongtang): Add compression support.
+    if (compression_ || block_compression_) {
+      return errors::Unimplemented("compression is currently not supported");
+    }
+
+    // Not interested in metadata for now; the 1024 cap guards against a
+    // corrupt header claiming an absurd number of pairs.
+    uint32 num_metadata_pairs = 0;
+    TF_RETURN_IF_ERROR(ReadUInt32(&num_metadata_pairs));
+    if (num_metadata_pairs > 1024) {
+      return errors::InvalidArgument(
+          "sequence file metadata should have key value pairs < 1024,  "
+          "received ",
+          num_metadata_pairs);
+    }
+    for (uint32 i = 0; i < num_metadata_pairs; i++) {
+      TF_RETURN_IF_ERROR(ReadString(nullptr));
+      TF_RETURN_IF_ERROR(ReadString(nullptr));
+    }
+
+    TF_RETURN_IF_ERROR(
+        input_stream_->ReadNBytes(kSyncMarkerSize, &sync_marker_));
+
+    return Status::OK();
+  }
+
+  // Reads the next (key, value) record, transparently validating and skipping
+  // sync-marker entries. Propagates OutOfRange from the stream at EOF.
+  Status ReadRecord(string* key, string* value) {
+    uint32 length = 0;
+    TF_RETURN_IF_ERROR(ReadUInt32(&length));
+    if (length == static_cast<uint32>(-1)) {
+      // A record length of -1 flags a sync-marker entry, not a real record.
+      string sync_marker;
+      TF_RETURN_IF_ERROR(
+          input_stream_->ReadNBytes(kSyncMarkerSize, &sync_marker));
+      if (sync_marker != sync_marker_) {
+        return errors::InvalidArgument(
+            "sequence file should have sync marker \"", sync_marker_,
+            "\" at pos ", input_stream_->Tell() - kSyncMarkerSize,
+            ", received \"", sync_marker, "\"");
+      }
+      return ReadRecord(key, value);
+    }
+    uint32 key_length = 0;
+    TF_RETURN_IF_ERROR(ReadUInt32(&key_length));
+    if (key_length > length) {
+      return errors::InvalidArgument("key length (", key_length,
+                                     ") should be < record length (", length,
+                                     ")");
+    }
+    // At the moment we only support `org.apache.hadoop.io.Text` for key/value.
+    // TODO (yongtang): Expand supported format.
+    TF_RETURN_IF_ERROR(ReadString(key));
+    TF_RETURN_IF_ERROR(ReadString(value));
+    return Status::OK();
+  }
+
+  // Reads a Hadoop Text string: a VInt byte length followed by that many
+  // bytes. When `value` is null the bytes are skipped instead of copied.
+  Status ReadString(string* value) {
+    int64 length = 0;
+    TF_RETURN_IF_ERROR(ReadVInt(&length));
+    if (value == nullptr) {
+      return input_stream_->SkipNBytes(length);
+    }
+    return input_stream_->ReadNBytes(length, value);
+  }
+
+  // Reads a big-endian unsigned 32-bit integer.
+  Status ReadUInt32(uint32* value) {
+    string buffer;
+    TF_RETURN_IF_ERROR(input_stream_->ReadNBytes(4, &buffer));
+    // Widen each byte through `unsigned char`: casting a (possibly signed)
+    // char straight to uint32 sign-extends bytes >= 0x80 and corrupts the
+    // high bits of the result.
+    *value =
+        (static_cast<uint32>(static_cast<unsigned char>(buffer[0])) << 24) |
+        (static_cast<uint32>(static_cast<unsigned char>(buffer[1])) << 16) |
+        (static_cast<uint32>(static_cast<unsigned char>(buffer[2])) << 8) |
+        static_cast<uint32>(static_cast<unsigned char>(buffer[3]));
+    return Status::OK();
+  }
+
+  // Decodes a Hadoop variable-length integer, matching the encoding of
+  // org.apache.hadoop.io.WritableUtils#readVLong.
+  Status ReadVInt(int64* value) {
+    string buffer;
+    TF_RETURN_IF_ERROR(input_stream_->ReadNBytes(1, &buffer));
+    // The leading byte is a *signed* 8-bit discriminator. `char` is unsigned
+    // on some platforms (e.g. ARM), so force a signed interpretation instead
+    // of comparing the raw char against -112.
+    const int8 first = static_cast<int8>(buffer[0]);
+    if (first >= -112) {
+      *value = static_cast<int64>(first);
+      return Status::OK();
+    }
+
+    int64 remaining = 0;
+    bool negative = false;
+    if (first >= -120) {
+      remaining = static_cast<int64>(-112) - static_cast<int64>(first);
+    } else {
+      remaining = static_cast<int64>(-120) - static_cast<int64>(first);
+      negative = true;
+    }
+    buffer.clear();
+    TF_RETURN_IF_ERROR(input_stream_->ReadNBytes(remaining, &buffer));
+
+    uint64 v = 0;
+    for (size_t i = 0; i < buffer.size(); i++) {
+      // Accumulate big-endian bytes; widen through `unsigned char` to avoid
+      // sign extension of bytes >= 0x80.
+      v = (v << 8) | static_cast<uint64>(static_cast<unsigned char>(buffer[i]));
+    }
+    if (negative) {
+      v = ~v;
+    }
+    *value = static_cast<int64>(v);
+    return Status::OK();
+  }
+
+  virtual ~SequenceFileReader() = default;
+
+ private:
+  std::unique_ptr<io::InputStreamInterface> input_stream_;
+  string key_class_name_;
+  string value_class_name_;
+  string sync_marker_;
+  bool compression_;
+  bool block_compression_;
+  string compression_codec_class_name_;
+  TF_DISALLOW_COPY_AND_ASSIGN(SequenceFileReader);
+};
+// Op kernel that builds a dataset emitting one (key, value) pair of string
+// scalars per record read from a list of Hadoop SequenceFiles.
+class SequenceFileDatasetOp : public DatasetOpKernel {
+ public:
+  using DatasetOpKernel::DatasetOpKernel;
+  explicit SequenceFileDatasetOp(OpKernelConstruction* ctx)
+      : DatasetOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
+    // The reader only produces strings, so every requested output type must
+    // be DT_STRING.
+    for (const DataType& dt : output_types_) {
+      OP_REQUIRES(ctx, dt == DT_STRING,
+                  errors::InvalidArgument(
+                      "Each element of `output_types_` must be one of: "
+                      "DT_STRING"));
+    }
+  }
+  // Validates the `filenames` input (scalar or vector of paths) and creates
+  // the Dataset that owns the file list.
+  void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override {
+    const Tensor* filenames_tensor;
+    OP_REQUIRES_OK(ctx, ctx->input("filenames", &filenames_tensor));
+    OP_REQUIRES(
+        ctx, filenames_tensor->dims() <= 1,
+        errors::InvalidArgument("`filenames` must be a scalar or a vector."));
+
+    std::vector<string> filenames;
+    filenames.reserve(filenames_tensor->NumElements());
+    for (int i = 0; i < filenames_tensor->NumElements(); ++i) {
+      filenames.push_back(filenames_tensor->flat<string>()(i));
+    }
+
+    *output = new Dataset(ctx, filenames, output_types_);
+  }
+
+ private:
+  class Dataset : public DatasetBase {
+   public:
+    Dataset(OpKernelContext* ctx, const std::vector<string>& filenames,
+            const DataTypeVector& output_types)
+        : DatasetBase(DatasetContext(ctx)),
+          filenames_(filenames),
+          output_types_(output_types) {}
+
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(
+          new Iterator({this, strings::StrCat(prefix, "::SequenceFile")}));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return output_types_;
+    }
+
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      // Two scalar outputs per element: the key and the value.
+      static std::vector<PartialTensorShape>* shapes =
+          new std::vector<PartialTensorShape>({{}, {}});
+      return *shapes;
+    }
+
+    string DebugString() const override {
+      return "SequenceFileDatasetOp::Dataset";
+    }
+
+   protected:
+    // Serializes this dataset as a graph node; only the filename list needs
+    // to be recorded (output_types is carried as an op attr).
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      Node* filenames = nullptr;
+      TF_RETURN_IF_ERROR(b->AddVector(filenames_, &filenames));
+      TF_RETURN_IF_ERROR(b->AddDataset(this, {filenames}, output));
+      return Status::OK();
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params) {}
+
+      // Emits the next (key, value) record, advancing through the filename
+      // list one file at a time. All state is guarded by mu_.
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        mutex_lock l(mu_);
+        do {
+          // We are currently processing a file, so try to read the next record.
+          if (reader_) {
+            string key, value;
+            Status status = reader_->ReadRecord(&key, &value);
+            if (!errors::IsOutOfRange(status)) {
+              // Any non-EOF error is propagated to the caller here.
+              TF_RETURN_IF_ERROR(status);
+
+              Tensor key_tensor(ctx->allocator({}), DT_STRING, {});
+              key_tensor.scalar<string>()() = key;
+              out_tensors->emplace_back(std::move(key_tensor));
+
+              Tensor value_tensor(ctx->allocator({}), DT_STRING, {});
+              value_tensor.scalar<string>()() = value;
+              out_tensors->emplace_back(std::move(value_tensor));
+
+              *end_of_sequence = false;
+              return Status::OK();
+            }
+            // We have reached the end of the current file, so maybe
+            // move on to next file.
+            ResetStreamsLocked();
+            ++current_file_index_;
+          }
+
+          // Iteration ends when there are no more files to process.
+          if (current_file_index_ == dataset()->filenames_.size()) {
+            *end_of_sequence = true;
+            return Status::OK();
+          }
+
+          TF_RETURN_IF_ERROR(SetupStreamsLocked(ctx->env()));
+        } while (true);
+      }
+
+     protected:
+      // Checkpointing of iterator position is not implemented yet.
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        return errors::Unimplemented("SaveInternal is currently not supported");
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        return errors::Unimplemented(
+            "RestoreInternal is currently not supported");
+      }
+
+     private:
+      // Sets up SequenceFile streams to read from the topic at
+      // `current_file_index_`.
+      Status SetupStreamsLocked(Env* env) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        if (current_file_index_ >= dataset()->filenames_.size()) {
+          return errors::InvalidArgument(
+              "current_file_index_:", current_file_index_,
+              " >= filenames_.size():", dataset()->filenames_.size());
+        }
+
+        // Actually move on to next file.
+        const string& filename = dataset()->filenames_[current_file_index_];
+        TF_RETURN_IF_ERROR(env->NewRandomAccessFile(filename, &file_));
+        reader_.reset(new SequenceFileReader(file_.get()));
+        return reader_->ReadHeader();
+      }
+
+      // Resets all Hadoop SequenceFile streams.
+      void ResetStreamsLocked() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        reader_.reset();
+        file_.reset();
+      }
+
+      mutex mu_;
+      size_t current_file_index_ GUARDED_BY(mu_) = 0;
+      // file_ must outlive reader_, which holds a raw pointer into it.
+      std::unique_ptr<RandomAccessFile> file_ GUARDED_BY(mu_);
+      std::unique_ptr<SequenceFileReader> reader_ GUARDED_BY(mu_);
+    };
+
+    const std::vector<string> filenames_;
+    const DataTypeVector output_types_;
+  };
+  DataTypeVector output_types_;
+};
+}  // namespace
+
+REGISTER_KERNEL_BUILDER(Name("SequenceFileDataset").Device(DEVICE_CPU),
+                        SequenceFileDatasetOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.h b/tensorflow/contrib/hadoop/ops/dataset_ops.cc
similarity index 61%
copy from tensorflow/compiler/xla/client/xla_client/xla_builder.h
copy to tensorflow/contrib/hadoop/ops/dataset_ops.cc
index ce2a8af..66ad549 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder.h
+++ b/tensorflow/contrib/hadoop/ops/dataset_ops.cc
@@ -13,9 +13,17 @@
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_XLA_CLIENT_XLA_BUILDER_H_
-#define TENSORFLOW_COMPILER_XLA_CLIENT_XLA_CLIENT_XLA_BUILDER_H_
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
 
-#include "tensorflow/compiler/xla/client/xla_builder.h"
+namespace tensorflow {
 
-#endif  // TENSORFLOW_COMPILER_XLA_CLIENT_XLA_CLIENT_XLA_BUILDER_H_
+// Registers the SequenceFileDataset op. `filenames` holds one or more file
+// paths (rank validated by the kernel); the op returns a variant-typed
+// dataset handle, so its shape is a scalar.
+REGISTER_OP("SequenceFileDataset")
+    .Input("filenames: string")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::ScalarShape);
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/hadoop/python/kernel_tests/hadoop_test.py b/tensorflow/contrib/hadoop/python/kernel_tests/hadoop_test.py
new file mode 100644
index 0000000..d796e43
--- /dev/null
+++ b/tensorflow/contrib/hadoop/python/kernel_tests/hadoop_test.py
@@ -0,0 +1,66 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not
+# use this file except in compliance with the License.  You may obtain a copy of
+# the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+# License for the specific language governing permissions and limitations under
+# the License.
+# ==============================================================================
+"""Tests for SequenceFileDataset."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.contrib.hadoop.python.ops import hadoop_dataset_ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.platform import resource_loader
+from tensorflow.python.platform import test
+
+
+class SequenceFileDatasetTest(test.TestCase):
+  """End-to-end test reading a checked-in Hadoop sequence file."""
+
+  def test_sequence_file_dataset(self):
+    """Test case for SequenceFileDataset.
+
+    The file is generated with `org.apache.hadoop.io.Text` for key/value.
+    There are 25 records in the file with the format of:
+    key = XXX
+    value = VALUEXXX
+    where XXX is replaced as the line number (starts with 001).
+    """
+    filename = os.path.join(resource_loader.get_data_files_path(),
+                            "testdata", "string.seq")
+
+    filenames = constant_op.constant([filename], dtypes.string)
+    num_repeats = 2
+
+    # Repeat the dataset to verify the iterator reopens the file correctly.
+    dataset = hadoop_dataset_ops.SequenceFileDataset(filenames).repeat(
+        num_repeats)
+    iterator = dataset.make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      for _ in range(num_repeats):  # Dataset is repeated.
+        for i in range(25):  # 25 records.
+          # Expected key/value bytes for record i (1-based, zero-padded).
+          v0 = b"%03d" % (i + 1)
+          v1 = b"VALUE%03d" % (i + 1)
+          self.assertEqual((v0, v1), sess.run(get_next))
+      # Once all repeats are exhausted the iterator must signal end-of-input.
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/hadoop/python/kernel_tests/testdata/string.seq b/tensorflow/contrib/hadoop/python/kernel_tests/testdata/string.seq
new file mode 100755
index 0000000..b717533
--- /dev/null
+++ b/tensorflow/contrib/hadoop/python/kernel_tests/testdata/string.seq
Binary files differ
diff --git a/tensorflow/contrib/hadoop/python/ops/hadoop_dataset_ops.py b/tensorflow/contrib/hadoop/python/ops/hadoop_dataset_ops.py
new file mode 100644
index 0000000..6e0e628
--- /dev/null
+++ b/tensorflow/contrib/hadoop/python/ops/hadoop_dataset_ops.py
@@ -0,0 +1,75 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""SequenceFile Dataset."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.hadoop.python.ops import gen_dataset_ops
+from tensorflow.contrib.hadoop.python.ops import hadoop_op_loader  # pylint: disable=unused-import
+from tensorflow.python.data.ops.dataset_ops import Dataset
+from tensorflow.python.data.util import nest
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+
+
+class SequenceFileDataset(Dataset):
+  """A Sequence File Dataset that reads the sequence file."""
+
+  def __init__(self, filenames):
+    """Create a `SequenceFileDataset`.
+
+    `SequenceFileDataset` allows a user to read data from a hadoop sequence
+    file. A sequence file consists of (key value) pairs sequentially. At
+    the moment, `org.apache.hadoop.io.Text` is the only serialization type
+    being supported, and there is no compression support.
+
+    For example:
+
+    ```python
+    dataset = tf.contrib.hadoop.SequenceFileDataset("/foo/bar.seq")
+    iterator = dataset.make_one_shot_iterator()
+    next_element = iterator.get_next()
+    # Prints the (key, value) pairs inside a hadoop sequence file.
+    with tf.Session() as sess:
+      while True:
+        try:
+          print(sess.run(next_element))
+        except tf.errors.OutOfRangeError:
+          break
+    ```
+
+    Args:
+      filenames: A `tf.string` tensor containing one or more filenames.
+    """
+    super(SequenceFileDataset, self).__init__()
+    self._filenames = ops.convert_to_tensor(
+        filenames, dtype=dtypes.string, name="filenames")
+
+  def _as_variant_tensor(self):
+    # Builds the variant-typed dataset handle backing this Python object.
+    return gen_dataset_ops.sequence_file_dataset(
+        self._filenames, nest.flatten(self.output_types))
+
+  @property
+  def output_classes(self):
+    # One plain tensor each for key and value.
+    return ops.Tensor, ops.Tensor
+
+  @property
+  def output_shapes(self):
+    # Key and value are both scalars.
+    return (tensor_shape.TensorShape([]), tensor_shape.TensorShape([]))
+
+  @property
+  def output_types(self):
+    # Both key and value are serialized as strings.
+    return dtypes.string, dtypes.string
diff --git a/tensorflow/contrib/hadoop/python/ops/hadoop_op_loader.py b/tensorflow/contrib/hadoop/python/ops/hadoop_op_loader.py
new file mode 100644
index 0000000..6dbf125
--- /dev/null
+++ b/tensorflow/contrib/hadoop/python/ops/hadoop_op_loader.py
@@ -0,0 +1,24 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Python helper for loading hadoop ops and kernels."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.util import loader
+from tensorflow.python.platform import resource_loader
+
+# Load the custom op shared library; importing this module registers the
+# hadoop dataset op and kernel with the TensorFlow runtime as a side effect.
+_dataset_ops = loader.load_op_library(
+    resource_loader.get_path_to_datafile("../../_dataset_ops.so"))
diff --git a/tensorflow/contrib/image/kernels/image_ops.cc b/tensorflow/contrib/image/kernels/image_ops.cc
index 022e17d..693724b 100644
--- a/tensorflow/contrib/image/kernels/image_ops.cc
+++ b/tensorflow/contrib/image/kernels/image_ops.cc
@@ -71,6 +71,7 @@
   void Compute(OpKernelContext* ctx) override {
     const Tensor& images_t = ctx->input(0);
     const Tensor& transform_t = ctx->input(1);
+    const Tensor& shape_t = ctx->input(2);
     OP_REQUIRES(ctx, images_t.shape().dims() == 4,
                 errors::InvalidArgument("Input images must have rank 4"));
     OP_REQUIRES(ctx,
@@ -81,11 +82,28 @@
                      ProjectiveGenerator<Device, T>::kNumParameters),
                 errors::InvalidArgument(
                     "Input transform should be num_images x 8 or 1 x 8"));
+    OP_REQUIRES(ctx, shape_t.dims() == 1,
+                errors::InvalidArgument("output shape must be 1-dimensional",
+                                        shape_t.shape().DebugString()));
+    OP_REQUIRES(ctx, shape_t.NumElements() == 2,
+                errors::InvalidArgument("output shape must have two elements",
+                                        shape_t.shape().DebugString()));
+    auto shape_vec = shape_t.vec<int32>();
+    int32 out_height = shape_vec(0);
+    int32 out_width = shape_vec(1);
+    OP_REQUIRES(ctx, out_height > 0 && out_width > 0,
+                errors::InvalidArgument("output dimensions must be positive"));
+
+    Tensor* output_t;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(
+                            0,
+                            TensorShape({images_t.dim_size(0), out_height,
+                                         out_width, images_t.dim_size(3)}),
+                            &output_t));
+    auto output = output_t->tensor<T, 4>();
     auto images = images_t.tensor<T, 4>();
     auto transform = transform_t.matrix<float>();
-    Tensor* output_t;
-    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, images_t.shape(), &output_t));
-    auto output = output_t->tensor<T, 4>();
+
     (FillProjectiveTransform<Device, T>(interpolation_))(
         ctx->eigen_device<Device>(), &output, images, transform);
   }
@@ -129,10 +147,11 @@
 
 }  // end namespace functor
 
-#define REGISTER(TYPE)                                        \
-  REGISTER_KERNEL_BUILDER(Name("ImageProjectiveTransform")    \
-                              .Device(DEVICE_GPU)             \
-                              .TypeConstraint<TYPE>("dtype"), \
+#define REGISTER(TYPE)                                       \
+  REGISTER_KERNEL_BUILDER(Name("ImageProjectiveTransform")   \
+                              .Device(DEVICE_GPU)            \
+                              .TypeConstraint<TYPE>("dtype") \
+                              .HostMemory("output_shape"),   \
                           ImageProjectiveTransform<GPUDevice, TYPE>)
 
 TF_CALL_uint8(REGISTER);
diff --git a/tensorflow/contrib/image/kernels/image_ops.h b/tensorflow/contrib/image/kernels/image_ops.h
index 209aa24..6b63eed 100644
--- a/tensorflow/contrib/image/kernels/image_ops.h
+++ b/tensorflow/contrib/image/kernels/image_ops.h
@@ -167,7 +167,7 @@
   void operator()(const Device& device, OutputType* output,
                   const InputType& images,
                   const TransformsType& transform) const {
-    output->device(device) = images.generate(
+    output->device(device) = output->generate(
         ProjectiveGenerator<Device, T>(images, transform, interpolation_));
   }
 };
diff --git a/tensorflow/contrib/image/ops/image_ops.cc b/tensorflow/contrib/image/ops/image_ops.cc
index e59f1bf..4969ac5 100644
--- a/tensorflow/contrib/image/ops/image_ops.cc
+++ b/tensorflow/contrib/image/ops/image_ops.cc
@@ -19,23 +19,66 @@
 
 namespace tensorflow {
 
+using shape_inference::DimensionHandle;
 using shape_inference::InferenceContext;
 using shape_inference::ShapeHandle;
 
+namespace {
+
+// Sets output[0] to shape [batch_dim,height,width,channel_dim], where
+// height and width come from the size_tensor.
+Status SetOutputToSizedImage(InferenceContext* c, DimensionHandle batch_dim,
+                             int size_input_idx, DimensionHandle channel_dim) {
+  // Verify shape of size input.
+  ShapeHandle size;
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(size_input_idx), 1, &size));
+  DimensionHandle unused;
+  // The size input must be a vector of exactly two elements.
+  TF_RETURN_IF_ERROR(c->WithValue(c->Dim(size, 0), 2, &unused));
+
+  // Get size values from the size tensor.
+  const Tensor* size_tensor = c->input_tensor(size_input_idx);
+  DimensionHandle width;
+  DimensionHandle height;
+  if (size_tensor == nullptr) {
+    // The size is not a graph-time constant, so the spatial dims stay unknown.
+    width = c->UnknownDim();
+    height = c->UnknownDim();
+  } else {
+    // TODO(petewarden) - Remove once we have constant evaluation in C++ only.
+    if (size_tensor->dtype() != DT_INT32) {
+      return errors::InvalidArgument(
+          "Bad size input type for SetOutputToSizedImage: Expected DT_INT32 "
+          "but got ",
+          DataTypeString(size_tensor->dtype()), " for input #", size_input_idx,
+          " in ", c->DebugString());
+    }
+    // Size tensor layout is [height, width].
+    auto vec = size_tensor->vec<int32>();
+    height = c->MakeDim(vec(0));
+    width = c->MakeDim(vec(1));
+  }
+  c->set_output(0, c->MakeShape({batch_dim, height, width, channel_dim}));
+  return Status::OK();
+}
+
+// TODO(qyu): Move this to core/framework/common_shape_fns.h
+// Shape function for image ops whose input 0 is a rank-4 (NHWC) image batch
+// and whose input 2 is a [height, width] output-size vector: the output keeps
+// the batch and channel dims and takes height/width from the size input.
+Status ResizeShapeFn(InferenceContext* c) {
+  ShapeHandle input;
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input));
+  return SetOutputToSizedImage(c, c->Dim(input, 0), 2 /* size_input_idx */,
+                               c->Dim(input, 3));
+}
+
+}  // namespace
+
 // TODO(ringwalt): Add a "fill_mode" argument with "constant", "mirror", etc.
 // TODO(ringwalt): Add a "fill_constant" argument for constant mode (default 0).
-// TODO(ringwalt): Add an "output_shape" argument. This is sufficient to
-// implement "same" and "valid" modes in the Python function.
 REGISTER_OP("ImageProjectiveTransform")
     .Input("images: dtype")
     .Input("transforms: float32")
+    .Input("output_shape: int32")
     .Attr("dtype: {uint8, int32, int64, float16, float32, float64}")
     .Attr("interpolation: string")
     .Output("transformed_images: dtype")
-    .SetShapeFn([](InferenceContext* c) {
-      c->set_output(0, c->input(0));
-      return Status::OK();
-    })
+    .SetShapeFn(ResizeShapeFn)
     .Doc(R"doc(
 Applies the given transform to each of the images.
 
@@ -49,7 +92,7 @@
 the *output* point `(x, y)` to a transformed *input* point
 `(x', y') = ((a0 x + a1 y + a2) / k, (b0 x + b1 y + b2) / k)`, where
 `k = c0 x + c1 y + 1`. If the transformed point lays outside of the input
-image, the output pixel is set to 0. The output is the same size as the input,
+image, the output pixel is set to 0.
 
 images: 4D `Tensor`, input image(s) in NHWC format.
 transforms: 2D `Tensor`, projective transform(s) to apply to the image(s).
diff --git a/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py b/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py
index 62a22dc..f588eae 100644
--- a/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py
+++ b/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py
@@ -27,6 +27,7 @@
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
 from tensorflow.python.platform import googletest
 
 _DTYPES = set(
@@ -194,6 +195,19 @@
            [0.0, 149, 233, 149, 0.0],
            [0.0, 0.0, 87., 0.0, 0.0]])
 
+  def test_rotate_static_shape(self):
+    """Rotation by a graph-time-unknown angle must keep the static shape."""
+    image = array_ops.diag([1., 2., 3.])
+    result = image_ops.rotate(
+        image, random_ops.random_uniform((), -1, 1), interpolation="BILINEAR")
+    self.assertEqual(image.get_shape(), result.get_shape())
+
+  def test_transform_static_output_shape(self):
+    """A constant `output_shape` must propagate to the static result shape."""
+    image = constant_op.constant([[1., 2.], [3., 4.]])
+    result = image_ops.transform(
+        image, random_ops.random_uniform([8], -1, 1),
+        output_shape=constant_op.constant([3, 5]))
+    self.assertAllEqual([3, 5], result.get_shape())
+
   def _test_grad(self, shape_to_test):
     with self.test_session():
       test_image_shape = shape_to_test
@@ -213,10 +227,40 @@
           x_init_value=test_image)
       self.assertLess(left_err, 1e-10)
 
+  def _test_grad_different_shape(self, input_shape, output_shape):
+    """Checks transform gradients when the output shape differs from input.
+
+    Args:
+      input_shape: Shape of the test image; may be 2-D (HW), 3-D (HWC) or
+        4-D (NHWC).
+      output_shape: Expected shape of the transformed output, with the same
+        rank as `input_shape`.
+    """
+    with self.test_session():
+      test_image_shape = input_shape
+      test_image = np.random.randn(*test_image_shape)
+      test_image_tensor = constant_op.constant(
+          test_image, shape=test_image_shape)
+      test_transform = image_ops.angles_to_projective_transforms(
+          np.pi / 2, 4, 4)
+
+      # Extract the [height, width] pair from the rank-dependent layout.
+      # NOTE(review): ranks other than 2/3/4 would leave `resize_shape`
+      # undefined and raise NameError below — callers must respect this.
+      if len(output_shape) == 2:
+        resize_shape = output_shape
+      elif len(output_shape) == 3:
+        resize_shape = output_shape[0:2]
+      elif len(output_shape) == 4:
+        resize_shape = output_shape[1:3]
+      output = image_ops.transform(
+          images=test_image_tensor,
+          transforms=test_transform,
+          output_shape=resize_shape)
+      # Numerical-vs-analytic gradient error should be negligible.
+      left_err = gradient_checker.compute_gradient_error(
+          test_image_tensor,
+          test_image_shape,
+          output,
+          output_shape,
+          x_init_value=test_image)
+      self.assertLess(left_err, 1e-10)
+
   def test_grad(self):
     self._test_grad([16, 16])
     self._test_grad([4, 12, 12])
     self._test_grad([3, 4, 12, 12])
+    self._test_grad_different_shape([16, 16], [8, 8])
+    self._test_grad_different_shape([4, 12, 3], [8, 24, 3])
+    self._test_grad_different_shape([3, 4, 12, 3], [3, 8, 24, 3])
 
 
 class BipartiteMatchTest(test_util.TensorFlowTestCase):
diff --git a/tensorflow/contrib/image/python/kernel_tests/interpolate_spline_test.py b/tensorflow/contrib/image/python/kernel_tests/interpolate_spline_test.py
index 1939caa..3054128 100644
--- a/tensorflow/contrib/image/python/kernel_tests/interpolate_spline_test.py
+++ b/tensorflow/contrib/image/python/kernel_tests/interpolate_spline_test.py
@@ -26,6 +26,7 @@
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import gradients
 from tensorflow.python.ops import math_ops
@@ -226,6 +227,81 @@
           interp_val = sess.run(interpolator)
           self.assertAllClose(interp_val[0, :, 0], target_interpolation)
 
+  def test_nd_linear_interpolation_unspecified_shape(self):
+    """Ensure that interpolation supports dynamic batch_size and num_points."""
+
+    tp = _QuadraticPlusSinProblemND()
+    (query_points, _, train_points,
+     train_values) = tp.get_problem(dtype='float64')
+
+    # Construct placeholders such that the batch size, number of train points,
+    # and number of query points are not known at graph construction time.
+    feature_dim = query_points.shape[-1]
+    value_dim = train_values.shape[-1]
+    train_points_ph = array_ops.placeholder(
+        dtype=train_points.dtype, shape=[None, None, feature_dim])
+    train_values_ph = array_ops.placeholder(
+        dtype=train_values.dtype, shape=[None, None, value_dim])
+    query_points_ph = array_ops.placeholder(
+        dtype=query_points.dtype, shape=[None, None, feature_dim])
+
+    order = 1
+    reg_weight = 0.01
+
+    interpolator = interpolate_spline.interpolate_spline(
+        train_points_ph, train_values_ph, query_points_ph, order, reg_weight)
+
+    target_interpolation = tp.HARDCODED_QUERY_VALUES[(order, reg_weight)]
+    target_interpolation = np.array(target_interpolation)
+    with self.test_session() as sess:
+
+      (train_points_value, train_values_value, query_points_value) = sess.run(
+          [train_points, train_values, query_points])
+
+      interp_val = sess.run(
+          interpolator,
+          feed_dict={
+              train_points_ph: train_points_value,
+              train_values_ph: train_values_value,
+              query_points_ph: query_points_value
+          })
+      self.assertAllClose(interp_val[0, :, 0], target_interpolation)
+
+  def test_fully_unspecified_shape(self):
+    """Ensure that an error is thrown when input/output dims are unspecified."""
+
+    tp = _QuadraticPlusSinProblemND()
+    (query_points, _, train_points,
+     train_values) = tp.get_problem(dtype='float64')
+
+    # Construct placeholders such that the batch size, number of train points,
+    # and number of query points are not known at graph construction time.
+    feature_dim = query_points.shape[-1]
+    value_dim = train_values.shape[-1]
+    train_points_ph = array_ops.placeholder(
+        dtype=train_points.dtype, shape=[None, None, feature_dim])
+    train_points_ph_invalid = array_ops.placeholder(
+        dtype=train_points.dtype, shape=[None, None, None])
+    train_values_ph = array_ops.placeholder(
+        dtype=train_values.dtype, shape=[None, None, value_dim])
+    train_values_ph_invalid = array_ops.placeholder(
+        dtype=train_values.dtype, shape=[None, None, None])
+    query_points_ph = array_ops.placeholder(
+        dtype=query_points.dtype, shape=[None, None, feature_dim])
+
+    order = 1
+    reg_weight = 0.01
+
+    with self.assertRaises(ValueError):
+      _ = interpolate_spline.interpolate_spline(
+          train_points_ph_invalid, train_values_ph, query_points_ph, order,
+          reg_weight)
+
+    with self.assertRaises(ValueError):
+      _ = interpolate_spline.interpolate_spline(
+          train_points_ph, train_values_ph_invalid, query_points_ph, order,
+          reg_weight)
+
   def test_interpolation_gradient(self):
     """Make sure that backprop can run. Correctness of gradients is assumed.
 
diff --git a/tensorflow/contrib/image/python/ops/image_ops.py b/tensorflow/contrib/image/python/ops/image_ops.py
index 86b0ffe..e7a0904 100644
--- a/tensorflow/contrib/image/python/ops/image_ops.py
+++ b/tensorflow/contrib/image/python/ops/image_ops.py
@@ -23,6 +23,7 @@
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import linalg_ops
@@ -40,6 +41,9 @@
 ops.RegisterShape("ImageProjectiveTransform")(common_shapes.call_cpp_shape_fn)
 
 
+# TODO(ringwalt): Support a "reshape" (name used by SciPy) or "expand" (name
+# used by PIL, maybe more readable) mode, which determines the correct
+# output_shape and translation for the transform.
 def rotate(images, angles, interpolation="NEAREST", name=None):
   """Rotate image(s) counterclockwise by the passed angle(s) in radians.
 
@@ -213,7 +217,11 @@
         axis=1)
 
 
-def transform(images, transforms, interpolation="NEAREST", name=None):
+def transform(images,
+              transforms,
+              interpolation="NEAREST",
+              output_shape=None,
+              name=None):
   """Applies the given transform(s) to the image(s).
 
   Args:
@@ -230,6 +238,10 @@
        the transform mapping input points to output points. Note that gradients
        are not backpropagated into transformation parameters.
     interpolation: Interpolation mode. Supported values: "NEAREST", "BILINEAR".
+    output_shape: Output dimension after the transform, [height, width].
+       If None, output is the same size as input image.
+
+    name: The name of the op.
 
   Returns:
     Image(s) with the same type and shape as `images`, with the given
@@ -238,6 +250,7 @@
 
   Raises:
     TypeError: If `image` is an invalid type.
+    ValueError: If output_shape is not a 1-D int32 Tensor of 2 elements.
   """
   with ops.name_scope(name, "transform"):
     image_or_images = ops.convert_to_tensor(images, name="images")
@@ -256,6 +269,17 @@
     else:
       raise TypeError("Images should have rank between 2 and 4.")
 
+    if output_shape is None:
+      output_shape = tensor_util.constant_value(
+          array_ops.shape(images)[1:3]) or array_ops.shape(images)[1:3]
+
+    output_shape = ops.convert_to_tensor(
+        output_shape, dtypes.int32, name="output_shape")
+
+    if not output_shape.get_shape().is_compatible_with([2]):
+      raise ValueError("output_shape must be a 1-D Tensor of 2 elements: "
+                       "new_height, new_width")
+
     if len(transform_or_transforms.get_shape()) == 1:
       transforms = transform_or_transforms[None]
     elif transform_or_transforms.get_shape().ndims is None:
@@ -265,8 +289,12 @@
       transforms = transform_or_transforms
     else:
       raise TypeError("Transforms should have rank 1 or 2.")
+
     output = gen_image_ops.image_projective_transform(
-        images, transforms, interpolation=interpolation.upper())
+        images,
+        output_shape=output_shape,
+        transforms=transforms,
+        interpolation=interpolation.upper())
     if len(image_or_images.get_shape()) == 2:
       return output[0, :, :, 0]
     elif len(image_or_images.get_shape()) == 3:
@@ -376,14 +404,6 @@
 
   if image_or_images.dtype.base_dtype not in _IMAGE_DTYPES:
     raise TypeError("Invalid dtype %s." % image_or_images.dtype)
-  if len(image_or_images.get_shape()) == 2:
-    images = image_or_images[None, :, :, None]
-  elif len(image_or_images.get_shape()) == 3:
-    images = image_or_images[None, :, :, :]
-  elif len(image_or_images.get_shape()) == 4:
-    images = image_or_images
-  else:
-    raise TypeError("Images should have rank between 2 and 4")
   if len(transform_or_transforms.get_shape()) == 1:
     transforms = transform_or_transforms[None]
   elif len(transform_or_transforms.get_shape()) == 2:
@@ -396,13 +416,11 @@
   inverse = linalg_ops.matrix_inverse(transforms)
   transforms = matrices_to_flat_transforms(inverse)
   output = gen_image_ops.image_projective_transform(
-      grad, transforms, interpolation=interpolation)
-  if len(image_or_images.get_shape()) == 2:
-    return [output[0, :, :, 0], None]
-  elif len(image_or_images.get_shape()) == 3:
-    return [output[0, :, :, :], None]
-  else:
-    return [output, None]
+      images=grad,
+      transforms=transforms,
+      output_shape=array_ops.shape(image_or_images)[1:3],
+      interpolation=interpolation)
+  return [output, None, None]
 
 
 def bipartite_match(distance_mat,
diff --git a/tensorflow/contrib/image/python/ops/interpolate_spline.py b/tensorflow/contrib/image/python/ops/interpolate_spline.py
index daf8c56..f0b408f 100644
--- a/tensorflow/contrib/image/python/ops/interpolate_spline.py
+++ b/tensorflow/contrib/image/python/ops/interpolate_spline.py
@@ -17,9 +17,6 @@
 from __future__ import division
 from __future__ import print_function
 
-import numpy as np
-
-from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import linalg_ops
@@ -95,10 +92,22 @@
   Returns:
     w: `[b, n, k]` weights on each interpolation center
     v: `[b, d, k]` weights on each input dimension
+  Raises:
+    ValueError: if d or k is not fully specified.
   """
 
-  b, n, d = train_points.get_shape().as_list()
-  _, _, k = train_values.get_shape().as_list()
+  # These dimensions are set dynamically at runtime.
+  b, n, _ = array_ops.unstack(array_ops.shape(train_points), num=3)
+
+  d = train_points.shape[-1]
+  if d.value is None:
+    raise ValueError('The dimensionality of the input points (d) must be '
+                     'statically-inferrable.')
+
+  k = train_values.shape[-1]
+  if k.value is None:
+    raise ValueError('The dimensionality of the output values (k) must be '
+                     'statically-inferrable.')
 
   # First, rename variables so that the notation (c, f, w, v, A, B, etc.)
   # follows https://en.wikipedia.org/wiki/Polyharmonic_spline.
@@ -113,14 +122,12 @@
 
     matrix_a = _phi(_pairwise_squared_distance_matrix(c), order)  # [b, n, n]
     if regularization_weight > 0:
-      batch_identity_matrix = np.expand_dims(np.eye(n), 0)
-      batch_identity_matrix = constant_op.constant(
-          batch_identity_matrix, dtype=train_points.dtype)
-
+      batch_identity_matrix = array_ops.expand_dims(
+          linalg_ops.eye(n, dtype=c.dtype), 0)
       matrix_a += regularization_weight * batch_identity_matrix
 
     # Append ones to the feature values for the bias term in the linear model.
-    ones = array_ops.ones([b, n, 1], train_points.dtype)
+    ones = array_ops.ones_like(c[..., :1], dtype=c.dtype)
     matrix_b = array_ops.concat([c, ones], 2)  # [b, n, d + 1]
 
     # [b, n + d + 1, n]
@@ -164,9 +171,6 @@
     Polyharmonic interpolation evaluated at points defined in query_points.
   """
 
-  batch_size = train_points.get_shape()[0].value
-  num_query_points = query_points.get_shape()[1].value
-
   # First, compute the contribution from the rbf term.
   pairwise_dists = _cross_squared_distance_matrix(query_points, train_points)
   phi_pairwise_dists = _phi(pairwise_dists, order)
@@ -177,7 +181,7 @@
   # Pad query_points with ones, for the bias term in the linear model.
   query_points_pad = array_ops.concat([
       query_points,
-      array_ops.ones([batch_size, num_query_points, 1], train_points.dtype)
+      array_ops.ones_like(query_points[..., :1], train_points.dtype)
   ], 2)
   linear_term = math_ops.matmul(query_points_pad, v)
 
@@ -251,6 +255,9 @@
   Note the interpolation procedure is differentiable with respect to all inputs
   besides the order parameter.
 
+  We support dynamically-shaped inputs, where batch_size, n, and m are None
+  at graph construction time. However, d and k must be known.
+
   Args:
     train_points: `[batch_size, n, d]` float `Tensor` of n d-dimensional
       locations. These do not need to be regularly-spaced.
diff --git a/tensorflow/contrib/image/python/ops/sparse_image_warp.py b/tensorflow/contrib/image/python/ops/sparse_image_warp.py
index 54a215d..1ea8f70 100644
--- a/tensorflow/contrib/image/python/ops/sparse_image_warp.py
+++ b/tensorflow/contrib/image/python/ops/sparse_image_warp.py
@@ -112,10 +112,10 @@
   Apply a non-linear warp to the image, where the warp is specified by
   the source and destination locations of a (potentially small) number of
   control points. First, we use a polyharmonic spline
-  (@{tf.contrib.image.interpolate_spline}) to interpolate the displacements
+  (`tf.contrib.image.interpolate_spline`) to interpolate the displacements
   between the corresponding control points to a dense flow field.
   Then, we warp the image using this dense flow field
-  (@{tf.contrib.image.dense_image_warp}).
+  (`tf.contrib.image.dense_image_warp`).
 
   Let t index our control points. For regularization_weight=0, we have:
   warped_image[b, dest_control_point_locations[b, t, 0],
@@ -126,7 +126,7 @@
   For regularization_weight > 0, this condition is met approximately, since
   regularized interpolation trades off smoothness of the interpolant vs.
   reconstruction of the interpolant at the control points.
-  See @{tf.contrib.image.interpolate_spline} for further documentation of the
+  See `tf.contrib.image.interpolate_spline` for further documentation of the
   interpolation_order and regularization_weight arguments.
 
 
diff --git a/tensorflow/contrib/integrate/__init__.py b/tensorflow/contrib/integrate/__init__.py
index 694f0c1..3c37f15 100644
--- a/tensorflow/contrib/integrate/__init__.py
+++ b/tensorflow/contrib/integrate/__init__.py
@@ -15,7 +15,9 @@
 
 """Integration and ODE solvers.
 
-See the @{$python/contrib.integrate} guide.
+See the
+[Contrib Integrate](https://tensorflow.org/api_guides/python/contrib.integrate)
+guide.
 
 @@odeint
 @@odeint_fixed
diff --git a/tensorflow/contrib/integrate/python/ops/odes.py b/tensorflow/contrib/integrate/python/ops/odes.py
index 61f78fe..7b7ac4f 100644
--- a/tensorflow/contrib/integrate/python/ops/odes.py
+++ b/tensorflow/contrib/integrate/python/ops/odes.py
@@ -73,7 +73,7 @@
     # _possibly_nonzero lets us avoid wasted computation.
     return math_ops.add_n(
         [(scale * x) * y for x, y in zip(xs, ys)
-         if _possibly_nonzero(x) or _possibly_nonzero(y)],
+         if _possibly_nonzero(x) and _possibly_nonzero(y)],
         name=scope)
 
 
@@ -122,7 +122,7 @@
       yi = y0 + _scaled_dot_product(dt_cast, beta_i, k)
       k.append(func(yi, ti))
 
-    if not (tableau.c_sol[-1] == 0 and tableau.c_sol == tableau.beta[-1]):
+    if not (tableau.c_sol[-1] == 0 and tableau.c_sol[:-1] == tableau.beta[-1]):
       # This property (true for Dormand-Prince) lets us save a few FLOPs.
       yi = y0 + _scaled_dot_product(dt_cast, tableau.c_sol, k)
 
diff --git a/tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc b/tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc
index 2638b25..d0ea961 100644
--- a/tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc
+++ b/tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc
@@ -15,7 +15,7 @@
 
 #include "tensorflow/core/framework/dataset.h"
 
-#include "src-cpp/rdkafkacpp.h"
+#include "rdkafkacpp.h"
 
 namespace tensorflow {
 
@@ -52,12 +52,12 @@
   }
 
  private:
-  class Dataset : public GraphDatasetBase {
+  class Dataset : public DatasetBase {
    public:
     Dataset(OpKernelContext* ctx, std::vector<string> topics,
             const string& servers, const string& group, const bool eof,
             const int64 timeout)
-        : GraphDatasetBase(ctx),
+        : DatasetBase(DatasetContext(ctx)),
           topics_(std::move(topics)),
           servers_(servers),
           group_(group),
@@ -84,7 +84,8 @@
     string DebugString() const override { return "KafkaDatasetOp::Dataset"; }
 
    protected:
-    Status AsGraphDefInternal(DatasetGraphDefBuilder* b,
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
                               Node** output) const override {
       Node* topics = nullptr;
       TF_RETURN_IF_ERROR(b->AddVector(topics_, &topics));
diff --git a/tensorflow/contrib/keras/__init__.py b/tensorflow/contrib/keras/__init__.py
index a162f0c..cecf1dd 100644
--- a/tensorflow/contrib/keras/__init__.py
+++ b/tensorflow/contrib/keras/__init__.py
@@ -15,7 +15,7 @@
 # ==============================================================================
 """Implementation of the Keras API meant to be a high-level API for TensorFlow.
 
-This module an alias for @{tf.keras}, for backwards compatibility.
+This module is an alias for `tf.keras`, for backwards compatibility.
 
 Detailed documentation and user guides are also available at
 [keras.io](https://keras.io).
diff --git a/tensorflow/contrib/keras/api/keras/preprocessing/image/__init__.py b/tensorflow/contrib/keras/api/keras/preprocessing/image/__init__.py
index 1f9e82b..cb649a3 100644
--- a/tensorflow/contrib/keras/api/keras/preprocessing/image/__init__.py
+++ b/tensorflow/contrib/keras/api/keras/preprocessing/image/__init__.py
@@ -18,10 +18,8 @@
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.keras.preprocessing.image import apply_transform
 from tensorflow.python.keras.preprocessing.image import array_to_img
 from tensorflow.python.keras.preprocessing.image import DirectoryIterator
-from tensorflow.python.keras.preprocessing.image import flip_axis
 from tensorflow.python.keras.preprocessing.image import ImageDataGenerator
 from tensorflow.python.keras.preprocessing.image import img_to_array
 from tensorflow.python.keras.preprocessing.image import Iterator
diff --git a/tensorflow/contrib/kernel_methods/README.md b/tensorflow/contrib/kernel_methods/README.md
index 44ed967..1bce327 100644
--- a/tensorflow/contrib/kernel_methods/README.md
+++ b/tensorflow/contrib/kernel_methods/README.md
@@ -21,13 +21,15 @@
 output. More mappers are on the way.
 
 ## Kernel-based Estimators
-These are estimators inheriting from the @{tf.contrib.learn.Estimator} class and
-use kernel mappers internally to discover non-linearities in the data. These
-canned estimators map their input features using kernel mapper Ops and then
-apply linear models to the mapped features. Combining kernel mappers with linear
-models and different loss functions leads to a variety of models: linear and
-non-linear SVMs, linear regression (with and without kernels) and (multinomial)
-logistic regression (with and without kernels).
+
+These estimators inherit from the
+[`tf.contrib.learn.Estimator`](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn/estimators/estimator.py)
+class and use kernel mappers internally to discover non-linearities in the
+data. These canned estimators map their input features using kernel mapper
+Ops and then apply linear models to the mapped features. Combining kernel
+mappers with linear models and different loss functions leads to a variety of
+models: linear and non-linear SVMs, linear regression (with and without
+kernels) and (multinomial) logistic regression (with and without kernels).
 
 Currently there is a [KernelLinearClassifier](https://www.tensorflow.org/code/tensorflow/contrib/kernel_methods/python/kernel_estimators.py) implemented but more pre-packaged estimators
 are on the way.
diff --git a/tensorflow/contrib/kfac/examples/convnet.py b/tensorflow/contrib/kfac/examples/convnet.py
index d6b1a61..44e01e1 100644
--- a/tensorflow/contrib/kfac/examples/convnet.py
+++ b/tensorflow/contrib/kfac/examples/convnet.py
@@ -202,7 +202,7 @@
     accuracy: 0-D Tensor. Accuracy of classifier on current minibatch.
     layer_collection: LayerCollection instance describing model architecture.
       Used by K-FAC to construct preconditioner.
-    device: string, Either '/cpu:0' or '/gpu:0'. The covaraince and invserse
+    device: string, Either '/cpu:0' or '/gpu:0'. The covariance and inverse
       update ops are run on this device.
     session_config: None or tf.ConfigProto. Configuration for tf.Session().
 
@@ -470,7 +470,7 @@
     data_dir: string. Directory to read MNIST examples from.
     num_epochs: int. Number of passes to make over the training set.
     use_fake_data: bool. If True, generate a synthetic dataset.
-    device: string, Either '/cpu:0' or '/gpu:0'. The covaraince and inverse
+    device: string, Either '/cpu:0' or '/gpu:0'. The covariance and inverse
       update ops are run on this device.
 
   Returns:
@@ -509,7 +509,7 @@
     num_epochs: int. Number of passes to make over the training set.
     num_towers: int. Number of CPUs to split inference across.
     use_fake_data: bool. If True, generate a synthetic dataset.
-    devices: string, Either list of CPU or GPU. The covaraince and inverse
+    devices: string, Either list of CPU or GPU. The covariance and inverse
       update ops are run on this device.
 
   Returns:
@@ -621,7 +621,7 @@
     data_dir: string. Directory to read MNIST examples from.
     num_epochs: int. Number of passes to make over the training set.
     op_strategy: `string`, Strategy to run the covariance and inverse
-      ops. If op_strategy == `chief_worker` then covaraiance and inverse
+      ops. If op_strategy == `chief_worker` then covariance and inverse
       update ops are run on chief worker otherwise they are run on dedicated
       workers.
 
diff --git a/tensorflow/contrib/kfac/python/ops/estimator.py b/tensorflow/contrib/kfac/python/ops/estimator.py
index 854f885..323234c 100644
--- a/tensorflow/contrib/kfac/python/ops/estimator.py
+++ b/tensorflow/contrib/kfac/python/ops/estimator.py
@@ -97,8 +97,8 @@
           and to regularize the update direction by making it closer to the
           gradient. (Higher damping means the update looks more like a standard
           gradient update - see Tikhonov regularization.)
-      layer_collection: The layer collection object, which holds the fisher
-          blocks, kronecker factors, and losses associated with the
+      layer_collection: The layer collection object, which holds the Fisher
+          blocks, Kronecker factors, and losses associated with the
           graph.
       exps: List of floats or ints. These represent the different matrix
           powers of the approximate Fisher that the FisherEstimator will be able
@@ -464,7 +464,7 @@
 
   def _get_grads_lists_empirical(self, tensors):
     # Passing in a list of loss values is better than passing in the sum as
-    # the latter creates unnessesary ops on the default device
+    # the latter creates unnecessary ops on the default device
     grads_flat = gradients_impl.gradients(
         self._layers.eval_losses(),
         nest.flatten(tensors),
diff --git a/tensorflow/contrib/kfac/python/ops/fisher_blocks.py b/tensorflow/contrib/kfac/python/ops/fisher_blocks.py
index 3a5c8eb..9fa6eb7 100644
--- a/tensorflow/contrib/kfac/python/ops/fisher_blocks.py
+++ b/tensorflow/contrib/kfac/python/ops/fisher_blocks.py
@@ -870,7 +870,7 @@
   Estimates the Fisher Information matrix's blog for a convolutional
   layer.
 
-  Consider a convoluational layer in this model with (unshared) filter matrix
+  Consider a convolutional layer in this model with (unshared) filter matrix
   'w'. For a minibatch that produces inputs 'a' and output preactivations 's',
   this FisherBlock estimates,
 
diff --git a/tensorflow/contrib/kfac/python/ops/fisher_factors.py b/tensorflow/contrib/kfac/python/ops/fisher_factors.py
index b43232d..afa2fd1 100644
--- a/tensorflow/contrib/kfac/python/ops/fisher_factors.py
+++ b/tensorflow/contrib/kfac/python/ops/fisher_factors.py
@@ -71,15 +71,15 @@
 # factor. This parameter is used only if `_SUB_SAMPLE_INPUTS` is True.
 _INPUTS_TO_EXTRACT_PATCHES_FACTOR = 0.5
 
-# If True, then subsamples the tensor passed to compute the covaraince matrix.
+# If True, then subsamples the tensor passed to compute the covariance matrix.
 _SUB_SAMPLE_OUTER_PRODUCTS = False
 
-# If True, then subsamples the tensor passed to compute the covaraince matrix.
+# If True, then subsamples the tensor passed to compute the covariance matrix.
 _SUB_SAMPLE_INPUTS = False
 
 # TOWER_STRATEGY can be one of "concat" or "separate".  If "concat", the data
 # passed to the factors from the blocks will be concatenated across towers
-# (lazilly via PartitionedTensor objects).  Otherwise a tuple of tensors over
+# (lazily via PartitionedTensor objects).  Otherwise a tuple of tensors over
 # towers will be passed in, and the factors will iterate over this and do the
 # cov computations separately for each one, averaging the results together.
 TOWER_STRATEGY = "concat"
@@ -309,7 +309,7 @@
 
 
 def _random_tensor_gather(array, max_size):
-  """Generates a random set of indices and gathers the value at the indcices.
+  """Generates a random set of indices and gathers the value at the indices.
 
   Args:
     array: Tensor, of shape `[batch_size, dim_2]`.
@@ -1762,8 +1762,8 @@
         # Might need to enforce symmetry lost due to numerical issues.
         invsqrtC0 = (invsqrtC0 + array_ops.transpose(invsqrtC0)) / 2.0
 
-        # The following line imposses the symmetry assumed by "Option 1" on C1.
-        # Stangely the code can work okay with this line commented out,
+        # The following line imposes the symmetry assumed by "Option 1" on C1.
+        # Strangely the code can work okay with this line commented out,
         # depending on how psd_eig is defined.  I'm not sure why.
         C1 = (C1 + array_ops.transpose(C1)) / 2.0
 
diff --git a/tensorflow/contrib/kfac/python/ops/layer_collection.py b/tensorflow/contrib/kfac/python/ops/layer_collection.py
index cbbfe72..43aa713 100644
--- a/tensorflow/contrib/kfac/python/ops/layer_collection.py
+++ b/tensorflow/contrib/kfac/python/ops/layer_collection.py
@@ -609,7 +609,7 @@
                                outputs,
                                approx=None,
                                reuse=VARIABLE_SCOPE):
-    """Registers a fully connnected layer.
+    """Registers a fully connected layer.
 
     Args:
       params: Tensor or 2-tuple of Tensors corresponding to weight and bias of
@@ -975,7 +975,7 @@
         block for this layer (which must have already been registered). If
         "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.  (Note that the
         word `use` here has a completely different meaning to "use in the graph"
-        as it perturns to the `inputs`, `outputs`, and `num_uses` arguments.)
+        as it pertains to the `inputs`, `outputs`, and `num_uses` arguments.)
         (Default: "VARIABLE_SCOPE")
 
     Raises:
@@ -1045,7 +1045,7 @@
         block for this layer (which must have already been registered). If
         "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.  (Note that the
         word `use` here has a completely different meaning to "use in the graph"
-        as it perturns to the `inputs`, `outputs`, and `num_uses` arguments.)
+        as it pertains to the `inputs`, `outputs`, and `num_uses` arguments.)
         (Default: "VARIABLE_SCOPE")
 
     Raises:
@@ -1116,7 +1116,7 @@
         block for this layer (which must have already been registered). If
         "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.  (Note that the
         word `use` here has a completely different meaning to "use in the graph"
-        as it perturns to the `inputs`, `outputs`, and `num_uses` arguments.)
+        as it pertains to the `inputs`, `outputs`, and `num_uses` arguments.)
         (Default: "VARIABLE_SCOPE")
 
     Raises:
diff --git a/tensorflow/contrib/kfac/python/ops/loss_functions.py b/tensorflow/contrib/kfac/python/ops/loss_functions.py
index 42d525c..c8cebc4 100644
--- a/tensorflow/contrib/kfac/python/ops/loss_functions.py
+++ b/tensorflow/contrib/kfac/python/ops/loss_functions.py
@@ -214,7 +214,7 @@
 
     Here the 'Fisher' is the Fisher information matrix (i.e. expected outer-
     product of gradients) with respect to the parameters of the underlying
-    probability distribtion (whose log-prob defines the loss). Typically this
+    probability distribution (whose log-prob defines the loss). Typically this
     will be block-diagonal across different cases in the batch, since the
     distribution is usually (but not always) conditionally iid across different
     cases.
@@ -238,7 +238,7 @@
 
     Here the 'Fisher' is the Fisher information matrix (i.e. expected outer-
     product of gradients) with respect to the parameters of the underlying
-    probability distribtion (whose log-prob defines the loss). Typically this
+    probability distribution (whose log-prob defines the loss). Typically this
     will be block-diagonal across different cases in the batch, since the
     distribution is usually (but not always) conditionally iid across different
     cases.
@@ -262,7 +262,7 @@
 
     Here the 'Fisher' is the Fisher information matrix (i.e. expected outer-
     product of gradients) with respect to the parameters of the underlying
-    probability distribtion (whose log-prob defines the loss). Typically this
+    probability distribution (whose log-prob defines the loss). Typically this
     will be block-diagonal across different cases in the batch, since the
     distribution is usually (but not always) conditionally iid across different
     cases.
diff --git a/tensorflow/contrib/kfac/python/ops/optimizer.py b/tensorflow/contrib/kfac/python/ops/optimizer.py
index 03b9da7..3860525 100644
--- a/tensorflow/contrib/kfac/python/ops/optimizer.py
+++ b/tensorflow/contrib/kfac/python/ops/optimizer.py
@@ -72,7 +72,7 @@
           (Higher damping means the update looks more like a standard gradient
           update - see Tikhonov regularization.)
       layer_collection: The layer collection object, which holds the fisher
-          blocks, kronecker factors, and losses associated with the
+          blocks, Kronecker factors, and losses associated with the
           graph.  The layer_collection cannot be modified after KfacOptimizer's
           initialization.
       var_list: Optional list or tuple of variables to train. Defaults to the
@@ -99,7 +99,7 @@
       placement_strategy: string, Device placement strategy used when creating
         covariance variables, covariance ops, and inverse ops.
         (Default: `None`)
-      **kwargs: Arguments to be passesd to specific placement
+      **kwargs: Arguments to be passed to specific placement
         strategy mixin. Check `placement.RoundRobinPlacementMixin` for example.
 
     Raises:
@@ -120,7 +120,7 @@
     self._estimation_mode = estimation_mode
     self._colocate_gradients_with_ops = colocate_gradients_with_ops
 
-    # The below parameters are required only if damping needs to be adapated.
+    # The below parameters are required only if damping needs to be adapted.
     # These parameters can be set by calling
     # set_damping_adaptation_params() explicitly.
     self._damping_adaptation_decay = 0.95
@@ -574,7 +574,7 @@
     """Wrapper function for `self._compute_qmodel_hyperparams`.
 
     Constructs a list of preconditioned gradients and variables. Also creates a
-    op to asssign the computed q model change to `self._q_model_change`.
+    op to assign the computed q model change to `self._q_model_change`.
 
     Args:
       grads_and_vars: List of (gradient, variable) pairs.
diff --git a/tensorflow/contrib/kinesis/kernels/kinesis_dataset_ops.cc b/tensorflow/contrib/kinesis/kernels/kinesis_dataset_ops.cc
index 3212279..95c7001 100644
--- a/tensorflow/contrib/kinesis/kernels/kinesis_dataset_ops.cc
+++ b/tensorflow/contrib/kinesis/kernels/kinesis_dataset_ops.cc
@@ -164,11 +164,11 @@
   }
 
  private:
-  class Dataset : public GraphDatasetBase {
+  class Dataset : public DatasetBase {
    public:
     Dataset(OpKernelContext* ctx, const string& stream, const string& shard,
             const bool read_indefinitely, const int64 interval)
-        : GraphDatasetBase(ctx),
+        : DatasetBase(DatasetContext(ctx)),
           stream_(stream),
           shard_(shard),
           read_indefinitely_(read_indefinitely),
@@ -194,7 +194,8 @@
     string DebugString() const override { return "KinesisDatasetOp::Dataset"; }
 
    protected:
-    Status AsGraphDefInternal(DatasetGraphDefBuilder* b,
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
                               Node** output) const override {
       Node* stream = nullptr;
       TF_RETURN_IF_ERROR(b->AddScalar(stream_, &stream));
diff --git a/tensorflow/contrib/layers/__init__.py b/tensorflow/contrib/layers/__init__.py
index a7b41b7..af8e673 100644
--- a/tensorflow/contrib/layers/__init__.py
+++ b/tensorflow/contrib/layers/__init__.py
@@ -14,7 +14,9 @@
 # ==============================================================================
 """Ops for building neural network layers, regularizers, summaries, etc.
 
-See the @{$python/contrib.layers} guide.
+See the
+[Contrib Layers](https://tensorflow.org/api_guides/python/contrib.layers)
+guide.
 
 @@avg_pool2d
 @@avg_pool3d
diff --git a/tensorflow/contrib/layers/python/layers/initializers.py b/tensorflow/contrib/layers/python/layers/initializers.py
index 51610f2..655f038 100644
--- a/tensorflow/contrib/layers/python/layers/initializers.py
+++ b/tensorflow/contrib/layers/python/layers/initializers.py
@@ -47,7 +47,7 @@
   Args:
     uniform: Whether to use uniform or normal distributed random initialization.
     seed: A Python integer. Used to create random seeds. See
-          @{tf.set_random_seed} for behavior.
+          `tf.set_random_seed` for behavior.
     dtype: The data type. Only floating point types are supported.
 
   Returns:
@@ -98,7 +98,7 @@
     mode: String.  'FAN_IN', 'FAN_OUT', 'FAN_AVG'.
     uniform: Whether to use uniform or normal distributed random initialization.
     seed: A Python integer. Used to create random seeds. See
-          @{tf.set_random_seed} for behavior.
+          `tf.set_random_seed` for behavior.
     dtype: The data type. Only floating point types are supported.
 
   Returns:
@@ -111,7 +111,7 @@
   if not dtype.is_floating:
     raise TypeError('Cannot create initializer for non-floating point type.')
   if mode not in ['FAN_IN', 'FAN_OUT', 'FAN_AVG']:
-    raise TypeError('Unknow mode %s [FAN_IN, FAN_OUT, FAN_AVG]', mode)
+    raise TypeError('Unknown mode %s [FAN_IN, FAN_OUT, FAN_AVG]', mode)
 
   # pylint: disable=unused-argument
   def _initializer(shape, dtype=dtype, partition_info=None):
diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py
index 6250f88..04668f1 100644
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@@ -1584,7 +1584,7 @@
     outputs_collections: Collection to add the outputs.
     scope: Optional scope for name_scope.
     seed: A Python integer. Used to create random seeds. See
-      @{tf.set_random_seed} for behavior.
+      `tf.set_random_seed` for behavior.
 
   Returns:
     A tensor representing the output of the operation.
diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py
index c5c7269..51c7abb 100644
--- a/tensorflow/contrib/layers/python/layers/layers_test.py
+++ b/tensorflow/contrib/layers/python/layers/layers_test.py
@@ -1189,7 +1189,7 @@
       result = sess.run(horz_gradients)
       expected = np.zeros((1, 10, 9, 1))
 
-      self.assertAllEqual(result, expected)
+      self.assertAllClose(result, expected, rtol=1e-5, atol=1e-5)
 
   def testHorzConvWithBlankImageAndPlaceholder(self):
     image = array_ops.placeholder(dtypes.float32, shape=(None, None, None, 1))
@@ -1209,7 +1209,7 @@
           })
       expected = np.zeros((1, 10, 9, 1))
 
-      self.assertAllEqual(result, expected)
+      self.assertAllClose(result, expected, rtol=1e-5, atol=1e-5)
 
   def testHorzConvWithRandomImageMultiBatch(self):
     np.random.seed(1)
diff --git a/tensorflow/contrib/learn/BUILD b/tensorflow/contrib/learn/BUILD
index b56a886..418b0cf 100644
--- a/tensorflow/contrib/learn/BUILD
+++ b/tensorflow/contrib/learn/BUILD
@@ -79,16 +79,7 @@
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
         "//tensorflow/python:weights_broadcast_ops",
-        "//tensorflow/python/estimator",
         "//tensorflow/python/estimator:estimator_py",
-        "//tensorflow/python/estimator:export_export",
-        "//tensorflow/python/estimator:export_output",
-        "//tensorflow/python/estimator:inputs",
-        "//tensorflow/python/estimator:inputs_queues",
-        "//tensorflow/python/estimator:model_fn",
-        "//tensorflow/python/estimator:numpy_io",
-        "//tensorflow/python/estimator:pandas_io",
-        "//tensorflow/python/estimator:run_config",
         "//tensorflow/python/feature_column",
         "//tensorflow/python/feature_column:feature_column_py",
         "//tensorflow/python/ops/losses",
@@ -117,7 +108,6 @@
     size = "small",
     srcs = ["python/learn/learn_io/data_feeder_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
         ":learn",
         "//tensorflow/python:client_testlib",
@@ -171,9 +161,8 @@
         "//tensorflow/python:training",
         "//tensorflow/python:util",
         "//tensorflow/python:variables",
-        "//tensorflow/python/estimator",
+        "//tensorflow/python/estimator:estimator_py",
     ],
-    tags = ["no_windows"],  # TODO: needs investigation on Windows
 )
 
 py_test(
@@ -220,7 +209,7 @@
         "//tensorflow/contrib/training:training_py",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:platform",
-        "//tensorflow/python/estimator:run_config",
+        "//tensorflow/python/estimator:estimator_py",
     ],
 )
 
@@ -245,7 +234,7 @@
         "//tensorflow/python:summary",
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
-        "//tensorflow/python/estimator",
+        "//tensorflow/python/estimator:estimator_py",
     ],
 )
 
@@ -259,7 +248,7 @@
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:training",
-        "//tensorflow/python/estimator:run_config",
+        "//tensorflow/python/estimator:estimator_py",
     ],
 )
 
@@ -600,7 +589,6 @@
     size = "small",
     srcs = ["python/learn/learn_io/io_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
         ":learn",
         "//tensorflow/contrib/learn/python/learn/datasets",
@@ -621,7 +609,7 @@
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:session",
         "//tensorflow/python:training",
-        "//tensorflow/python/estimator:export_output",
+        "//tensorflow/python/estimator:estimator_py",
         "//tensorflow/python/saved_model:signature_constants",
         "@six_archive//:six",
     ],
diff --git a/tensorflow/contrib/learn/__init__.py b/tensorflow/contrib/learn/__init__.py
index 79bd73f..28a6f5a 100644
--- a/tensorflow/contrib/learn/__init__.py
+++ b/tensorflow/contrib/learn/__init__.py
@@ -19,7 +19,8 @@
 [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
 for migration instructions.
 
-See the @{$python/contrib.learn} guide.
+See the [Contrib Learn](https://tensorflow.org/api_guides/python/contrib.learn)
+guide.
 
 @@BaseEstimator
 @@Estimator
diff --git a/tensorflow/contrib/learn/python/learn/estimators/kmeans.py b/tensorflow/contrib/learn/python/learn/estimators/kmeans.py
index 66ebcfd..21f7dcc 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/kmeans.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/kmeans.py
@@ -15,9 +15,9 @@
 """Implementation of k-means clustering on top of `Estimator` API (deprecated).
 
 This module is deprecated. Please use
-@{tf.contrib.factorization.KMeansClustering} instead of
-@{tf.contrib.learn.KMeansClustering}. It has a similar interface, but uses the
-@{tf.estimator.Estimator} API instead of @{tf.contrib.learn.Estimator}.
+`tf.contrib.factorization.KMeansClustering` instead of
+`tf.contrib.learn.KMeansClustering`. It has a similar interface, but uses the
+`tf.estimator.Estimator` API instead of `tf.contrib.learn.Estimator`.
 """
 
 from __future__ import absolute_import
diff --git a/tensorflow/contrib/learn/python/learn/estimators/run_config.py b/tensorflow/contrib/learn/python/learn/estimators/run_config.py
index c36879e0..08f23aa 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/run_config.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/run_config.py
@@ -221,7 +221,7 @@
 class RunConfig(ClusterConfig, core_run_config.RunConfig):
   """This class specifies the configurations for an `Estimator` run.
 
-  This class is a deprecated implementation of @{tf.estimator.RunConfig}
+  This class is a deprecated implementation of `tf.estimator.RunConfig`
   interface.
   """
   _USE_DEFAULT = 0
diff --git a/tensorflow/contrib/learn/python/learn/experiment.py b/tensorflow/contrib/learn/python/learn/experiment.py
index 08e907a..4e64efd 100644
--- a/tensorflow/contrib/learn/python/learn/experiment.py
+++ b/tensorflow/contrib/learn/python/learn/experiment.py
@@ -162,16 +162,16 @@
 
     Args:
       estimator: Object implementing Estimator interface, which could be a
-        combination of @{tf.contrib.learn.Trainable} and
-        @{tf.contrib.learn.Evaluable} (deprecated), or
-        @{tf.estimator.Estimator}.
+        combination of `tf.contrib.learn.Trainable` and
+        `tf.contrib.learn.Evaluable` (deprecated), or
+        `tf.estimator.Estimator`.
       train_input_fn: function, returns features and labels for training.
       eval_input_fn: function, returns features and labels for evaluation. If
         `eval_steps` is `None`, this should be configured only to produce for a
         finite number of batches (generally, 1 epoch over the evaluation data).
       eval_metrics: `dict` of string, metric function. If `None`, default set
         is used. This should be `None` if the `estimator` is
-        @{tf.estimator.Estimator}. If metrics are provided they will be
+        `tf.estimator.Estimator`. If metrics are provided they will be
         *appended* to the default set.
       train_steps: Perform this many steps of training. `None`, the default,
         means train forever.
diff --git a/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py b/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py
index 66af683..4f22054 100644
--- a/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py
+++ b/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py
@@ -415,7 +415,7 @@
       `InputFnOps`.
     default_output_alternative_key: the name of the head to serve when an
       incoming serving request does not explicitly request a specific head.
-      Must be `None` if the estimator inherits from @{tf.estimator.Estimator}
+      Must be `None` if the estimator inherits from `tf.estimator.Estimator`
       or for single-headed models.
     assets_extra: A dict specifying how to populate the assets.extra directory
       within the exported SavedModel.  Each key should give the destination
@@ -453,7 +453,7 @@
       The string path to the exported directory.
 
     Raises:
-      ValueError: If `estimator` is a @{tf.estimator.Estimator} instance
+      ValueError: If `estimator` is a `tf.estimator.Estimator` instance
         and `default_output_alternative_key` was specified.
     """
     if isinstance(estimator, core_estimator.Estimator):
@@ -504,7 +504,7 @@
       that must be provided at serving time (excluding labels!).
     default_output_alternative_key: the name of the head to serve when an
       incoming serving request does not explicitly request a specific head.
-      Must be `None` if the estimator inherits from @{tf.estimator.Estimator}
+      Must be `None` if the estimator inherits from `tf.estimator.Estimator`
       or for single-headed models.
     assets_extra: A dict specifying how to populate the assets.extra directory
       within the exported SavedModel.  Each key should give the destination
@@ -767,7 +767,7 @@
       The string path to the SavedModel indicated by post_export_fn.
 
     Raises:
-      ValueError: If `estimator` is a @{tf.estimator.Estimator} instance
+      ValueError: If `estimator` is a `tf.estimator.Estimator` instance
         and `default_output_alternative_key` was specified or if post_export_fn
         does not return a valid directory.
       RuntimeError: If unable to create temporary or final export directory.
diff --git a/tensorflow/contrib/linalg/__init__.py b/tensorflow/contrib/linalg/__init__.py
index a262a09..cbe4c03 100644
--- a/tensorflow/contrib/linalg/__init__.py
+++ b/tensorflow/contrib/linalg/__init__.py
@@ -14,7 +14,8 @@
 # ==============================================================================
 """Linear algebra libraries.
 
-See the @{$python/contrib.linalg} guide.
+See the [Contrib Linalg](https://tensorflow.org/api_guides/python/contrib.linalg)
+guide.
 
 @@LinearOperator
 @@LinearOperatorBlockDiag
diff --git a/tensorflow/contrib/linear_optimizer/python/sdca_optimizer.py b/tensorflow/contrib/linear_optimizer/python/sdca_optimizer.py
index 9872c6f..8ebe45d8 100644
--- a/tensorflow/contrib/linear_optimizer/python/sdca_optimizer.py
+++ b/tensorflow/contrib/linear_optimizer/python/sdca_optimizer.py
@@ -158,7 +158,7 @@
           # exactly 2 (i.e., its shape should be [batch_size, column.dim]).
           check_rank_op = control_flow_ops.Assert(
               math_ops.less_equal(array_ops.rank(transformed_tensor), 2),
-              ['transformed_tensor shouls have rank at most 2.'])
+              ['transformed_tensor should have rank at most 2.'])
           # Reshape to [batch_size, dense_column_dimension].
           with ops.control_dependencies([check_rank_op]):
             transformed_tensor = array_ops.reshape(transformed_tensor, [
@@ -172,7 +172,7 @@
         elif isinstance(column, layers.feature_column._BucketizedColumn):  # pylint: disable=protected-access
           # A bucketized column corresponds to a sparse feature in SDCA. The
           # bucketized feature is "sparsified" for SDCA by converting it to a
-          # SparseFeatureColumn respresenting the one-hot encoding of the
+          # SparseFeatureColumn representing the one-hot encoding of the
           # bucketized feature.
           #
           # TODO(sibyl-vie3Poto): Explore whether it is more efficient to translate a
@@ -220,7 +220,7 @@
           # occur multiple times for a single example.
           projected_ids = projection_length * example_ids + flat_ids
 
-          # Remove any redudant ids.
+          # Remove any redundant ids.
           ids, idx = array_ops.unique(projected_ids)
           # Keep only one example id per duplicated ids.
           example_ids_filtered = math_ops.unsorted_segment_min(
diff --git a/tensorflow/contrib/lite/build_def.bzl b/tensorflow/contrib/lite/build_def.bzl
index 3f15885..05d0b45 100644
--- a/tensorflow/contrib/lite/build_def.bzl
+++ b/tensorflow/contrib/lite/build_def.bzl
@@ -227,6 +227,8 @@
         "constant",
         "control_dep",
         "conv",
+        "conv_with_shared_weights",
+        "conv_to_depthwiseconv_with_shared_weights",
         "depthwiseconv",
         "div",
         "equal",
@@ -265,7 +267,7 @@
         "prelu",
         "pow",
         "reduce_max",
-        #"reduce_prod",  # disabled due to b/111823366
+        "reduce_prod",
         "relu",
         "relu1",
         "relu6",
diff --git a/tensorflow/contrib/lite/context.h b/tensorflow/contrib/lite/context.h
index 5bc2010..c265e7c 100644
--- a/tensorflow/contrib/lite/context.h
+++ b/tensorflow/contrib/lite/context.h
@@ -49,7 +49,8 @@
 typedef enum {
   kTfLiteEigenContext = 0,     // include eigen_support.h to use.
   kTfLiteGemmLowpContext = 1,  // include gemm_support.h to use.
-  kTfLiteMaxExternalContexts = 2
+  kTfLiteEdgeTpuContext = 2,   // Placeholder for Edge TPU support.
+  kTfLiteMaxExternalContexts = 3
 } TfLiteExternalContextType;
 
 // An external context is a collection of information unrelated to the TF Lite
@@ -452,13 +453,15 @@
 
   // Copy the data from delegate buffer handle to raw memory.
   // This can be null if the delegate doesn't use its own buffer.
-  TfLiteStatus (*CopyFromBufferHandle)(TfLiteDelegate* delegate,
+  TfLiteStatus (*CopyFromBufferHandle)(TfLiteContext* context,
+                                       TfLiteDelegate* delegate,
                                        TfLiteBufferHandle buffer_handle,
                                        void* data, size_t size);
 
   // Copy the data from raw memory to delegate buffer handle.
   // This can be null if the delegate doesn't use its own buffer.
-  TfLiteStatus (*CopyToBufferHandle)(TfLiteDelegate* delegate,
+  TfLiteStatus (*CopyToBufferHandle)(TfLiteContext* context,
+                                     TfLiteDelegate* delegate,
                                      TfLiteBufferHandle buffer_handle,
                                      void* data, size_t size);
 
@@ -466,7 +469,7 @@
   // this doesn't release the underlying resource (e.g. textures). The
   // resources are either owned by application layer or the delegate.
   // This can be null if the delegate doesn't use its own buffer.
-  void (*FreeBufferHandle)(TfLiteDelegate* delegate,
+  void (*FreeBufferHandle)(TfLiteContext* context, TfLiteDelegate* delegate,
                            TfLiteBufferHandle* handle);
 } TfLiteDelegate;
 
diff --git a/tensorflow/contrib/lite/delegates/eager/BUILD b/tensorflow/contrib/lite/delegates/eager/BUILD
index 332a871..8abc828 100644
--- a/tensorflow/contrib/lite/delegates/eager/BUILD
+++ b/tensorflow/contrib/lite/delegates/eager/BUILD
@@ -7,6 +7,8 @@
 
 licenses(["notice"])  # Apache 2.0
 
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+
 cc_library(
     name = "buffer_map",
     srcs = ["buffer_map.cc"],
@@ -14,21 +16,22 @@
     deps = [
         ":util",
         "//tensorflow/c:c_api_internal",
-        "//tensorflow/contrib/lite:framework",
         "//tensorflow/contrib/lite:kernel_api",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:protos_all_cc",
-    ],
+    ] + select({
+        "//tensorflow:android": [
+            "//tensorflow/core:android_tensorflow_lib_lite_no_runtime",
+        ],
+        "//conditions:default": [
+            "//tensorflow/core:framework",
+            "//tensorflow/core:protos_all_cc",
+        ],
+    }),
 )
 
-cc_test(
+tf_cc_test(
     name = "buffer_map_test",
     size = "small",
     srcs = ["buffer_map_test.cc"],
-    tags = [
-        "no_oss",
-        "tflite_not_portable",
-    ],
     deps = [
         ":buffer_map",
         "//tensorflow/contrib/lite:framework",
@@ -50,21 +53,23 @@
         ":buffer_map",
         ":delegate_data",
         ":kernel",
-        "//tensorflow/contrib/lite:framework",
+        ":util",
         "//tensorflow/contrib/lite:kernel_api",
         "//tensorflow/contrib/lite:util",
-        "//tensorflow/core:lib",
-    ],
+    ] + select({
+        "//tensorflow:android": [
+            "//tensorflow/core:android_tensorflow_lib_lite_no_runtime",
+        ],
+        "//conditions:default": [
+            "//tensorflow/core:lib",
+        ],
+    }),
 )
 
-cc_test(
+tf_cc_test(
     name = "delegate_test",
     size = "small",
     srcs = ["delegate_test.cc"],
-    tags = [
-        "no_oss",
-        "tflite_not_portable",
-    ],
     deps = [
         ":delegate",
         ":test_util",
@@ -79,20 +84,22 @@
     hdrs = ["delegate_data.h"],
     deps = [
         ":buffer_map",
-        "//tensorflow/core:core_cpu",
-        "//tensorflow/core:lib",
         "//tensorflow/core/common_runtime/eager:context",
-    ],
+    ] + select({
+        "//tensorflow:android": [
+            "//tensorflow/core:android_tensorflow_lib_lite",
+        ],
+        "//conditions:default": [
+            "//tensorflow/core:core_cpu",
+            "//tensorflow/core:lib",
+        ],
+    }),
 )
 
-cc_test(
+tf_cc_test(
     name = "delegate_data_test",
     size = "small",
     srcs = ["delegate_data_test.cc"],
-    tags = [
-        "no_oss",
-        "tflite_not_portable",
-    ],
     deps = [
         ":delegate_data",
         "//tensorflow/contrib/lite:framework",
@@ -109,25 +116,30 @@
     deps = [
         ":delegate_data",
         ":util",
-        "//tensorflow/contrib/lite:framework",
+        "@flatbuffers",
         "//tensorflow/contrib/lite:kernel_api",
+        "//tensorflow/contrib/lite:string",
         "//tensorflow/contrib/lite/kernels:kernel_util",
-        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/common_runtime/eager:context",
         "//tensorflow/core/common_runtime/eager:execute",
         "//tensorflow/core/common_runtime/eager:tensor_handle",
-        "@flatbuffers",
-    ],
+    ] + select({
+        # TODO(b/111881878): The android_tensorflow_lib target pulls in the full
+        # set of core TensorFlow kernels. We may want to revisit this dependency
+        # to allow selective registration via build targets.
+        "//tensorflow:android": [
+            "//tensorflow/core:android_tensorflow_lib",
+        ],
+        "//conditions:default": [
+            "//tensorflow/core:protos_all_cc",
+        ],
+    }),
 )
 
-cc_test(
+tf_cc_test(
     name = "kernel_test",
     size = "small",
     srcs = ["kernel_test.cc"],
-    tags = [
-        "no_oss",
-        "tflite_not_portable",
-    ],
     deps = [
         ":delegate_data",
         ":kernel",
@@ -143,6 +155,7 @@
     hdrs = ["test_util.h"],
     deps = [
         "//tensorflow/c:c_api_internal",
+        "//tensorflow/contrib/lite:string",
         "//tensorflow/contrib/lite/kernels:test_util",
         "@com_google_absl//absl/memory",
         "@flatbuffers",
@@ -155,30 +168,26 @@
     hdrs = ["util.h"],
     deps = [
         "//tensorflow/c:c_api_internal",
-        "//tensorflow/contrib/lite:framework",
         "//tensorflow/contrib/lite:kernel_api",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-    ],
+    ] + select({
+        "//tensorflow:android": [
+            "//tensorflow/core:android_tensorflow_lib_lite_no_runtime",
+        ],
+        "//conditions:default": [
+            "//tensorflow/core:lib",
+            "//tensorflow/core:framework",
+        ],
+    }),
 )
 
-cc_test(
+tf_cc_test(
     name = "util_test",
     size = "small",
     srcs = ["util_test.cc"],
-    tags = [
-        "no_oss",
-        "tflite_not_portable",
-    ],
     deps = [
         ":util",
+        "//tensorflow/contrib/lite:string",
         "//tensorflow/contrib/lite/testing:util",
-        "//tensorflow/core:lib",
         "@com_google_googletest//:gtest",
     ],
 )
-
-cc_library(
-    name = "constants",
-    hdrs = ["constants.h"],
-)
diff --git a/tensorflow/contrib/lite/delegates/eager/buffer_map_test.cc b/tensorflow/contrib/lite/delegates/eager/buffer_map_test.cc
index dcb3f6c..a046943 100644
--- a/tensorflow/contrib/lite/delegates/eager/buffer_map_test.cc
+++ b/tensorflow/contrib/lite/delegates/eager/buffer_map_test.cc
@@ -56,8 +56,8 @@
   return buffer_map.GetTensor(0);
 }
 
-std::vector<int64> GetTensorShape(const tensorflow::Tensor& t) {
-  std::vector<int64> shape(t.dims());
+std::vector<tensorflow::int64> GetTensorShape(const tensorflow::Tensor& t) {
+  std::vector<tensorflow::int64> shape(t.dims());
   for (int i = 0; i < t.dims(); ++i) {
     shape[i] = t.dim_size(i);
   }
diff --git a/tensorflow/contrib/lite/delegates/eager/constants.h b/tensorflow/contrib/lite/delegates/eager/constants.h
deleted file mode 100644
index 7ed6ab7..0000000
--- a/tensorflow/contrib/lite/delegates/eager/constants.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_DELEGATES_EAGER_CONSTANTS_H_
-#define TENSORFLOW_CONTRIB_LITE_DELEGATES_EAGER_CONSTANTS_H_
-
-namespace tflite {
-namespace eager {
-
-// The prefix of Eager op custom code.
-// This will be matched agains the `custom_code` field in `OperatorCode`
-// Flatbuffer Table.
-constexpr char kCustomCodePrefix[] = "Eager";
-
-}  // namespace eager
-}  // namespace tflite
-
-#endif  // TENSORFLOW_CONTRIB_LITE_DELEGATES_EAGER_CONSTANTS_H_
diff --git a/tensorflow/contrib/lite/delegates/eager/delegate.cc b/tensorflow/contrib/lite/delegates/eager/delegate.cc
index 673859d..45fc158 100644
--- a/tensorflow/contrib/lite/delegates/eager/delegate.cc
+++ b/tensorflow/contrib/lite/delegates/eager/delegate.cc
@@ -19,6 +19,7 @@
 #include "tensorflow/contrib/lite/context_util.h"
 #include "tensorflow/contrib/lite/delegates/eager/buffer_map.h"
 #include "tensorflow/contrib/lite/delegates/eager/kernel.h"
+#include "tensorflow/contrib/lite/delegates/eager/util.h"
 #include "tensorflow/contrib/lite/util.h"
 #include "tensorflow/core/lib/core/status.h"
 
@@ -27,7 +28,7 @@
 namespace delegate {
 
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteDelegate* delegate) {
-  // Get the nodes in the current execution plan.
+  // Get the nodes in the current execution plan. Interpreter owns this array.
   TfLiteIntArray* plan;
   TF_LITE_ENSURE_STATUS(context->GetExecutionPlan(context, &plan));
 
@@ -39,8 +40,7 @@
     TF_LITE_ENSURE_STATUS(context->GetNodeAndRegistration(
         context, node_index, &node, &registration));
 
-    if (registration->custom_name &&
-        strncmp(registration->custom_name, "Eager", 5) == 0) {
+    if (IsEagerOp(registration->custom_name)) {
       supported_nodes.push_back(node_index);
     }
   }
@@ -55,16 +55,15 @@
   return kTfLiteOk;
 }
 
-TfLiteStatus CopyFromBufferHandle(TfLiteDelegate* delegate,
+TfLiteStatus CopyFromBufferHandle(TfLiteContext* context,
+                                  TfLiteDelegate* delegate,
                                   TfLiteBufferHandle buffer_handle, void* data,
                                   size_t size) {
-  // TODO(nupurgarg): Make BufferMap unique to each interpreter in order to
-  // support multiple interpreters using a single delegate.
   BufferMap* buffer_map =
-      reinterpret_cast<DelegateData*>(delegate->data_)->GetBufferMap();
+      reinterpret_cast<DelegateData*>(delegate->data_)->GetBufferMap(context);
 
   if (!buffer_map->HasTensor(buffer_handle)) {
-    fprintf(stderr, "Invalid tensor index %d.\n", buffer_handle);
+    context->ReportError(context, "Invalid tensor index %d.", buffer_handle);
     return kTfLiteError;
   }
 
@@ -72,7 +71,8 @@
   tensorflow::StringPiece t_data = t.tensor_data();
 
   if (size != t_data.size()) {
-    fprintf(stderr, "Not enough space to store TensorFlow's aligned buffer.\n");
+    context->ReportError(
+        context, "Not enough space to store TensorFlow's aligned buffer.");
     return kTfLiteError;
   }
 
@@ -83,20 +83,26 @@
 }  // namespace delegate
 }  // namespace eager
 
-EagerDelegate::EagerDelegate() {
-  if (!eager::DelegateData::Create(&delegate_data_).ok()) {
+std::unique_ptr<EagerDelegate> EagerDelegate::Create() {
+  std::unique_ptr<eager::DelegateData> delegate_data;
+  if (!eager::DelegateData::Create(&delegate_data).ok()) {
     fprintf(stderr, "Unable to initialize TensorFlow context.\n");
-    return;
+    return nullptr;
   }
 
-  delegate_.reset(new TfLiteDelegate{
-      /*data_=*/delegate_data_.get(),
-      /*nullptr,*/ &eager::delegate::Prepare,
-      /*CopyFromBufferHandle=*/&eager::delegate::CopyFromBufferHandle,
-      /*CopyToBufferHandle=*/nullptr,
-      /*FreeBufferHandle=*/nullptr});
+  return std::unique_ptr<EagerDelegate>(
+      new EagerDelegate(std::move(delegate_data)));
 }
 
+EagerDelegate::EagerDelegate(std::unique_ptr<eager::DelegateData> delegate_data)
+    : TfLiteDelegate{
+          /*data_=*/delegate_data.get(),
+          /*nullptr,*/ &eager::delegate::Prepare,
+          /*CopyFromBufferHandle=*/&eager::delegate::CopyFromBufferHandle,
+          /*CopyToBufferHandle=*/nullptr,
+          /*FreeBufferHandle=*/nullptr},
+      delegate_data_(std::move(delegate_data)) {}
+
 EagerDelegate::~EagerDelegate() {}
 
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/delegates/eager/delegate.h b/tensorflow/contrib/lite/delegates/eager/delegate.h
index 6259b35..6d15ba4 100644
--- a/tensorflow/contrib/lite/delegates/eager/delegate.h
+++ b/tensorflow/contrib/lite/delegates/eager/delegate.h
@@ -17,7 +17,6 @@
 
 #include "tensorflow/contrib/lite/context.h"
 #include "tensorflow/contrib/lite/delegates/eager/delegate_data.h"
-#include "tensorflow/contrib/lite/interpreter.h"
 
 namespace tflite {
 
@@ -26,30 +25,33 @@
 // executed by TensorFlow's runtime via Eager.
 //
 // The interpreter must be constructed after the EagerDelegate and destructed
-// before the EagerDelegate. This delegate can only be used with one
-// interpreter.
+// before the EagerDelegate. This delegate may be used with multiple
+// interpreters, but it is *not* thread-safe.
 //
 // Usage:
-//   EagerDelegate delegate();
+//   auto delegate = EagerDelegate::Create();
 //   ... build interpreter ...
 //
-//   delegate.Apply(interpreter);
+//   if (delegate) {
+//     interpreter->ModifyGraphWithDelegate(
+//         delegate.get(), /*allow_dynamic_tensors=*/true);
+//   }
 //   ... run inference ...
 //   ... destroy interpreter ...
 //   ... destroy delegate ...
-class EagerDelegate {
+class EagerDelegate : public TfLiteDelegate {
  public:
-  EagerDelegate();
+  // Creates a delegate that supports TF ops.
+  //
+  // If the underlying TF Eager context creation fails, returns null.
+  static std::unique_ptr<EagerDelegate> Create();
+
   ~EagerDelegate();
 
-  TfLiteStatus Apply(Interpreter* interpreter) {
-    return interpreter->ModifyGraphWithDelegate(delegate_.get(),
-                                                /*allow_dynamic_tensors=*/true);
-  }
-
  private:
+  explicit EagerDelegate(std::unique_ptr<eager::DelegateData> delegate_data);
+
   std::unique_ptr<eager::DelegateData> delegate_data_;
-  std::unique_ptr<TfLiteDelegate> delegate_;
 };
 
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/delegates/eager/delegate_data.h b/tensorflow/contrib/lite/delegates/eager/delegate_data.h
index 8a0e8ba..772d26f 100644
--- a/tensorflow/contrib/lite/delegates/eager/delegate_data.h
+++ b/tensorflow/contrib/lite/delegates/eager/delegate_data.h
@@ -32,14 +32,18 @@
   // The EagerContext that is required for execution of Eager Ops.
   tensorflow::EagerContext* GetEagerContext() { return eager_context_.get(); }
 
-  // Map from TF Lite tensor index to TensorFlow tensor.
-  BufferMap* GetBufferMap() { return &buffer_map_; }
+  // Map from TF Lite tensor index to TensorFlow tensor for a given context.
+  BufferMap* GetBufferMap(const TfLiteContext* context) {
+    return &buffer_map_[context];
+  }
 
  private:
   explicit DelegateData(tensorflow::EagerContext* eager_context);
 
   std::unique_ptr<tensorflow::EagerContext> eager_context_;
-  BufferMap buffer_map_;
+  // TODO(b/112439500): Clean up stale BufferMap instances after adding the
+  // necessary cleanup hook from a TfLiteContext to a TfLiteDelegate.
+  std::unordered_map<const TfLiteContext*, BufferMap> buffer_map_;
 };
 
 }  // namespace eager
diff --git a/tensorflow/contrib/lite/delegates/eager/delegate_data_test.cc b/tensorflow/contrib/lite/delegates/eager/delegate_data_test.cc
index 30251b8..b3a0ffc 100644
--- a/tensorflow/contrib/lite/delegates/eager/delegate_data_test.cc
+++ b/tensorflow/contrib/lite/delegates/eager/delegate_data_test.cc
@@ -16,6 +16,7 @@
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/context.h"
 #include "tensorflow/contrib/lite/testing/util.h"
 
 namespace tflite {
@@ -29,8 +30,12 @@
   // binary.
   EXPECT_TRUE(DelegateData::Create(&data).ok());
 
+  TfLiteContext dummy_context1 = {};
+  TfLiteContext dummy_context2 = {};
   EXPECT_NE(data->GetEagerContext(), nullptr);
-  EXPECT_NE(data->GetBufferMap(), nullptr);
+  EXPECT_NE(data->GetBufferMap(&dummy_context1), nullptr);
+  EXPECT_NE(data->GetBufferMap(&dummy_context1),
+            data->GetBufferMap(&dummy_context2));
 }
 
 }  // namespace
diff --git a/tensorflow/contrib/lite/delegates/eager/delegate_test.cc b/tensorflow/contrib/lite/delegates/eager/delegate_test.cc
index 88fb340..eb47f46 100644
--- a/tensorflow/contrib/lite/delegates/eager/delegate_test.cc
+++ b/tensorflow/contrib/lite/delegates/eager/delegate_test.cc
@@ -25,26 +25,24 @@
 using ::testing::ContainsRegex;
 using ::testing::ElementsAre;
 
-// TODO(nupurgarg): Add a test with multiple interpreters for one delegate.
-
 class DelegateTest : public testing::EagerModelTest {
  public:
   DelegateTest() {
-    // The delegate needs to be constructed before the interpreter because the
-    // interpreter references data contained in the delegate.
-    delegate_.reset(new EagerDelegate());
+    delegate_ = EagerDelegate::Create();
     interpreter_.reset(new Interpreter(&error_reporter_));
   }
 
   ~DelegateTest() override {
     // The delegate needs to be destructed after the interpreter because the
     // interpreter references data contained in the delegate.
-    delete interpreter_.release();
-    delete delegate_.release();
+    interpreter_.reset();
+    delegate_.reset();
   }
 
   void ConfigureDelegate() {
-    CHECK(delegate_->Apply(interpreter_.get()) == kTfLiteOk);
+    ASSERT_EQ(interpreter_->ModifyGraphWithDelegate(
+                  delegate_.get(), /*allow_dynamic_tensors=*/true),
+              kTfLiteOk);
   }
 
  private:
@@ -139,6 +137,56 @@
   ASSERT_THAT(GetValues(2), ElementsAre(1.1f, 4.4f, 9.9f, 17.6f));
 }
 
+TEST_F(DelegateTest, MultipleInterpretersSameDelegate) {
+  // Build a graph, configure the delegate and set inputs.
+  {
+    AddTensors(9, {0, 3}, {8}, kTfLiteFloat32, {3});
+    AddTfOp(testing::kUnpack, {0}, {1, 2});
+    AddTfOp(testing::kUnpack, {3}, {4, 5});
+    AddTfOp(testing::kAdd, {1, 4}, {6});
+    AddTfOp(testing::kAdd, {2, 5}, {7});
+    AddTfOp(testing::kMul, {6, 7}, {8});
+    ConfigureDelegate();
+    SetShape(0, {2, 2, 1});
+    SetValues(0, {1.1f, 2.2f, 3.3f, 4.4f});
+    SetShape(3, {2, 2, 1});
+    SetValues(3, {1.1f, 2.2f, 3.3f, 4.4f});
+  }
+
+  // Create a new interpreter, inject into the test framework and build
+  // a different graph using the *same* delegate.
+  std::unique_ptr<Interpreter> interpreter(new Interpreter(&error_reporter_));
+  interpreter_.swap(interpreter);
+  {
+    AddTensors(10, {0}, {9}, kTfLiteFloat32, {3});
+    AddTfOp(testing::kUnpack, {0}, {1, 2});
+    AddTfOp(testing::kAdd, {1, 2}, {3});
+    AddTfOp(testing::kUnpack, {3}, {4, 5});
+    AddTfLiteMulOp({4, 5}, {6});
+    AddTfOp(testing::kUnpack, {6}, {7, 8});
+    AddTfOp(testing::kAdd, {7, 8}, {9});
+    ConfigureDelegate();
+    SetShape(0, {2, 2, 2, 1});
+    SetValues(0, {3.0f, 1.0f, 0.5f, -1.0f, 0.0f, 1.0f, 1.5f, 3.0f});
+  }
+
+  // Swap back in the first interpreter and validate inference.
+  interpreter_.swap(interpreter);
+  {
+    ASSERT_TRUE(Invoke());
+    EXPECT_THAT(GetShape(8), ElementsAre(2, 1));
+    EXPECT_THAT(GetValues(8), ElementsAre(14.52f, 38.72f));
+  }
+
+  // Swap in the second interpreter and validate inference.
+  interpreter_.swap(interpreter);
+  {
+    ASSERT_TRUE(Invoke());
+    EXPECT_THAT(GetShape(9), ElementsAre(1));
+    EXPECT_THAT(GetValues(9), ElementsAre(10.0f));
+  }
+}
+
 }  // namespace
 }  // namespace eager
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/delegates/eager/kernel.cc b/tensorflow/contrib/lite/delegates/eager/kernel.cc
index 1727981..b8e3292 100644
--- a/tensorflow/contrib/lite/delegates/eager/kernel.cc
+++ b/tensorflow/contrib/lite/delegates/eager/kernel.cc
@@ -14,13 +14,14 @@
 ==============================================================================*/
 #include "tensorflow/contrib/lite/delegates/eager/kernel.h"
 
-#include "third_party/flatbuffers/include/flatbuffers/flexbuffers.h"
+#include "include/flatbuffers/flexbuffers.h"  // flatbuffers
 #include "tensorflow/contrib/lite/builtin_ops.h"
 #include "tensorflow/contrib/lite/context.h"
 #include "tensorflow/contrib/lite/context_util.h"
 #include "tensorflow/contrib/lite/delegates/eager/delegate_data.h"
 #include "tensorflow/contrib/lite/delegates/eager/util.h"
 #include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/string.h"
 #include "tensorflow/core/common_runtime/eager/context.h"
 #include "tensorflow/core/common_runtime/eager/execute.h"
 #include "tensorflow/core/common_runtime/eager/tensor_handle.h"
@@ -149,8 +150,8 @@
   op_data->eager_context =
       reinterpret_cast<DelegateData*>(params->delegate->data_)
           ->GetEagerContext();
-  op_data->buffer_map =
-      reinterpret_cast<DelegateData*>(params->delegate->data_)->GetBufferMap();
+  op_data->buffer_map = reinterpret_cast<DelegateData*>(params->delegate->data_)
+                            ->GetBufferMap(context);
 
   CHECK(params->output_tensors);
   for (auto tensor_index : TfLiteIntArrayView(params->output_tensors)) {
diff --git a/tensorflow/contrib/lite/delegates/eager/kernel_test.cc b/tensorflow/contrib/lite/delegates/eager/kernel_test.cc
index b7bfbb3..66f2226 100644
--- a/tensorflow/contrib/lite/delegates/eager/kernel_test.cc
+++ b/tensorflow/contrib/lite/delegates/eager/kernel_test.cc
@@ -55,12 +55,14 @@
     delegate_.data_ = delegate_data_.get();
     delegate_.FreeBufferHandle = nullptr;
     delegate_.Prepare = prepare_function;
-    delegate_.CopyFromBufferHandle = [](TfLiteDelegate* delegate,
+    delegate_.CopyFromBufferHandle = [](TfLiteContext* context,
+                                        TfLiteDelegate* delegate,
                                         TfLiteBufferHandle buffer_handle,
                                         void* data, size_t size) {
       auto* delegate_data = reinterpret_cast<DelegateData*>(delegate->data_);
-      tensorflow::StringPiece values =
-          delegate_data->GetBufferMap()->GetTensor(buffer_handle).tensor_data();
+      tensorflow::StringPiece values = delegate_data->GetBufferMap(context)
+                                           ->GetTensor(buffer_handle)
+                                           .tensor_data();
       memcpy(data, values.data(), values.size());
       return kTfLiteOk;
     };
diff --git a/tensorflow/contrib/lite/delegates/eager/test_util.cc b/tensorflow/contrib/lite/delegates/eager/test_util.cc
index 80acf5d..203afa6 100644
--- a/tensorflow/contrib/lite/delegates/eager/test_util.cc
+++ b/tensorflow/contrib/lite/delegates/eager/test_util.cc
@@ -16,7 +16,8 @@
 #include "tensorflow/contrib/lite/delegates/eager/test_util.h"
 
 #include "absl/memory/memory.h"
-#include "third_party/flatbuffers/include/flatbuffers/flexbuffers.h"
+#include "include/flatbuffers/flexbuffers.h"  // flatbuffers
+#include "tensorflow/contrib/lite/string.h"
 
 namespace tflite {
 namespace eager {
diff --git a/tensorflow/contrib/lite/delegates/eager/util_test.cc b/tensorflow/contrib/lite/delegates/eager/util_test.cc
index c4fbf54..53378a1 100644
--- a/tensorflow/contrib/lite/delegates/eager/util_test.cc
+++ b/tensorflow/contrib/lite/delegates/eager/util_test.cc
@@ -18,6 +18,7 @@
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/string.h"
 #include "tensorflow/contrib/lite/testing/util.h"
 
 namespace tflite {
diff --git a/tensorflow/contrib/lite/delegates/nnapi/BUILD b/tensorflow/contrib/lite/delegates/nnapi/BUILD
index 091f8fb..954955f 100644
--- a/tensorflow/contrib/lite/delegates/nnapi/BUILD
+++ b/tensorflow/contrib/lite/delegates/nnapi/BUILD
@@ -22,7 +22,10 @@
     name = "nnapi_delegate_test",
     size = "small",
     srcs = ["nnapi_delegate_test.cc"],
-    tags = ["no_oss"],
+    tags = [
+        "no_oss",
+        "noasan",  # TODO(b/112326936): re-enable for asan once fixed.
+    ],
     deps = [
         ":nnapi_delegate",
         "//tensorflow/contrib/lite:framework",
diff --git a/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate.cc b/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate.cc
index b1b8e98..e6cc3dd 100644
--- a/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate.cc
+++ b/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate.cc
@@ -27,7 +27,9 @@
 #include "tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h"
 
 #ifdef __ANDROID__
+#include <sys/mman.h>
 #include <sys/system_properties.h>
+#include <unistd.h>
 #endif
 
 namespace tflite {
@@ -80,6 +82,44 @@
   }
 };
 
+// Manage NNAPI shared memory handle
+class NNMemory {
+ public:
+  NNMemory(const char* name, size_t size) {
+#ifdef __ANDROID__
+    byte_size_ = size;
+    fd_ = ASharedMemory_create(name, size);
+    data_ptr_ = reinterpret_cast<uint8_t*>(
+        mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd_, 0));
+    ANeuralNetworksMemory_createFromFd(size, PROT_READ | PROT_WRITE, fd_, 0,
+                                       &nn_memory_handle_);
+#endif
+  }
+
+  ~NNMemory() {
+#ifdef __ANDROID__
+    if (data_ptr_) {
+      munmap(data_ptr_, byte_size_);
+    }
+    if (nn_memory_handle_) {
+      ANeuralNetworksMemory_free(nn_memory_handle_);
+    }
+    if (fd_ > 0) close(fd_);
+#endif
+  }
+
+  ANeuralNetworksMemory* get_handle() { return nn_memory_handle_; }
+  uint8_t* get_data_ptr() { return data_ptr_; }
+
+ private:
+#ifdef __ANDROID__
+  int fd_ = 0;
+  size_t byte_size_ = 0;
+#endif
+  uint8_t* data_ptr_ = nullptr;
+  ANeuralNetworksMemory* nn_memory_handle_ = nullptr;
+};  // class NNMemory
+
 // Track tensor indices to NN API tensor indices mapping.
 class OperandMapping {
  public:
@@ -326,15 +366,21 @@
   std::vector<uint32_t> augmented_outputs_;
 };
 
+struct NNAPIOpMappingArgs {
+  TfLiteContext* context;
+  NNAPIOpBuilder* builder;
+  TfLiteNode* node;
+  std::vector<int>* model_state_inputs;
+  std::vector<int>* model_state_tfl_outputs;
+};
+
 // The kernel that represents the subgraph of TF Lite being run on NN API.
 class NNAPIDelegateKernel {
  public:
   NNAPIDelegateKernel() = default;
 
   typedef ANeuralNetworksOperationType (*MappingFn)(
-      TfLiteContext*, NNAPIOpBuilder* builder, TfLiteNode* node,
-      std::vector<int>* model_state_inputs,
-      std::vector<int>* model_state_tfl_outputs);
+      const NNAPIOpMappingArgs& mapping_args);
 
   // Return a function that knows how to translate a node into its operands
   // when called. You can use this function to see if a node is supported
@@ -344,13 +390,11 @@
     switch (builtin_code) {
       case kTfLiteBuiltinAdd:
         if (version == 1) {
-          return [](TfLiteContext* context, NNAPIOpBuilder* builder,
-                    TfLiteNode* node, std::vector<int>* model_state_inputs,
-                    std::vector<int>* model_state_tfl_outputs)
+          return [](const NNAPIOpMappingArgs& mapping_args)
                      -> ANeuralNetworksOperationType {
-            auto builtin =
-                reinterpret_cast<TfLiteAddParams*>(node->builtin_data);
-            builder->AddScalarInt32Operand(builtin->activation);
+            auto builtin = reinterpret_cast<TfLiteAddParams*>(
+                mapping_args.node->builtin_data);
+            mapping_args.builder->AddScalarInt32Operand(builtin->activation);
             return ANEURALNETWORKS_ADD;
           };
         } else {
@@ -359,13 +403,11 @@
         break;
       case kTfLiteBuiltinMul:
         if (version == 1) {
-          return [](TfLiteContext* context, NNAPIOpBuilder* builder,
-                    TfLiteNode* node, std::vector<int>* model_state_inputs,
-                    std::vector<int>* model_state_tfl_outputs)
+          return [](const NNAPIOpMappingArgs& mapping_args)
                      -> ANeuralNetworksOperationType {
-            auto builtin =
-                reinterpret_cast<TfLiteMulParams*>(node->builtin_data);
-            builder->AddScalarInt32Operand(builtin->activation);
+            auto builtin = reinterpret_cast<TfLiteMulParams*>(
+                mapping_args.node->builtin_data);
+            mapping_args.builder->AddScalarInt32Operand(builtin->activation);
             return ANEURALNETWORKS_MUL;
           };
         } else {
@@ -374,11 +416,10 @@
         break;
       case kTfLiteBuiltinAveragePool2d:
         if (version == 1) {
-          return [](TfLiteContext* context, NNAPIOpBuilder* builder,
-                    TfLiteNode* node, std::vector<int>* model_state_inputs,
-                    std::vector<int>* model_state_tfl_outputs)
+          return [](const NNAPIOpMappingArgs& mapping_args)
                      -> ANeuralNetworksOperationType {
-            builder->AddPoolingParams(node->builtin_data);
+            mapping_args.builder->AddPoolingParams(
+                mapping_args.node->builtin_data);
             return ANEURALNETWORKS_AVERAGE_POOL_2D;
           };
         } else {
@@ -387,11 +428,10 @@
         break;
       case kTfLiteBuiltinMaxPool2d:
         if (version == 1) {
-          return [](TfLiteContext* context, NNAPIOpBuilder* builder,
-                    TfLiteNode* node, std::vector<int>* model_state_inputs,
-                    std::vector<int>* model_state_tfl_outputs)
+          return [](const NNAPIOpMappingArgs& mapping_args)
                      -> ANeuralNetworksOperationType {
-            builder->AddPoolingParams(node->builtin_data);
+            mapping_args.builder->AddPoolingParams(
+                mapping_args.node->builtin_data);
             return ANEURALNETWORKS_MAX_POOL_2D;
           };
         } else {
@@ -400,11 +440,10 @@
         break;
       case kTfLiteBuiltinL2Pool2d:
         if (version == 1) {
-          return [](TfLiteContext* context, NNAPIOpBuilder* builder,
-                    TfLiteNode* node, std::vector<int>* model_state_inputs,
-                    std::vector<int>* model_state_tfl_outputs)
+          return [](const NNAPIOpMappingArgs& mapping_args)
                      -> ANeuralNetworksOperationType {
-            builder->AddPoolingParams(node->builtin_data);
+            mapping_args.builder->AddPoolingParams(
+                mapping_args.node->builtin_data);
             return ANEURALNETWORKS_L2_POOL_2D;
           };
         } else {
@@ -420,16 +459,14 @@
             // NNAPI does not support dilated Conv2D.
             return nullptr;
           }
-          return [](TfLiteContext* context, NNAPIOpBuilder* builder,
-                    TfLiteNode* node, std::vector<int>* model_state_inputs,
-                    std::vector<int>* model_state_tfl_outputs)
+          return [](const NNAPIOpMappingArgs& mapping_args)
                      -> ANeuralNetworksOperationType {
-            auto builtin =
-                reinterpret_cast<TfLiteConvParams*>(node->builtin_data);
-            builder->AddScalarInt32Operand(builtin->padding);
-            builder->AddScalarInt32Operand(builtin->stride_width);
-            builder->AddScalarInt32Operand(builtin->stride_height);
-            builder->AddScalarInt32Operand(builtin->activation);
+            auto builtin = reinterpret_cast<TfLiteConvParams*>(
+                mapping_args.node->builtin_data);
+            mapping_args.builder->AddScalarInt32Operand(builtin->padding);
+            mapping_args.builder->AddScalarInt32Operand(builtin->stride_width);
+            mapping_args.builder->AddScalarInt32Operand(builtin->stride_height);
+            mapping_args.builder->AddScalarInt32Operand(builtin->activation);
             return ANEURALNETWORKS_CONV_2D;
           };
         } else {
@@ -438,17 +475,16 @@
         break;
       case kTfLiteBuiltinDepthwiseConv2d:
         if (version == 1) {
-          return [](TfLiteContext* context, NNAPIOpBuilder* builder,
-                    TfLiteNode* node, std::vector<int>* model_state_inputs,
-                    std::vector<int>* model_state_tfl_outputs)
+          return [](const NNAPIOpMappingArgs& mapping_args)
                      -> ANeuralNetworksOperationType {
             auto builtin = reinterpret_cast<TfLiteDepthwiseConvParams*>(
-                node->builtin_data);
-            builder->AddScalarInt32Operand(builtin->padding);
-            builder->AddScalarInt32Operand(builtin->stride_width);
-            builder->AddScalarInt32Operand(builtin->stride_height);
-            builder->AddScalarInt32Operand(builtin->depth_multiplier);
-            builder->AddScalarInt32Operand(builtin->activation);
+                mapping_args.node->builtin_data);
+            mapping_args.builder->AddScalarInt32Operand(builtin->padding);
+            mapping_args.builder->AddScalarInt32Operand(builtin->stride_width);
+            mapping_args.builder->AddScalarInt32Operand(builtin->stride_height);
+            mapping_args.builder->AddScalarInt32Operand(
+                builtin->depth_multiplier);
+            mapping_args.builder->AddScalarInt32Operand(builtin->activation);
             return ANEURALNETWORKS_DEPTHWISE_CONV_2D;
           };
         } else {
@@ -457,13 +493,11 @@
         break;
       case kTfLiteBuiltinFullyConnected:
         if (version == 1) {
-          return [](TfLiteContext* context, NNAPIOpBuilder* builder,
-                    TfLiteNode* node, std::vector<int>* model_state_inputs,
-                    std::vector<int>* model_state_tfl_outputs)
+          return [](const NNAPIOpMappingArgs& mapping_args)
                      -> ANeuralNetworksOperationType {
             auto builtin = reinterpret_cast<TfLiteFullyConnectedParams*>(
-                node->builtin_data);
-            builder->AddScalarInt32Operand(builtin->activation);
+                mapping_args.node->builtin_data);
+            mapping_args.builder->AddScalarInt32Operand(builtin->activation);
             return ANEURALNETWORKS_FULLY_CONNECTED;
           };
         } else {
@@ -472,13 +506,11 @@
         break;
       case kTfLiteBuiltinSoftmax:
         if (version == 1) {
-          return [](TfLiteContext* context, NNAPIOpBuilder* builder,
-                    TfLiteNode* node, std::vector<int>* model_state_inputs,
-                    std::vector<int>* model_state_tfl_outputs)
+          return [](const NNAPIOpMappingArgs& mapping_args)
                      -> ANeuralNetworksOperationType {
-            auto builtin =
-                reinterpret_cast<TfLiteSoftmaxParams*>(node->builtin_data);
-            builder->AddScalarFloat32Operand(builtin->beta);
+            auto builtin = reinterpret_cast<TfLiteSoftmaxParams*>(
+                mapping_args.node->builtin_data);
+            mapping_args.builder->AddScalarFloat32Operand(builtin->beta);
             return ANEURALNETWORKS_SOFTMAX;
           };
         } else {
@@ -487,9 +519,7 @@
         break;
       case kTfLiteBuiltinReshape:
         if (version == 1) {
-          return [](TfLiteContext* context, NNAPIOpBuilder* builder,
-                    TfLiteNode* node, std::vector<int>* model_state_inputs,
-                    std::vector<int>* model_state_tfl_outputs)
+          return [](const NNAPIOpMappingArgs& mapping_args)
                      -> ANeuralNetworksOperationType {
             return ANEURALNETWORKS_RESHAPE;
           };
@@ -499,15 +529,13 @@
         break;
       case kTfLiteBuiltinSqueeze:
         if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11) {
-          return [](TfLiteContext* context, NNAPIOpBuilder* builder,
-                    TfLiteNode* node, std::vector<int>* model_state_inputs,
-                    std::vector<int>* model_state_tfl_outputs)
+          return [](const NNAPIOpMappingArgs& mapping_args)
                      -> ANeuralNetworksOperationType {
-            auto builtin =
-                reinterpret_cast<TfLiteSqueezeParams*>(node->builtin_data);
+            auto builtin = reinterpret_cast<TfLiteSqueezeParams*>(
+                mapping_args.node->builtin_data);
             // Note that we add the squeeze dimensions even if the dimensions
             // were unspecified (empty), as NNAPI requires the operand.
-            builder->AddVectorInt32Operand(
+            mapping_args.builder->AddVectorInt32Operand(
                 builtin->squeeze_dims,
                 static_cast<uint32_t>(builtin->num_squeeze_dims));
             return ANEURALNETWORKS_SQUEEZE;
@@ -522,25 +550,21 @@
           // NNAPI does not support activations
           return nullptr;
         }
-        return [](TfLiteContext* context, NNAPIOpBuilder* builder,
-                  TfLiteNode* node, std::vector<int>* model_state_inputs,
-                  std::vector<int>* model_state_tfl_outputs)
+        return [](const NNAPIOpMappingArgs& mapping_args)
                    -> ANeuralNetworksOperationType {
           return ANEURALNETWORKS_L2_NORMALIZATION;
         };
       }
       case kTfLiteBuiltinLocalResponseNormalization:
         if (version == 1) {
-          return [](TfLiteContext* context, NNAPIOpBuilder* builder,
-                    TfLiteNode* node, std::vector<int>* model_state_inputs,
-                    std::vector<int>* model_state_tfl_outputs)
+          return [](const NNAPIOpMappingArgs& mapping_args)
                      -> ANeuralNetworksOperationType {
             auto builtin = reinterpret_cast<TfLiteLocalResponseNormParams*>(
-                node->builtin_data);
-            builder->AddScalarInt32Operand(builtin->radius);
-            builder->AddScalarFloat32Operand(builtin->bias);
-            builder->AddScalarFloat32Operand(builtin->alpha);
-            builder->AddScalarFloat32Operand(builtin->beta);
+                mapping_args.node->builtin_data);
+            mapping_args.builder->AddScalarInt32Operand(builtin->radius);
+            mapping_args.builder->AddScalarFloat32Operand(builtin->bias);
+            mapping_args.builder->AddScalarFloat32Operand(builtin->alpha);
+            mapping_args.builder->AddScalarFloat32Operand(builtin->beta);
             return ANEURALNETWORKS_LOCAL_RESPONSE_NORMALIZATION;
           };
         } else {
@@ -556,13 +580,11 @@
                   ->type == kTfLiteLshProjectionSparse) {
             return nullptr;
           }
-          return [](TfLiteContext* context, NNAPIOpBuilder* builder,
-                    TfLiteNode* node, std::vector<int>* model_state_inputs,
-                    std::vector<int>* model_state_tfl_outputs)
+          return [](const NNAPIOpMappingArgs& mapping_args)
                      -> ANeuralNetworksOperationType {
             auto builtin = reinterpret_cast<TfLiteLSHProjectionParams*>(
-                node->builtin_data);
-            builder->AddScalarInt32Operand(builtin->type);
+                mapping_args.node->builtin_data);
+            mapping_args.builder->AddScalarInt32Operand(builtin->type);
             return ANEURALNETWORKS_LSH_PROJECTION;
           };
         } else {
@@ -585,13 +607,11 @@
               }
             }
           }
-          return [](TfLiteContext* context, NNAPIOpBuilder* builder,
-                    TfLiteNode* node, std::vector<int>* model_state_inputs,
-                    std::vector<int>* model_state_tfl_outputs)
+          return [](const NNAPIOpMappingArgs& mapping_args)
                      -> ANeuralNetworksOperationType {
             auto builtin = reinterpret_cast<TfLiteConcatenationParams*>(
-                node->builtin_data);
-            builder->AddScalarInt32Operand(builtin->axis);
+                mapping_args.node->builtin_data);
+            mapping_args.builder->AddScalarInt32Operand(builtin->axis);
             return ANEURALNETWORKS_CONCATENATION;
           };
         } else {
@@ -600,9 +620,7 @@
         break;
       case kTfLiteBuiltinDequantize:
         if (version == 1) {
-          return [](TfLiteContext* context, NNAPIOpBuilder* builder,
-                    TfLiteNode* node, std::vector<int>* model_state_inputs,
-                    std::vector<int>* model_state_tfl_outputs)
+          return [](const NNAPIOpMappingArgs& mapping_args)
                      -> ANeuralNetworksOperationType {
             return ANEURALNETWORKS_DEQUANTIZE;
           };
@@ -612,9 +630,7 @@
         break;
       case kTfLiteBuiltinFloor:
         if (version == 1) {
-          return [](TfLiteContext* context, NNAPIOpBuilder* builder,
-                    TfLiteNode* node, std::vector<int>* model_state_inputs,
-                    std::vector<int>* model_state_tfl_outputs)
+          return [](const NNAPIOpMappingArgs& mapping_args)
                      -> ANeuralNetworksOperationType {
             return ANEURALNETWORKS_FLOOR;
           };
@@ -624,9 +640,7 @@
         break;
       case kTfLiteBuiltinRelu:
         if (version == 1) {
-          return [](TfLiteContext* context, NNAPIOpBuilder* builder,
-                    TfLiteNode* node, std::vector<int>* model_state_inputs,
-                    std::vector<int>* model_state_tfl_outputs)
+          return [](const NNAPIOpMappingArgs& mapping_args)
                      -> ANeuralNetworksOperationType {
             return ANEURALNETWORKS_RELU;
           };
@@ -636,9 +650,7 @@
         break;
       case kTfLiteBuiltinReluN1To1:
         if (version == 1) {
-          return [](TfLiteContext* context, NNAPIOpBuilder* builder,
-                    TfLiteNode* node, std::vector<int>* model_state_inputs,
-                    std::vector<int>* model_state_tfl_outputs)
+          return [](const NNAPIOpMappingArgs& mapping_args)
                      -> ANeuralNetworksOperationType {
             return ANEURALNETWORKS_RELU1;
           };
@@ -648,9 +660,7 @@
         break;
       case kTfLiteBuiltinRelu6:
         if (version == 1) {
-          return [](TfLiteContext* context, NNAPIOpBuilder* builder,
-                    TfLiteNode* node, std::vector<int>* model_state_inputs,
-                    std::vector<int>* model_state_tfl_outputs)
+          return [](const NNAPIOpMappingArgs& mapping_args)
                      -> ANeuralNetworksOperationType {
             return ANEURALNETWORKS_RELU6;
           };
@@ -660,9 +670,7 @@
         break;
       case kTfLiteBuiltinLogistic:
         if (version == 1) {
-          return [](TfLiteContext* context, NNAPIOpBuilder* builder,
-                    TfLiteNode* node, std::vector<int>* model_state_inputs,
-                    std::vector<int>* model_state_tfl_outputs)
+          return [](const NNAPIOpMappingArgs& mapping_args)
                      -> ANeuralNetworksOperationType {
             return ANEURALNETWORKS_LOGISTIC;
           };
@@ -675,9 +683,7 @@
         if (version == 1 &&
             context->tensors[node->inputs->data[0]].type == kTfLiteFloat32) {
           // NNAPI only support float tanh.
-          return [](TfLiteContext* context, NNAPIOpBuilder* builder,
-                    TfLiteNode* node, std::vector<int>* model_state_inputs,
-                    std::vector<int>* model_state_tfl_outputs)
+          return [](const NNAPIOpMappingArgs& mapping_args)
                      -> ANeuralNetworksOperationType {
             return ANEURALNETWORKS_TANH;
           };
@@ -689,13 +695,11 @@
         if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11 &&
             context->tensors[node->inputs->data[0]].type == kTfLiteFloat32) {
           // NNAPI only support float sub.
-          return [](TfLiteContext* context, NNAPIOpBuilder* builder,
-                    TfLiteNode* node, std::vector<int>* model_state_inputs,
-                    std::vector<int>* model_state_tfl_outputs)
+          return [](const NNAPIOpMappingArgs& mapping_args)
                      -> ANeuralNetworksOperationType {
-            auto builtin =
-                reinterpret_cast<TfLiteSubParams*>(node->builtin_data);
-            builder->AddScalarInt32Operand(builtin->activation);
+            auto builtin = reinterpret_cast<TfLiteSubParams*>(
+                mapping_args.node->builtin_data);
+            mapping_args.builder->AddScalarInt32Operand(builtin->activation);
             return ANEURALNETWORKS_SUB;
           };
         } else {
@@ -706,13 +710,11 @@
         if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11 &&
             context->tensors[node->inputs->data[0]].type == kTfLiteFloat32) {
           // NNAPI only support float div.
-          return [](TfLiteContext* context, NNAPIOpBuilder* builder,
-                    TfLiteNode* node, std::vector<int>* model_state_inputs,
-                    std::vector<int>* model_state_tfl_outputs)
+          return [](const NNAPIOpMappingArgs& mapping_args)
                      -> ANeuralNetworksOperationType {
-            auto builtin =
-                reinterpret_cast<TfLiteDivParams*>(node->builtin_data);
-            builder->AddScalarInt32Operand(builtin->activation);
+            auto builtin = reinterpret_cast<TfLiteDivParams*>(
+                mapping_args.node->builtin_data);
+            mapping_args.builder->AddScalarInt32Operand(builtin->activation);
             return ANEURALNETWORKS_DIV;
           };
         } else {
@@ -726,9 +728,7 @@
           // NNAPI does not support specifying the padding value.
           // NNAPI pads physical zero for quantized tensors, so only delegate
           // float pad to NNAPI.
-          return [](TfLiteContext* context, NNAPIOpBuilder* builder,
-                    TfLiteNode* node, std::vector<int>* model_state_inputs,
-                    std::vector<int>* model_state_tfl_outputs)
+          return [](const NNAPIOpMappingArgs& mapping_args)
                      -> ANeuralNetworksOperationType {
             return ANEURALNETWORKS_PAD;
           };
@@ -738,9 +738,7 @@
         break;
       case kTfLiteBuiltinSpaceToBatchNd:
         if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11) {
-          return [](TfLiteContext* context, NNAPIOpBuilder* builder,
-                    TfLiteNode* node, std::vector<int>* model_state_inputs,
-                    std::vector<int>* model_state_tfl_outputs)
+          return [](const NNAPIOpMappingArgs& mapping_args)
                      -> ANeuralNetworksOperationType {
             return ANEURALNETWORKS_SPACE_TO_BATCH_ND;
           };
@@ -750,15 +748,14 @@
         break;
       case kTfLiteBuiltinStridedSlice:
         if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11) {
-          return [](TfLiteContext* context, NNAPIOpBuilder* builder,
-                    TfLiteNode* node, std::vector<int>* model_state_inputs,
-                    std::vector<int>* model_state_tfl_outputs)
+          return [](const NNAPIOpMappingArgs& mapping_args)
                      -> ANeuralNetworksOperationType {
-            auto builtin =
-                reinterpret_cast<TfLiteStridedSliceParams*>(node->builtin_data);
-            builder->AddScalarInt32Operand(builtin->begin_mask);
-            builder->AddScalarInt32Operand(builtin->end_mask);
-            builder->AddScalarInt32Operand(builtin->shrink_axis_mask);
+            auto builtin = reinterpret_cast<TfLiteStridedSliceParams*>(
+                mapping_args.node->builtin_data);
+            mapping_args.builder->AddScalarInt32Operand(builtin->begin_mask);
+            mapping_args.builder->AddScalarInt32Operand(builtin->end_mask);
+            mapping_args.builder->AddScalarInt32Operand(
+                builtin->shrink_axis_mask);
             return ANEURALNETWORKS_STRIDED_SLICE;
           };
         } else {
@@ -774,9 +771,7 @@
             (node->inputs->size > 1) &&
             (context->tensors[node->inputs->data[1]].allocation_type ==
              kTfLiteMmapRo)) {
-          return [](TfLiteContext* context, NNAPIOpBuilder* builder,
-                    TfLiteNode* node, std::vector<int>* model_state_inputs,
-                    std::vector<int>* model_state_tfl_outputs)
+          return [](const NNAPIOpMappingArgs& mapping_args)
                      -> ANeuralNetworksOperationType {
             return ANEURALNETWORKS_TRANSPOSE;
           };
@@ -790,20 +785,19 @@
         if (version == 1 &&
             context->tensors[node->inputs->data[/*kWeightsTensor*/ 1]].type ==
                 kTfLiteFloat32) {
-          return [](TfLiteContext* context, NNAPIOpBuilder* builder,
-                    TfLiteNode* node, std::vector<int>* model_state_inputs,
-                    std::vector<int>* model_state_tfl_outputs)
+          return [](const NNAPIOpMappingArgs& mapping_args)
                      -> ANeuralNetworksOperationType {
             // NNAPI need both state_in and state_out.
             int ann_index;
-            builder->AddStateFloat32Tensor(
-                node->outputs->data[/*kHiddenStateTensor*/ 0], &ann_index);
-            model_state_inputs->push_back(ann_index);
-            model_state_tfl_outputs->push_back(
-                node->outputs->data[/*kHiddenStateTensor*/ 0]);
-            auto builtin =
-                reinterpret_cast<TfLiteRNNParams*>(node->builtin_data);
-            builder->AddScalarInt32Operand(builtin->activation);
+            mapping_args.builder->AddStateFloat32Tensor(
+                mapping_args.node->outputs->data[/*kHiddenStateTensor*/ 0],
+                &ann_index);
+            mapping_args.model_state_inputs->push_back(ann_index);
+            mapping_args.model_state_tfl_outputs->push_back(
+                mapping_args.node->outputs->data[/*kHiddenStateTensor*/ 0]);
+            auto builtin = reinterpret_cast<TfLiteRNNParams*>(
+                mapping_args.node->builtin_data);
+            mapping_args.builder->AddScalarInt32Operand(builtin->activation);
             return ANEURALNETWORKS_RNN;
           };
         } else {
@@ -815,22 +809,21 @@
         if (version == 1 &&
             context->tensors[node->inputs->data[/*kWeightsFeatureTensor*/ 1]]
                     .type == kTfLiteFloat32) {
-          return [](TfLiteContext* context, NNAPIOpBuilder* builder,
-                    TfLiteNode* node, std::vector<int>* model_state_inputs,
-                    std::vector<int>* model_state_tfl_outputs)
+          return [](const NNAPIOpMappingArgs& mapping_args)
                      -> ANeuralNetworksOperationType {
             // NNAPI need both state_in and state_out.
             int ann_index;
-            builder->AddStateFloat32Tensor(
-                node->outputs->data[/*kStateTensor*/ 0], &ann_index);
-            model_state_inputs->push_back(ann_index);
-            model_state_tfl_outputs->push_back(
-                node->outputs->data[/*kStateTensor*/ 0]);
+            mapping_args.builder->AddStateFloat32Tensor(
+                mapping_args.node->outputs->data[/*kStateTensor*/ 0],
+                &ann_index);
+            mapping_args.model_state_inputs->push_back(ann_index);
+            mapping_args.model_state_tfl_outputs->push_back(
+                mapping_args.node->outputs->data[/*kStateTensor*/ 0]);
 
-            auto builtin =
-                reinterpret_cast<TfLiteSVDFParams*>(node->builtin_data);
-            builder->AddScalarInt32Operand(builtin->rank);
-            builder->AddScalarInt32Operand(builtin->activation);
+            auto builtin = reinterpret_cast<TfLiteSVDFParams*>(
+                mapping_args.node->builtin_data);
+            mapping_args.builder->AddScalarInt32Operand(builtin->rank);
+            mapping_args.builder->AddScalarInt32Operand(builtin->activation);
             return ANEURALNETWORKS_SVDF;
           };
         } else {
@@ -844,33 +837,33 @@
             context->tensors[node->inputs
                                  ->data[/*kInputToOutputWeightsTensor*/ 4]]
                     .type == kTfLiteFloat32) {
-          return [](TfLiteContext* context, NNAPIOpBuilder* builder,
-                    TfLiteNode* node, std::vector<int>* model_state_inputs,
-                    std::vector<int>* model_state_tfl_outputs)
+          return [](const NNAPIOpMappingArgs& mapping_args)
                      -> ANeuralNetworksOperationType {
             // NNAPI need both state_in and state_out for cell_state and
             // output_state.
             int ann_index;
-            builder->AddStateFloat32Tensor(
-                node->outputs->data[/*kOutputStateTensor*/ 0], &ann_index);
-            model_state_inputs->push_back(ann_index);
-            model_state_tfl_outputs->push_back(
-                node->outputs->data[/*kOutputStateTensor*/ 0]);
-            builder->AddStateFloat32Tensor(
-                node->outputs->data[/*kCellStateTensor*/ 1], &ann_index);
-            model_state_inputs->push_back(ann_index);
-            model_state_tfl_outputs->push_back(
-                node->outputs->data[/*kCellStateTensor*/ 1]);
+            mapping_args.builder->AddStateFloat32Tensor(
+                mapping_args.node->outputs->data[/*kOutputStateTensor*/ 0],
+                &ann_index);
+            mapping_args.model_state_inputs->push_back(ann_index);
+            mapping_args.model_state_tfl_outputs->push_back(
+                mapping_args.node->outputs->data[/*kOutputStateTensor*/ 0]);
+            mapping_args.builder->AddStateFloat32Tensor(
+                mapping_args.node->outputs->data[/*kCellStateTensor*/ 1],
+                &ann_index);
+            mapping_args.model_state_inputs->push_back(ann_index);
+            mapping_args.model_state_tfl_outputs->push_back(
+                mapping_args.node->outputs->data[/*kCellStateTensor*/ 1]);
 
-            auto builtin =
-                reinterpret_cast<TfLiteLSTMParams*>(node->builtin_data);
-            builder->AddScalarInt32Operand(builtin->activation);
-            builder->AddScalarFloat32Operand(builtin->cell_clip);
-            builder->AddScalarFloat32Operand(builtin->proj_clip);
+            auto builtin = reinterpret_cast<TfLiteLSTMParams*>(
+                mapping_args.node->builtin_data);
+            mapping_args.builder->AddScalarInt32Operand(builtin->activation);
+            mapping_args.builder->AddScalarFloat32Operand(builtin->cell_clip);
+            mapping_args.builder->AddScalarFloat32Operand(builtin->proj_clip);
 
             // Current NNAPI implementation requires the sratch_buffer as
             // output.
-            builder->AddAdditionalFloat32OutputTensor(2);
+            mapping_args.builder->AddAdditionalFloat32OutputTensor(2);
             return ANEURALNETWORKS_LSTM;
           };
         } else {
@@ -882,15 +875,13 @@
         if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11 &&
             context->tensors[node->inputs->data[0]].type == kTfLiteFloat32 &&
             context->tensors[node->outputs->data[0]].dims->size > 0) {
-          return [](TfLiteContext* context, NNAPIOpBuilder* builder,
-                    TfLiteNode* node, std::vector<int>* model_state_inputs,
-                    std::vector<int>* model_state_tfl_outputs)
+          return [](const NNAPIOpMappingArgs& mapping_args)
                      -> ANeuralNetworksOperationType {
-            auto builtin =
-                reinterpret_cast<TfLiteReducerParams*>(node->builtin_data);
+            auto builtin = reinterpret_cast<TfLiteReducerParams*>(
+                mapping_args.node->builtin_data);
             int32_t keep_dims = 0;
             if (builtin->keep_dims) keep_dims = 1;
-            builder->AddScalarInt32Operand(keep_dims);
+            mapping_args.builder->AddScalarInt32Operand(keep_dims);
             return ANEURALNETWORKS_MEAN;
           };
         } else {
@@ -900,9 +891,7 @@
         // NNAPI only support float32 values.
         if (version == 1 &&
             context->tensors[node->inputs->data[1]].type == kTfLiteFloat32) {
-          return [](TfLiteContext* context, NNAPIOpBuilder* builder,
-                    TfLiteNode* node, std::vector<int>* model_state_inputs,
-                    std::vector<int>* model_state_tfl_outputs)
+          return [](const NNAPIOpMappingArgs& mapping_args)
                      -> ANeuralNetworksOperationType {
             return ANEURALNETWORKS_EMBEDDING_LOOKUP;
           };
@@ -914,9 +903,7 @@
         // NNAPI only support float32 output.
         if (version == 1 &&
             context->tensors[node->outputs->data[0]].type == kTfLiteFloat32) {
-          return [](TfLiteContext* context, NNAPIOpBuilder* builder,
-                    TfLiteNode* node, std::vector<int>* model_state_inputs,
-                    std::vector<int>* model_state_tfl_outputs)
+          return [](const NNAPIOpMappingArgs& mapping_args)
                      -> ANeuralNetworksOperationType {
             return ANEURALNETWORKS_HASHTABLE_LOOKUP;
           };
@@ -964,6 +951,8 @@
     // absolute indices but NN api indices inputs by relative indices.
     int relative_input_index = 0;
     int num_optional_tensors = 0;
+
+    size_t input_offset = 0;
     for (auto absolute_input_index : TfLiteIntArrayView(node->inputs)) {
       if (absolute_input_index == kOptionalTensor) {
         num_optional_tensors++;
@@ -973,20 +962,28 @@
       // TODO(miaowang): make sure the delegation works with dequantized weights
       // as intermediate tensors.
       if (tensor->allocation_type != kTfLiteMmapRo) {
-        CHECK_NN(context, ANeuralNetworksExecution_setInput(
+        // copy data to pre-allocated shared memory.
+        memcpy(nn_input_memory_->get_data_ptr() + input_offset,
+               tensor->data.raw, tensor->bytes);
+        CHECK_NN(context, ANeuralNetworksExecution_setInputFromMemory(
                               execution, relative_input_index, nullptr,
-                              tensor->data.raw, tensor->bytes));
+                              nn_input_memory_->get_handle(), input_offset,
+                              tensor->bytes));
+        input_offset += tensor->bytes;
         relative_input_index++;
       }
     }
 
     // Set the output tensor buffers.
     int relative_output_index = 0;
+    size_t output_offset = 0;
     for (auto output_index : TfLiteIntArrayView(node->outputs)) {
       TfLiteTensor* tensor = &context->tensors[output_index];
-      CHECK_NN(context, ANeuralNetworksExecution_setOutput(
+      CHECK_NN(context, ANeuralNetworksExecution_setOutputFromMemory(
                             execution, relative_output_index, nullptr,
-                            tensor->data.raw, tensor->bytes));
+                            nn_output_memory_->get_handle(), output_offset,
+                            tensor->bytes));
+      output_offset += tensor->bytes;
       relative_output_index++;
     }
 
@@ -1010,6 +1007,15 @@
     ANeuralNetworksEvent_free(event);
     ANeuralNetworksExecution_free(execution);
 
+    // copy results from shared memory to the destination.
+    output_offset = 0;
+    for (auto output_index : TfLiteIntArrayView(node->outputs)) {
+      TfLiteTensor* tensor = &context->tensors[output_index];
+      memcpy(tensor->data.raw,
+             nn_output_memory_->get_data_ptr() + output_offset, tensor->bytes);
+      output_offset += tensor->bytes;
+    }
+
     return kTfLiteOk;
   }
 
@@ -1027,6 +1033,9 @@
   std::vector<int> model_state_inputs_;
   std::vector<int> model_state_tfl_outputs_;
 
+  std::unique_ptr<NNMemory> nn_input_memory_;
+  std::unique_ptr<NNMemory> nn_output_memory_;
+
   TfLiteStatus AddOpsAndTensors(TfLiteContext* context) {
     // The operand builder allows creating a single op. We create it at this
     // reduced power position rather than in the for loop to avoid reallocating
@@ -1054,9 +1063,9 @@
         }
       }
       // Get op type and operands
-      int nn_op_type = Map(context, reg->builtin_code, reg->version, node)(
-          context, &builder, node, &model_state_inputs_,
-          &model_state_tfl_outputs_);
+      int nn_op_type = Map(context, reg->builtin_code, reg->version,
+                           node)({context, &builder, node, &model_state_inputs_,
+                                  &model_state_tfl_outputs_});
       // Map outputs to NN API tensor indices.
       for (auto output_index : TfLiteIntArrayView(node->outputs)) {
         TF_LITE_ENSURE_STATUS(builder.AddTensorOutput(output_index));
@@ -1077,21 +1086,27 @@
     inputs.reserve(input_tensors->size);
     std::vector<uint32_t> outputs;
     outputs.reserve(output_tensors->size);
+
+    size_t total_input_byte_size = 0;
     // Make the TensorFlow lite inputs and outputs to ann_indices.
     for (int i : TfLiteIntArrayView(input_tensors)) {
       // Constant tensors are not NNAPI inputs.
       if (i != kOptionalTensor &&
           context->tensors[i].allocation_type != kTfLiteMmapRo) {
         inputs.push_back(operand_mapping_.lite_index_to_ann(i));
+        total_input_byte_size += context->tensors[i].bytes;
       }
     }
+
     // Add state input tensors as model inputs
     for (int i : model_state_inputs_) {
       inputs.push_back(i);
     }
 
+    size_t total_output_byte_size = 0;
     for (int i : TfLiteIntArrayView(output_tensors)) {
       outputs.push_back(operand_mapping_.lite_index_to_ann(i));
+      total_output_byte_size += context->tensors[i].bytes;
     }
 
     // Tell ANN to declare inputs/outputs
@@ -1101,6 +1116,11 @@
     // Finalize the model
     CHECK_NN(context, ANeuralNetworksModel_finish(nn_model_.get()));
 
+    // Create shared memory pool for inputs and outputs.
+    nn_input_memory_.reset(new NNMemory("input_pool", total_input_byte_size));
+    nn_output_memory_.reset(
+        new NNMemory("output_pool", total_output_byte_size));
+
     return kTfLiteOk;
   }
 };
diff --git a/tensorflow/contrib/lite/error_reporter.cc b/tensorflow/contrib/lite/error_reporter.cc
index 03fcd54..646913c 100644
--- a/tensorflow/contrib/lite/error_reporter.cc
+++ b/tensorflow/contrib/lite/error_reporter.cc
@@ -16,6 +16,10 @@
 #include <cstdarg>
 #include <cstdio>
 
+#ifdef __ANDROID__
+#include <android/log.h>
+#endif
+
 namespace tflite {
 
 ErrorReporter::~ErrorReporter() {}
@@ -39,6 +43,15 @@
 }
 
 int StderrReporter::Report(const char* format, va_list args) {
+#ifdef __ANDROID__
+  // On Android stderr is not captured for applications, only for code run from
+  // the shell. Rather than assume all users will set up a custom error
+  // reporter, let's output to logcat here
+  va_list args_for_log;
+  va_copy(args_for_log, args);
+  __android_log_vprint(ANDROID_LOG_ERROR, "tflite", format, args_for_log);
+  va_end(args_for_log);
+#endif
   const int result = vfprintf(stderr, format, args);
   fputc('\n', stderr);
   return result;
diff --git a/tensorflow/contrib/lite/examples/ios/camera/CameraExampleViewController.mm b/tensorflow/contrib/lite/examples/ios/camera/CameraExampleViewController.mm
index d74e275..734b15e 100644
--- a/tensorflow/contrib/lite/examples/ios/camera/CameraExampleViewController.mm
+++ b/tensorflow/contrib/lite/examples/ios/camera/CameraExampleViewController.mm
@@ -26,7 +26,7 @@
 #include "tensorflow/contrib/lite/kernels/register.h"
 #include "tensorflow/contrib/lite/model.h"
 #include "tensorflow/contrib/lite/string_util.h"
-#include "tensorflow/contrib/lite/tools/mutable_op_resolver.h"
+#include "tensorflow/contrib/lite/op_resolver.h"
 
 #define LOG(x) std::cerr
 
@@ -315,7 +315,7 @@
   labelLayers = [[NSMutableArray alloc] init];
   oldPredictionValues = [[NSMutableDictionary alloc] init];
 
-  NSString* graph_path = FilePathForResourceName(model_file_name, @"tflite");
+  NSString* graph_path = FilePathForResourceName(model_file_name, model_file_type);
   model = tflite::FlatBufferModel::BuildFromFile([graph_path UTF8String]);
   if (!model) {
     LOG(FATAL) << "Failed to mmap model " << graph_path;
diff --git a/tensorflow/contrib/lite/examples/ios/camera/Podfile b/tensorflow/contrib/lite/examples/ios/camera/Podfile
index cd8c390..8084307 100644
--- a/tensorflow/contrib/lite/examples/ios/camera/Podfile
+++ b/tensorflow/contrib/lite/examples/ios/camera/Podfile
@@ -2,4 +2,4 @@
 inhibit_all_warnings!
 
 target 'tflite_camera_example'
-       pod 'TensorFlowLite', '0.1.7'
+       pod 'TensorFlowLite', '1.10.0'
diff --git a/tensorflow/contrib/lite/examples/ios/simple/Podfile b/tensorflow/contrib/lite/examples/ios/simple/Podfile
index c885398..eea7ecb 100644
--- a/tensorflow/contrib/lite/examples/ios/simple/Podfile
+++ b/tensorflow/contrib/lite/examples/ios/simple/Podfile
@@ -2,4 +2,4 @@
 inhibit_all_warnings!
 
 target 'tflite_simple_example'
-       pod 'TensorFlowLite', '0.1.7'
+       pod 'TensorFlowLite', '1.10.0'
diff --git a/tensorflow/contrib/lite/examples/ios/simple/RunModelViewController.mm b/tensorflow/contrib/lite/examples/ios/simple/RunModelViewController.mm
index 0ab7aa2..650c73f 100644
--- a/tensorflow/contrib/lite/examples/ios/simple/RunModelViewController.mm
+++ b/tensorflow/contrib/lite/examples/ios/simple/RunModelViewController.mm
@@ -25,7 +25,7 @@
 #include "tensorflow/contrib/lite/kernels/register.h"
 #include "tensorflow/contrib/lite/model.h"
 #include "tensorflow/contrib/lite/string_util.h"
-#include "tensorflow/contrib/lite/tools/mutable_op_resolver.h"
+#include "tensorflow/contrib/lite/op_resolver.h"
 
 #include "ios_image_load.h"
 
diff --git a/tensorflow/contrib/lite/examples/python/BUILD b/tensorflow/contrib/lite/examples/python/BUILD
new file mode 100644
index 0000000..d337c3d
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/python/BUILD
@@ -0,0 +1,13 @@
+licenses(["notice"])  # Apache 2.0
+
+package(default_visibility = ["//tensorflow:internal"])
+
+py_binary(
+    name = "label_image",
+    srcs = ["label_image.py"],
+    main = "label_image.py",
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/lite/python:lite",
+    ],
+)
diff --git a/tensorflow/contrib/lite/examples/python/label_image.md b/tensorflow/contrib/lite/examples/python/label_image.md
new file mode 100644
index 0000000..e81192a
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/python/label_image.md
@@ -0,0 +1,50 @@
+
+With model, input image (grace_hopper.bmp), and labels file (labels.txt)
+in /tmp.
+
+The example input image and labels file are from TensorFlow repo and
+MobileNet V1 model files.
+
+```
+curl https://raw.githubusercontent.com/tensorflow/tensorflow/master/tensorflow/contrib/lite/examples/label_image/testdata/grace_hopper.bmp > /tmp/grace_hopper.bmp
+
+curl  https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_1.0_224_frozen.tgz  | tar xzv -C /tmp  mobilenet_v1_1.0_224/labels.txt
+mv /tmp/mobilenet_v1_1.0_224/labels.txt /tmp/
+
+```
+
+Run
+
+```
+curl http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_224_quant.tgz | tar xzv -C /tmp
+bazel run --config opt //tensorflow/contrib/lite/examples/python:label_image
+```
+
+We can get results like
+
+```
+0.470588: military uniform
+0.337255: Windsor tie
+0.047059: bow tie
+0.031373: mortarboard
+0.019608: suit
+```
+
+Run
+
+```
+curl http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_224.tgz | tar xzv -C /tmp
+bazel run --config opt //tensorflow/contrib/lite/examples/python:label_image \
+-- --model_file /tmp/mobilenet_v1_1.0_224.tflite
+```
+
+We can get results like
+```
+0.728693: military uniform
+0.116163: Windsor tie
+0.035517: bow tie
+0.014874: mortarboard
+0.011758: bolo tie
+```
+
+Check [models](../../g3doc/models.md) for models hosted by Google.
diff --git a/tensorflow/contrib/lite/examples/python/label_image.py b/tensorflow/contrib/lite/examples/python/label_image.py
new file mode 100644
index 0000000..282118a
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/python/label_image.py
@@ -0,0 +1,86 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""label_image for tflite"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import numpy as np
+
+from PIL import Image
+
+from tensorflow.contrib.lite.python import interpreter as interpreter_wrapper
+
+def load_labels(filename):
+  my_labels = []
+  input_file = open(filename, 'r')
+  for l in input_file:
+    my_labels.append(l.strip())
+  return my_labels
+
+if __name__ == "__main__":
+  floating_model = False
+
+  parser = argparse.ArgumentParser()
+  parser.add_argument("-i", "--image", default="/tmp/grace_hopper.bmp", \
+    help="image to be classified")
+  parser.add_argument("-m", "--model_file", \
+    default="/tmp/mobilenet_v1_1.0_224_quant.tflite", \
+    help=".tflite model to be executed")
+  parser.add_argument("-l", "--label_file", default="/tmp/labels.txt", \
+    help="name of file containing labels")
+  parser.add_argument("--input_mean", default=127.5, help="input_mean")
+  parser.add_argument("--input_std", default=127.5, \
+    help="input standard deviation")
+  args = parser.parse_args()
+
+  interpreter = interpreter_wrapper.Interpreter(model_path=args.model_file)
+  interpreter.allocate_tensors()
+
+  input_details = interpreter.get_input_details()
+  output_details = interpreter.get_output_details()
+
+  # check the type of the input tensor
+  if input_details[0]['dtype'] == np.float32:
+    floating_model = True
+
+  # NxHxWxC, H:1, W:2
+  height = input_details[0]['shape'][1]
+  width = input_details[0]['shape'][2]
+  img = Image.open(args.image)
+  img = img.resize((width, height))
+
+  # add N dim
+  input_data = np.expand_dims(img, axis=0)
+
+  if floating_model:
+    input_data = (np.float32(input_data) - args.input_mean) / args.input_std
+
+  interpreter.set_tensor(input_details[0]['index'], input_data)
+
+  interpreter.invoke()
+
+  output_data = interpreter.get_tensor(output_details[0]['index'])
+  results = np.squeeze(output_data)
+
+  top_k = results.argsort()[-5:][::-1]
+  labels = load_labels(args.label_file)
+  for i in top_k:
+    if floating_model:
+      print('{0:08.6f}'.format(float(results[i]))+":", labels[i])
+    else:
+      print('{0:08.6f}'.format(float(results[i]/255.0))+":", labels[i])
diff --git a/tensorflow/contrib/lite/experimental/c/BUILD b/tensorflow/contrib/lite/experimental/c/BUILD
index 50f8da6..8fc07e8 100644
--- a/tensorflow/contrib/lite/experimental/c/BUILD
+++ b/tensorflow/contrib/lite/experimental/c/BUILD
@@ -26,17 +26,33 @@
     }),
     deps = [
         ":c_api",
+        ":c_api_experimental",
         ":exported_symbols.lds",
         ":version_script.lds",
     ],
 )
 
 cc_library(
+    name = "c_api_internal",
+    srcs = ["c_api.h"],
+    hdrs = ["c_api_internal.h"],
+    copts = tflite_copts(),
+    visibility = [
+        "//tensorflow/contrib/lite/experimental/c:__subpackages__",
+    ],
+    deps = [
+        "//tensorflow/contrib/lite:context",
+        "//tensorflow/contrib/lite:framework",
+    ],
+)
+
+cc_library(
     name = "c_api",
     srcs = ["c_api.cc"],
     hdrs = ["c_api.h"],
     copts = tflite_copts(),
     deps = [
+        ":c_api_internal",
         "//tensorflow/contrib/lite:context",
         "//tensorflow/contrib/lite:framework",
         "//tensorflow/contrib/lite:schema_fbs_version",
@@ -44,6 +60,17 @@
     ],
 )
 
+cc_library(
+    name = "c_api_experimental",
+    srcs = ["c_api_experimental.cc"],
+    hdrs = ["c_api_experimental.h"],
+    copts = tflite_copts(),
+    deps = [
+        ":c_api",
+        ":c_api_internal",
+    ],
+)
+
 cc_test(
     name = "c_api_test",
     size = "small",
@@ -51,9 +78,21 @@
     data = ["//tensorflow/contrib/lite:testdata/add.bin"],
     deps = [
         ":c_api",
-        "//tensorflow/contrib/lite:framework",
         "//tensorflow/contrib/lite:kernel_api",
         "//tensorflow/contrib/lite/testing:util",
         "@com_google_googletest//:gtest",
     ],
 )
+
+cc_test(
+    name = "c_api_experimental_test",
+    size = "small",
+    srcs = ["c_api_experimental_test.cc"],
+    data = ["//tensorflow/contrib/lite:testdata/add.bin"],
+    deps = [
+        ":c_api",
+        ":c_api_experimental",
+        "//tensorflow/contrib/lite/testing:util",
+        "@com_google_googletest//:gtest",
+    ],
+)
diff --git a/tensorflow/contrib/lite/experimental/c/c_api.cc b/tensorflow/contrib/lite/experimental/c/c_api.cc
index 9d29e8b..a4ab0e8 100644
--- a/tensorflow/contrib/lite/experimental/c/c_api.cc
+++ b/tensorflow/contrib/lite/experimental/c/c_api.cc
@@ -15,6 +15,7 @@
 #include "tensorflow/contrib/lite/experimental/c/c_api.h"
 
 #include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/experimental/c/c_api_internal.h"
 #include "tensorflow/contrib/lite/interpreter.h"
 #include "tensorflow/contrib/lite/kernels/register.h"
 #include "tensorflow/contrib/lite/model.h"
@@ -23,28 +24,55 @@
 extern "C" {
 #endif  // __cplusplus
 
-struct _TFL_Interpreter {
-  std::unique_ptr<tflite::Interpreter> impl;
-};
-
 // LINT.IfChange
 
-TFL_Interpreter* TFL_NewInterpreter(const void* model_data,
-                                    int32_t model_size) {
+TFL_Model* TFL_NewModel(const void* model_data, size_t model_size) {
   auto model = tflite::FlatBufferModel::BuildFromBuffer(
-      static_cast<const char*>(model_data), static_cast<size_t>(model_size));
-  if (!model) {
+      static_cast<const char*>(model_data), model_size);
+  return model ? new TFL_Model{std::move(model)} : nullptr;
+}
+
+TFL_Model* TFL_NewModelFromFile(const char* model_path) {
+  auto model = tflite::FlatBufferModel::BuildFromFile(model_path);
+  return model ? new TFL_Model{std::move(model)} : nullptr;
+}
+
+void TFL_DeleteModel(TFL_Model* model) { delete model; }
+
+TFL_InterpreterOptions* TFL_NewInterpreterOptions() {
+  return new TFL_InterpreterOptions{};
+}
+
+void TFL_DeleteInterpreterOptions(TFL_InterpreterOptions* options) {
+  delete options;
+}
+
+void TFL_InterpreterOptionsSetNumThreads(TFL_InterpreterOptions* options,
+                                         int32_t num_threads) {
+  options->num_threads = num_threads;
+}
+
+TFL_Interpreter* TFL_NewInterpreter(
+    const TFL_Model* model, const TFL_InterpreterOptions* optional_options) {
+  if (!model || !model->impl) {
     return nullptr;
   }
 
   tflite::ops::builtin::BuiltinOpResolver resolver;
-  tflite::InterpreterBuilder builder(*model, resolver);
-  std::unique_ptr<tflite::Interpreter> interpreter_impl;
-  if (builder(&interpreter_impl) != kTfLiteOk) {
+  tflite::InterpreterBuilder builder(*model->impl, resolver);
+  std::unique_ptr<tflite::Interpreter> interpreter;
+  if (builder(&interpreter) != kTfLiteOk) {
     return nullptr;
   }
 
-  return new TFL_Interpreter{std::move(interpreter_impl)};
+  if (optional_options) {
+    if (optional_options->num_threads !=
+        TFL_InterpreterOptions::kDefaultNumThreads) {
+      interpreter->SetNumThreads(optional_options->num_threads);
+    }
+  }
+
+  return new TFL_Interpreter{std::move(interpreter)};
 }
 
 void TFL_DeleteInterpreter(TFL_Interpreter* interpreter) { delete interpreter; }
@@ -97,9 +125,13 @@
 
 size_t TFL_TensorByteSize(const TFL_Tensor* tensor) { return tensor->bytes; }
 
+void* TFL_TensorData(const TFL_Tensor* tensor) {
+  return static_cast<void*>(tensor->data.raw);
+}
+
 TFL_Status TFL_TensorCopyFromBuffer(TFL_Tensor* tensor, const void* input_data,
-                                    int32_t input_data_size) {
-  if (tensor->bytes != static_cast<size_t>(input_data_size)) {
+                                    size_t input_data_size) {
+  if (tensor->bytes != input_data_size) {
     return kTfLiteError;
   }
   memcpy(tensor->data.raw, input_data, input_data_size);
@@ -107,8 +139,8 @@
 }
 
 TFL_Status TFL_TensorCopyToBuffer(const TFL_Tensor* tensor, void* output_data,
-                                  int32_t output_data_size) {
-  if (tensor->bytes != static_cast<size_t>(output_data_size)) {
+                                  size_t output_data_size) {
+  if (tensor->bytes != output_data_size) {
     return kTfLiteError;
   }
   memcpy(output_data, tensor->data.raw, output_data_size);
diff --git a/tensorflow/contrib/lite/experimental/c/c_api.h b/tensorflow/contrib/lite/experimental/c/c_api.h
index 070f1ad..3757349 100644
--- a/tensorflow/contrib/lite/experimental/c/c_api.h
+++ b/tensorflow/contrib/lite/experimental/c/c_api.h
@@ -30,6 +30,9 @@
 //
 // Conventions:
 // * We use the prefix TFL_ for everything in the API.
+// * size_t is used to represent byte sizes of objects that are
+//   materialized in the address space of the calling process.
+// * int is used as an index into arrays.
 
 #ifdef SWIG
 #define TFL_CAPI_EXPORT
@@ -54,15 +57,50 @@
 typedef TfLiteType TFL_Type;
 
 // --------------------------------------------------------------------------
-// TFL_Interpreter provides inference from a provided model.
-typedef struct _TFL_Interpreter TFL_Interpreter;
+// TFL_Model wraps a loaded TensorFlow Lite model.
+typedef struct TFL_Model TFL_Model;
 
-// Returns an interpreter for the provided model, or null on failure.
+// Returns a model from the provided buffer, or null on failure.
+TFL_CAPI_EXPORT extern TFL_Model* TFL_NewModel(const void* model_data,
+                                               size_t model_size);
+
+// Returns a model from the provided file, or null on failure.
+TFL_CAPI_EXPORT extern TFL_Model* TFL_NewModelFromFile(const char* model_path);
+
+// Destroys the model instance.
+TFL_CAPI_EXPORT extern void TFL_DeleteModel(TFL_Model* model);
+
+// --------------------------------------------------------------------------
+// TFL_InterpreterOptions allows customized interpreter configuration.
+typedef struct TFL_InterpreterOptions TFL_InterpreterOptions;
+
+// Returns a new interpreter options instances.
+TFL_CAPI_EXPORT extern TFL_InterpreterOptions* TFL_NewInterpreterOptions();
+
+// Destroys the interpreter options instance.
+TFL_CAPI_EXPORT extern void TFL_DeleteInterpreterOptions(
+    TFL_InterpreterOptions* options);
+
+// Sets the number of CPU threads to use for the interpreter.
+TFL_CAPI_EXPORT extern void TFL_InterpreterOptionsSetNumThreads(
+    TFL_InterpreterOptions* options, int32_t num_threads);
+
+// --------------------------------------------------------------------------
+// TFL_Interpreter provides inference from a provided model.
+typedef struct TFL_Interpreter TFL_Interpreter;
+
+// Returns a new interpreter using the provided model and options, or null on
+// failure.
+//
+// * `model` must be a valid model instance. The caller retains ownership of the
+//   object, and can destroy it immediately after creating the interpreter.
+// * `optional_options` may be null. The caller retains ownership of the object,
+//   and can safely destroy it immediately after creating the interpreter.
 //
 // NOTE: The client *must* explicitly allocate tensors before attempting to
 // access input tensor data or invoke the interpreter.
 TFL_CAPI_EXPORT extern TFL_Interpreter* TFL_NewInterpreter(
-    const void* model_data, int32_t model_size);
+    const TFL_Model* model, const TFL_InterpreterOptions* optional_options);
 
 // Destroys the interpreter.
 TFL_CAPI_EXPORT extern void TFL_DeleteInterpreter(TFL_Interpreter* interpreter);
@@ -76,7 +114,8 @@
 TFL_CAPI_EXPORT extern TFL_Tensor* TFL_InterpreterGetInputTensor(
     const TFL_Interpreter* interpreter, int32_t input_index);
 
-// Attempts to resize the specified input tensor.
+// Resizes the specified input tensor.
+//
 // NOTE: After a resize, the client *must* explicitly allocate tensors before
 // attempting to access the resized tensor data or invoke the interpreter.
 // REQUIRES: 0 <= input_index < TFL_InterpreterGetInputTensorCount(tensor)
@@ -131,16 +170,24 @@
 // Returns the size of the underlying data in bytes.
 TFL_CAPI_EXPORT extern size_t TFL_TensorByteSize(const TFL_Tensor* tensor);
 
+// Returns a pointer to the underlying data buffer.
+//
+// NOTE: The result may be null if tensors have not yet been allocated, e.g.,
+// if the Tensor has just been created or resized and `TFL_AllocateTensors()`
+// has yet to be called, or if the output tensor is dynamically sized and the
+// interpreter hasn't been invoked.
+TFL_CAPI_EXPORT extern void* TFL_TensorData(const TFL_Tensor* tensor);
+
 // Copies from the provided input buffer into the tensor's buffer.
 // REQUIRES: input_data_size == TFL_TensorByteSize(tensor)
 TFL_CAPI_EXPORT extern TFL_Status TFL_TensorCopyFromBuffer(
-    TFL_Tensor* tensor, const void* input_data, int32_t input_data_size);
+    TFL_Tensor* tensor, const void* input_data, size_t input_data_size);
 
 // Copies to the provided output buffer from the tensor's buffer.
 // REQUIRES: output_data_size == TFL_TensorByteSize(tensor)
 TFL_CAPI_EXPORT extern TFL_Status TFL_TensorCopyToBuffer(
     const TFL_Tensor* output_tensor, void* output_data,
-    int32_t output_data_size);
+    size_t output_data_size);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.h b/tensorflow/contrib/lite/experimental/c/c_api_experimental.cc
similarity index 62%
rename from tensorflow/compiler/xla/client/xla_client/xla_builder.h
rename to tensorflow/contrib/lite/experimental/c/c_api_experimental.cc
index ce2a8af..c4dbc55 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder.h
+++ b/tensorflow/contrib/lite/experimental/c/c_api_experimental.cc
@@ -13,9 +13,19 @@
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_XLA_CLIENT_XLA_BUILDER_H_
-#define TENSORFLOW_COMPILER_XLA_CLIENT_XLA_CLIENT_XLA_BUILDER_H_
+#include "tensorflow/contrib/lite/experimental/c/c_api_experimental.h"
 
-#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/contrib/lite/experimental/c/c_api_internal.h"
 
-#endif  // TENSORFLOW_COMPILER_XLA_CLIENT_XLA_CLIENT_XLA_BUILDER_H_
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+TFL_Status TFL_InterpreterResetVariableTensorsToZero(
+    TFL_Interpreter* interpreter) {
+  return interpreter->impl->ResetVariableTensorsToZero();
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
diff --git a/tensorflow/contrib/lite/experimental/c/c_api_experimental.h b/tensorflow/contrib/lite/experimental/c/c_api_experimental.h
new file mode 100644
index 0000000..b0ac258
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/c/c_api_experimental.h
@@ -0,0 +1,32 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_C_C_API_EXPERIMENTAL_H_
+#define TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_C_C_API_EXPERIMENTAL_H_
+
+#include "tensorflow/contrib/lite/experimental/c/c_api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Resets all variable tensors to zero.
+TFL_CAPI_EXPORT extern TFL_Status TFL_InterpreterResetVariableTensorsToZero(
+    TFL_Interpreter* interpreter);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_C_C_API_EXPERIMENTAL_H_
diff --git a/tensorflow/contrib/lite/experimental/c/c_api_experimental_test.cc b/tensorflow/contrib/lite/experimental/c/c_api_experimental_test.cc
new file mode 100644
index 0000000..db6e525
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/c/c_api_experimental_test.cc
@@ -0,0 +1,46 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/experimental/c/c_api_experimental.h"
+
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/experimental/c/c_api.h"
+#include "tensorflow/contrib/lite/testing/util.h"
+
+namespace {
+
+TEST(CApiExperimentalSimple, Smoke) {
+  TFL_Model* model = TFL_NewModelFromFile(
+      "tensorflow/contrib/lite/testdata/add.bin");
+  ASSERT_NE(model, nullptr);
+
+  TFL_Interpreter* interpreter =
+      TFL_NewInterpreter(model, /*optional_options=*/nullptr);
+  ASSERT_NE(interpreter, nullptr);
+  ASSERT_EQ(TFL_InterpreterAllocateTensors(interpreter), kTfLiteOk);
+
+  EXPECT_EQ(TFL_InterpreterResetVariableTensorsToZero(interpreter), kTfLiteOk);
+
+  TFL_DeleteModel(model);
+  TFL_DeleteInterpreter(interpreter);
+}
+
+}  // namespace
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/experimental/c/c_api_internal.h b/tensorflow/contrib/lite/experimental/c/c_api_internal.h
new file mode 100644
index 0000000..c5c612a
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/c/c_api_internal.h
@@ -0,0 +1,41 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_C_C_API_INTERNAL_H_
+#define TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_C_C_API_INTERNAL_H_
+
+#include "tensorflow/contrib/lite/experimental/c/c_api.h"
+
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/model.h"
+
+// Internal structures used by the C API. These are likely to change and should
+// not be depended on.
+
+struct TFL_Model {
+  std::unique_ptr<tflite::FlatBufferModel> impl;
+};
+
+struct TFL_InterpreterOptions {
+  enum {
+    kDefaultNumThreads = -1,
+  };
+  int num_threads = kDefaultNumThreads;
+};
+
+struct TFL_Interpreter {
+  std::unique_ptr<tflite::Interpreter> impl;
+};
+
+#endif  // TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_C_C_API_INTERNAL_H_
diff --git a/tensorflow/contrib/lite/experimental/c/c_api_test.cc b/tensorflow/contrib/lite/experimental/c/c_api_test.cc
index bc925e0..a631dae 100644
--- a/tensorflow/contrib/lite/experimental/c/c_api_test.cc
+++ b/tensorflow/contrib/lite/experimental/c/c_api_test.cc
@@ -18,22 +18,28 @@
 #include "tensorflow/contrib/lite/experimental/c/c_api.h"
 
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/allocation.h"
 #include "tensorflow/contrib/lite/context.h"
 #include "tensorflow/contrib/lite/testing/util.h"
 
 namespace {
 
 TEST(CApiSimple, Smoke) {
-  tflite::FileCopyAllocation model_file(
-      "tensorflow/contrib/lite/testdata/add.bin",
-      tflite::DefaultErrorReporter());
+  TFL_Model* model = TFL_NewModelFromFile(
+      "tensorflow/contrib/lite/testdata/add.bin");
+  ASSERT_NE(model, nullptr);
 
-  TFL_Interpreter* interpreter =
-      TFL_NewInterpreter(model_file.base(), model_file.bytes());
+  TFL_InterpreterOptions* options = TFL_NewInterpreterOptions();
+  ASSERT_NE(options, nullptr);
+  TFL_InterpreterOptionsSetNumThreads(options, 2);
+
+  TFL_Interpreter* interpreter = TFL_NewInterpreter(model, options);
   ASSERT_NE(interpreter, nullptr);
-  ASSERT_EQ(TFL_InterpreterAllocateTensors(interpreter), kTfLiteOk);
 
+  // The options/model can be deleted immediately after interpreter creation.
+  TFL_DeleteInterpreterOptions(options);
+  TFL_DeleteModel(model);
+
+  ASSERT_EQ(TFL_InterpreterAllocateTensors(interpreter), kTfLiteOk);
   ASSERT_EQ(TFL_InterpreterGetInputTensorCount(interpreter), 1);
   ASSERT_EQ(TFL_InterpreterGetOutputTensorCount(interpreter), 1);
 
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK/Scripts/Interpreter.cs b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK/Scripts/Interpreter.cs
index ab966ba..b6905b5 100644
--- a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK/Scripts/Interpreter.cs
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK/Scripts/Interpreter.cs
@@ -16,6 +16,8 @@
 using System.Runtime.InteropServices;
 
 using TFL_Interpreter = System.IntPtr;
+using TFL_InterpreterOptions = System.IntPtr;
+using TFL_Model = System.IntPtr;
 using TFL_Tensor = System.IntPtr;
 
 namespace TensorFlowLite
@@ -32,7 +34,9 @@
     public Interpreter(byte[] modelData) {
       GCHandle modelDataHandle = GCHandle.Alloc(modelData, GCHandleType.Pinned);
       IntPtr modelDataPtr = modelDataHandle.AddrOfPinnedObject();
-      handle = TFL_NewInterpreter(modelDataPtr, modelData.Length);
+      TFL_Model model = TFL_NewModel(modelDataPtr, modelData.Length);
+      handle = TFL_NewInterpreter(model, /*options=*/IntPtr.Zero);
+      TFL_DeleteModel(model);
       if (handle == IntPtr.Zero) throw new Exception("Failed to create TensorFlowLite Interpreter");
     }
 
@@ -89,9 +93,15 @@
     #region Externs
 
     [DllImport (TensorFlowLibrary)]
+    private static extern unsafe TFL_Model TFL_NewModel(IntPtr model_data, int model_size);
+
+    [DllImport (TensorFlowLibrary)]
+    private static extern unsafe void TFL_DeleteModel(TFL_Model model);
+
+    [DllImport (TensorFlowLibrary)]
     private static extern unsafe TFL_Interpreter TFL_NewInterpreter(
-        IntPtr model_data,
-        int model_size);
+        TFL_Model model,
+        TFL_InterpreterOptions optional_options);
 
     [DllImport (TensorFlowLibrary)]
     private static extern unsafe void TFL_DeleteInterpreter(TFL_Interpreter interpreter);
diff --git a/tensorflow/contrib/lite/experimental/kernels/ctc_beam_search_decoder.cc b/tensorflow/contrib/lite/experimental/kernels/ctc_beam_search_decoder.cc
index 834d1eb..b6c9a28 100644
--- a/tensorflow/contrib/lite/experimental/kernels/ctc_beam_search_decoder.cc
+++ b/tensorflow/contrib/lite/experimental/kernels/ctc_beam_search_decoder.cc
@@ -13,7 +13,7 @@
 limitations under the License.
 ==============================================================================*/
 #include <vector>
-#include "flatbuffers/flexbuffers.h"
+#include "include/flatbuffers/flexbuffers.h"  // flatbuffers
 #include "tensorflow/contrib/lite/context.h"
 #include "tensorflow/contrib/lite/experimental/kernels/ctc_beam_search.h"
 #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
diff --git a/tensorflow/contrib/lite/experimental/kernels/ctc_beam_search_decoder_test.cc b/tensorflow/contrib/lite/experimental/kernels/ctc_beam_search_decoder_test.cc
index 9d1e6a5..0da5532 100644
--- a/tensorflow/contrib/lite/experimental/kernels/ctc_beam_search_decoder_test.cc
+++ b/tensorflow/contrib/lite/experimental/kernels/ctc_beam_search_decoder_test.cc
@@ -18,7 +18,7 @@
 #include <vector>
 
 #include <gtest/gtest.h>
-#include "flatbuffers/flexbuffers.h"
+#include "include/flatbuffers/flexbuffers.h"  // flatbuffers
 #include "tensorflow/contrib/lite/interpreter.h"
 #include "tensorflow/contrib/lite/kernels/register.h"
 #include "tensorflow/contrib/lite/kernels/test_util.h"
diff --git a/tensorflow/contrib/lite/g3doc/rpi.md b/tensorflow/contrib/lite/g3doc/rpi.md
index cdc9172..9fcf79b 100644
--- a/tensorflow/contrib/lite/g3doc/rpi.md
+++ b/tensorflow/contrib/lite/g3doc/rpi.md
@@ -20,7 +20,7 @@
 ```bash
 ./tensorflow/contrib/lite/download_dependencies.sh
 ```
-Note than you only need to to this once.
+Note that you only need to do this once.
 
 You should then be able to compile:
 ```bash
@@ -42,7 +42,7 @@
 ```bash
 ./tensorflow/contrib/lite/download_dependencies.sh
 ```
-Note than you only need to to this once.
+Note that you only need to do this once.
 
 You should then be able to compile:
 ```bash
diff --git a/tensorflow/contrib/lite/interpreter.cc b/tensorflow/contrib/lite/interpreter.cc
index 7a680f5..362e588 100644
--- a/tensorflow/contrib/lite/interpreter.cc
+++ b/tensorflow/contrib/lite/interpreter.cc
@@ -157,7 +157,7 @@
     TfLiteTensor* tensor = &context_.tensors[i];
     if (tensor->buffer_handle != kTfLiteNullBufferHandle &&
         tensor->delegate->FreeBufferHandle != nullptr) {
-      tensor->delegate->FreeBufferHandle(tensor->delegate,
+      tensor->delegate->FreeBufferHandle(&context_, tensor->delegate,
                                          &tensor->buffer_handle);
     }
     TfLiteTensorFree(tensor);
@@ -988,7 +988,7 @@
   tensor->delegate = delegate;
   if (tensor->buffer_handle != kTfLiteNullBufferHandle) {
     TF_LITE_ENSURE(&context_, tensor->delegate->FreeBufferHandle != nullptr);
-    tensor->delegate->FreeBufferHandle(tensor->delegate,
+    tensor->delegate->FreeBufferHandle(&context_, tensor->delegate,
                                        &tensor->buffer_handle);
   }
   tensor->buffer_handle = buffer_handle;
diff --git a/tensorflow/contrib/lite/interpreter.h b/tensorflow/contrib/lite/interpreter.h
index be149a8..7d69aa2 100644
--- a/tensorflow/contrib/lite/interpreter.h
+++ b/tensorflow/contrib/lite/interpreter.h
@@ -165,7 +165,7 @@
     return SetTensorParametersReadOnly(tensor_index, type, name, dims.size(),
                                        dims.data(), quantization, buffer, bytes,
                                        allocation);
-  };
+  }
 
   TfLiteStatus SetTensorParametersReadOnly(
       int tensor_index, TfLiteType type, const char* name, const size_t rank,
@@ -350,7 +350,7 @@
       // This can be null if the delegate doesn't use its own buffer.
       TF_LITE_ENSURE(&context_,
                      tensor->delegate->CopyFromBufferHandle != nullptr);
-      tensor->delegate->CopyFromBufferHandle(tensor->delegate,
+      tensor->delegate->CopyFromBufferHandle(&context_, tensor->delegate,
                                              tensor->buffer_handle,
                                              tensor->data.raw, tensor->bytes);
       tensor->data_is_stale = false;
@@ -413,7 +413,12 @@
     return op_reg.profiling_string(&context_, node);
   }
 
+  // Set the value of an external context.
+  void SetExternalContext(TfLiteExternalContextType type,
+                          TfLiteExternalContext* ctx);
+
  private:
+  friend class InterpreterBuilder;
   friend class InterpreterTest;
 
   // Prevent 'context_' from accessing functions that are only available to
@@ -527,12 +532,13 @@
                                              TfLiteRegistration** registration);
 
   // WARNING: This is an experimental interface that is subject to change.
-  // Gets an TfLiteIntArray* representing the execution plan. The caller owns
-  // this memory and must free it with TfLiteIntArrayFree().
+  // Gets a TfLiteIntArray* representing the execution plan. The interpreter
+  // owns this memory and it is only guaranteed to exist during the invocation
+  // of the delegate prepare.
   TfLiteStatus GetExecutionPlan(TfLiteIntArray** execution_plan);
 
   // WARNING: This is an experimental interface that is subject to change.
-  // Entry point for C node plugin API to get the execution plan
+  // Entry point for C node plugin API to get the execution plan.
   static TfLiteStatus GetExecutionPlan(struct TfLiteContext* context,
                                        TfLiteIntArray** execution_plan);
 
@@ -542,12 +548,30 @@
       struct TfLiteContext* context, TfLiteExternalContextType type);
 
   // Set the value of an external context.
-  void SetExternalContext(TfLiteExternalContextType type,
-                          TfLiteExternalContext* ctx);
   static void SetExternalContext(struct TfLiteContext* context,
                                  TfLiteExternalContextType type,
                                  TfLiteExternalContext* ctx);
 
+  using TfLiteDelegatePtr =
+      std::unique_ptr<TfLiteDelegate, void (*)(TfLiteDelegate*)>;
+
+  // Variant of the public ModifyGraphWithDelegate method that additionally
+  // assumes ownership of the provided delegate.
+  // WARNING: This is an experimental API and subject to change.
+  template <typename Delegate>
+  TfLiteStatus ModifyGraphWithDelegate(std::unique_ptr<Delegate> typed_delegate,
+                                       bool allow_dynamic_tensors = false) {
+    TfLiteDelegatePtr delegate(typed_delegate.release(),
+                               [](TfLiteDelegate* delegate) {
+                                 delete static_cast<Delegate*>(delegate);
+                               });
+    // Note that we retain ownership of the delegate even if graph modification
+    // fails, as delegate use will be in an indeterminate state at that point.
+    owned_delegates_.push_back(std::move(delegate));
+    return ModifyGraphWithDelegate(owned_delegates_.back().get(),
+                                   allow_dynamic_tensors);
+  }
+
   // Ensures that `tensors_` has at least `kTensorsCapacityHeadroom` extra
   // capacity. Calling this function may invalidate existing pointers to
   // tensors. After calling this function, adding `kTensorsCapacityHeadroom`
@@ -627,6 +651,11 @@
   // Whether to delegate to NN API
   std::unique_ptr<NNAPIDelegate> nnapi_delegate_;
 
+  // List of delegates that have been installed and are owned by this
+  // interpreter instance. Useful if client delegate ownership is burdensome.
+  // WARNING: This is an experimental API and subject to change.
+  std::vector<TfLiteDelegatePtr> owned_delegates_;
+
   std::unique_ptr<MemoryPlanner> memory_planner_;
 
   bool allow_buffer_handle_output_ = false;
diff --git a/tensorflow/contrib/lite/interpreter_test.cc b/tensorflow/contrib/lite/interpreter_test.cc
index 2bf598b..5bcf092 100644
--- a/tensorflow/contrib/lite/interpreter_test.cc
+++ b/tensorflow/contrib/lite/interpreter_test.cc
@@ -26,6 +26,13 @@
 
 // InterpreterTest is a friend of Interpreter, so it can access context_.
 class InterpreterTest : public ::testing::Test {
+ public:
+  template <typename Delegate>
+  static TfLiteStatus ModifyGraphWithDelegate(
+      Interpreter* interpreter, std::unique_ptr<Delegate> delegate) {
+    return interpreter->ModifyGraphWithDelegate(std::move(delegate));
+  }
+
  protected:
   TfLiteContext* GetInterpreterContext() { return &interpreter_.context_; }
 
@@ -1080,21 +1087,22 @@
         return kTfLiteOk;
       };
       delegate_.CopyToBufferHandle =
-          [](TfLiteDelegate* delegate, TfLiteBufferHandle buffer_handle,
-             void* data, size_t size) -> TfLiteStatus {
+          [](TfLiteContext* context, TfLiteDelegate* delegate,
+             TfLiteBufferHandle buffer_handle, void* data,
+             size_t size) -> TfLiteStatus {
         // TODO(ycling): Implement tests to test buffer copying logic.
         return kTfLiteOk;
       };
       delegate_.CopyFromBufferHandle =
-          [](TfLiteDelegate* delegate, TfLiteBufferHandle buffer_handle,
-             void* data, size_t size) -> TfLiteStatus {
+          [](TfLiteContext* context, TfLiteDelegate* delegate,
+             TfLiteBufferHandle buffer_handle, void* data,
+             size_t size) -> TfLiteStatus {
         // TODO(ycling): Implement tests to test buffer copying logic.
         return kTfLiteOk;
       };
-      delegate_.FreeBufferHandle = [](TfLiteDelegate* delegate,
-                                      TfLiteBufferHandle* handle) {
-        *handle = kTfLiteNullBufferHandle;
-      };
+      delegate_.FreeBufferHandle =
+          [](TfLiteContext* context, TfLiteDelegate* delegate,
+             TfLiteBufferHandle* handle) { *handle = kTfLiteNullBufferHandle; };
       // Store type-punned data SimpleDelegate structure.
       delegate_.data_ = reinterpret_cast<void*>(this);
     }
@@ -1301,6 +1309,57 @@
   ASSERT_EQ(interpreter_->execution_plan()[0], 1);
 }
 
+TEST(TestDelegateOwnership, ProperlyDisposed) {
+  struct TfLiteInterpreterOwnedDelegate : public TfLiteDelegate {
+    TfLiteInterpreterOwnedDelegate(bool* destroyed, bool* prepared)
+        : destroyed(destroyed), prepared(prepared) {
+      Prepare = [](TfLiteContext*, TfLiteDelegate* delegate) -> TfLiteStatus {
+        *static_cast<TfLiteInterpreterOwnedDelegate*>(delegate)->prepared =
+            true;
+        return kTfLiteOk;
+      };
+    }
+    ~TfLiteInterpreterOwnedDelegate() { *destroyed = true; }
+
+    bool* destroyed;
+    bool* prepared;
+  };
+
+  // Construct a delegate with flags for indicating preparation/destruction.
+  bool destroyed = false;
+  bool prepared = false;
+  std::unique_ptr<TfLiteInterpreterOwnedDelegate> delegate(
+      new TfLiteInterpreterOwnedDelegate(&destroyed, &prepared));
+  {
+    // Create an interpreter and assemble a simple graph.
+    Interpreter interpreter;
+    TfLiteRegistration registration = {nullptr, nullptr, nullptr, nullptr};
+    ASSERT_EQ(interpreter.AddTensors(2), kTfLiteOk);
+    ASSERT_EQ(interpreter.SetInputs({0}), kTfLiteOk);
+    ASSERT_EQ(interpreter.SetOutputs({1}), kTfLiteOk);
+    ASSERT_EQ(interpreter.AddNodeWithParameters({0}, {1}, nullptr, 0, nullptr,
+                                                &registration),
+              kTfLiteOk);
+
+    // Pass delegate ownership to that interpreter.
+    ASSERT_EQ(InterpreterTest::ModifyGraphWithDelegate(&interpreter,
+                                                       std::move(delegate)),
+              kTfLiteOk);
+
+    // The delegate should be prepared as normal, and should be preserved.
+    EXPECT_TRUE(prepared);
+    EXPECT_FALSE(destroyed);
+
+    // Interpreter interaction should not impact the delegate's validity.
+    interpreter.AllocateTensors();
+    interpreter.Invoke();
+    EXPECT_FALSE(destroyed);
+  }
+
+  // Only after the interpreter is destroyed should the delegate be destroyed.
+  EXPECT_TRUE(destroyed);
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/java/demo/.gitignore b/tensorflow/contrib/lite/java/demo/.gitignore
index 39fb081..d245ab6 100644
--- a/tensorflow/contrib/lite/java/demo/.gitignore
+++ b/tensorflow/contrib/lite/java/demo/.gitignore
@@ -1,9 +1,29 @@
+# This file is based on https://github.com/github/gitignore/blob/master/Android.gitignore
 *.iml
+.idea/compiler.xml
+.idea/copyright
+.idea/dictionaries
+.idea/gradle.xml
+.idea/libraries
+.idea/inspectionProfiles
+.idea/misc.xml
+.idea/modules.xml
+.idea/runConfigurations.xml
+.idea/tasks.xml
+.idea/workspace.xml
 .gradle
-/local.properties
-/.idea/workspace.xml
-/.idea/libraries
+local.properties
 .DS_Store
-/build
+build/
+gradleBuild/
+*.apk
+*.ap_
+*.dex
+*.class
+bin/
+gen/
+out/
+*.log
+.navigation/
 /captures
 .externalNativeBuild
diff --git a/tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite/TestHelper.java b/tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite/TestHelper.java
index c23521c..38b7400 100644
--- a/tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite/TestHelper.java
+++ b/tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite/TestHelper.java
@@ -66,6 +66,25 @@
   }
 
   /**
+   * Gets the string name of the data type of an input.
+   *
+   * @param interpreter an instance of {@code Interpreter}. If it is not initialized, an {@code
+   *     IllegalArgumentException} will be thrown.
+   * @param index an integer index of the input. If it is invalid, an {@code
+   *     IllegalArgumentException} will be thrown.
+   * @return string name of the data type. Possible values include "float", "int", "byte", and
+   *     "long".
+   */
+  public static String getInputDataType(Interpreter interpreter, int index) {
+    if (interpreter != null && interpreter.wrapper != null) {
+      return interpreter.wrapper.getInputTensor(index).dataType().toStringName();
+    } else {
+      throw new IllegalArgumentException(
+          "Interpreter has not initialized;" + " Failed to get input data type.");
+    }
+  }
+
+  /**
    * Gets the string name of the data type of an output.
    *
    * @param interpreter an instance of {@code Interpreter}. If it is not initialized, an {@code
diff --git a/tensorflow/contrib/lite/kernels/BUILD b/tensorflow/contrib/lite/kernels/BUILD
index c558647..1f528fd 100644
--- a/tensorflow/contrib/lite/kernels/BUILD
+++ b/tensorflow/contrib/lite/kernels/BUILD
@@ -225,6 +225,7 @@
         "//tensorflow/contrib/lite:builtin_op_data",
         "//tensorflow/contrib/lite:framework",
         "//tensorflow/contrib/lite:string_util",
+        "//tensorflow/contrib/lite:util",
         "//tensorflow/contrib/lite/kernels:gemm_support",
         "//tensorflow/contrib/lite/kernels/internal:audio_utils",
         "//tensorflow/contrib/lite/kernels/internal:kernel_utils",
diff --git a/tensorflow/contrib/lite/kernels/activations.cc b/tensorflow/contrib/lite/kernels/activations.cc
index 817266a..d6d6258 100644
--- a/tensorflow/contrib/lite/kernels/activations.cc
+++ b/tensorflow/contrib/lite/kernels/activations.cc
@@ -40,6 +40,11 @@
   int diff_min = 0;
 };
 
+struct LogSoftmaxOpData : public OpData {
+  int32_t reverse_scaling_divisor = 0;
+  int32_t reverse_scaling_right_shift = 0;
+};
+
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
   // This is a builtin op, so we don't use the contents in 'buffer', if any.
   // Instead, we allocate a new object to carry information from Prepare() to
@@ -47,10 +52,19 @@
   return new OpData;
 }
 
+void* LogSoftmaxInit(TfLiteContext* context, const char* buffer,
+                     size_t length) {
+  return new LogSoftmaxOpData;
+}
+
 void Free(TfLiteContext* context, void* buffer) {
   delete reinterpret_cast<OpData*>(buffer);
 }
 
+void LogSoftmaxFree(TfLiteContext* context, void* buffer) {
+  delete reinterpret_cast<LogSoftmaxOpData*>(buffer);
+}
+
 TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
@@ -205,6 +219,34 @@
                                TfLiteIntArrayCopy(input->dims));
 }
 
+TfLiteStatus LogSoftmaxPrepare(TfLiteContext* context, TfLiteNode* node) {
+  LogSoftmaxOpData* data = reinterpret_cast<LogSoftmaxOpData*>(node->user_data);
+
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+  const TfLiteTensor* input = GetInput(context, node, 0);
+  TfLiteTensor* output = GetOutput(context, node, 0);
+  TF_LITE_ENSURE_EQ(context, input->type, output->type);
+
+  if (input->type == kTfLiteUInt8) {
+    TF_LITE_ENSURE_EQ(context, output->params.zero_point, 255);
+    TF_LITE_ENSURE_EQ(context, output->params.scale, 16.0 / 256);
+
+    static const double kBeta = 1.0;
+    static const int kScaledDiffIntegerBits = 5;
+    tflite::PreprocessLogSoftmaxScalingExp(
+        kBeta, input->params.scale, kScaledDiffIntegerBits,
+        &data->input_multiplier, &data->input_left_shift,
+        &data->reverse_scaling_divisor, &data->reverse_scaling_right_shift);
+    data->reverse_scaling_right_shift *= -1;
+    data->diff_min = -1.0 * tflite::CalculateInputRadius(
+                                kScaledDiffIntegerBits, data->input_left_shift);
+  }
+
+  return context->ResizeTensor(context, output,
+                               TfLiteIntArrayCopy(input->dims));
+}
+
 TfLiteStatus PreluPrepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
@@ -509,6 +551,8 @@
 }
 
 TfLiteStatus LogSoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
+  const LogSoftmaxOpData* data =
+      reinterpret_cast<LogSoftmaxOpData*>(node->user_data);
   const TfLiteTensor* input = GetInput(context, node, 0);
   TfLiteTensor* output = GetOutput(context, node, 0);
   switch (input->type) {
@@ -517,6 +561,14 @@
           GetTensorData<float>(input), GetTensorShape(input),
           GetTensorData<float>(output), GetTensorShape(output));
       return kTfLiteOk;
+    case kTfLiteUInt8:
+      optimized_ops::LogSoftmax(
+          GetTensorData<uint8_t>(input), GetTensorShape(input),
+          data->input_multiplier, data->input_left_shift,
+          data->reverse_scaling_divisor, data->reverse_scaling_right_shift,
+          data->diff_min, GetTensorData<uint8_t>(output),
+          GetTensorShape(output));
+      return kTfLiteOk;
     default:
       context->ReportError(context, "Only float32 supported currently., got %d",
                            input->type);
@@ -590,9 +642,9 @@
 }
 
 TfLiteRegistration* Register_LOG_SOFTMAX() {
-  static TfLiteRegistration r = {activations::Init, activations::Free,
-                                 activations::GenericPrepare,
-                                 activations::LogSoftmaxEval};
+  static TfLiteRegistration r = {
+      activations::LogSoftmaxInit, activations::LogSoftmaxFree,
+      activations::LogSoftmaxPrepare, activations::LogSoftmaxEval};
   return &r;
 }
 
diff --git a/tensorflow/contrib/lite/kernels/activations_test.cc b/tensorflow/contrib/lite/kernels/activations_test.cc
index 083cdf7..e577e3a 100644
--- a/tensorflow/contrib/lite/kernels/activations_test.cc
+++ b/tensorflow/contrib/lite/kernels/activations_test.cc
@@ -471,6 +471,28 @@
                               })));
 }
 
+TEST(QuantizedActivationsOpTest, LogSoftmax) {
+  const float kLogSoftmaxQuantizedTolerance = 16 / 256.0;
+  QuantizedActivationsOpModel m(
+      BuiltinOperator_LOG_SOFTMAX,
+      /*input=*/{TensorType_UINT8, {2, 4}, -10, 10},
+      /*output=*/{TensorType_UINT8, {}, 0, 0, 16. / 256, 255});
+  m.SetInput<uint8_t>({
+      0, -6, 2, 4,   //
+      3, -2, 10, 1,  //
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      -4.14297, -10.14297, -2.14297, -.142971,    //
+                      -7.00104, -12.00104, -.00104087, -9.00104,  //
+                  },
+                  kLogSoftmaxQuantizedTolerance)));
+  EXPECT_THAT(m.GetOutput<uint8_t>(),
+              ElementsAreArray({189, 93, 221, 253, 142, 63, 255, 111}));
+}
+
 class PReluOpModel : public SingleOpModel {
  public:
   PReluOpModel(const TensorData& input, const TensorData& alpha) {
diff --git a/tensorflow/contrib/lite/kernels/audio_spectrogram.cc b/tensorflow/contrib/lite/kernels/audio_spectrogram.cc
index 91d8dd3..fbbe172 100644
--- a/tensorflow/contrib/lite/kernels/audio_spectrogram.cc
+++ b/tensorflow/contrib/lite/kernels/audio_spectrogram.cc
@@ -22,7 +22,7 @@
 #include "tensorflow/contrib/lite/kernels/kernel_util.h"
 #include "tensorflow/contrib/lite/kernels/op_macros.h"
 
-#include "flatbuffers/flexbuffers.h"
+#include "include/flatbuffers/flexbuffers.h"  // flatbuffers
 
 namespace tflite {
 namespace ops {
diff --git a/tensorflow/contrib/lite/kernels/audio_spectrogram_test.cc b/tensorflow/contrib/lite/kernels/audio_spectrogram_test.cc
index 8d460fd..b1e5f4f 100644
--- a/tensorflow/contrib/lite/kernels/audio_spectrogram_test.cc
+++ b/tensorflow/contrib/lite/kernels/audio_spectrogram_test.cc
@@ -18,7 +18,7 @@
 #include <vector>
 
 #include <gtest/gtest.h>
-#include "flatbuffers/flexbuffers.h"
+#include "include/flatbuffers/flexbuffers.h"  // flatbuffers
 #include "tensorflow/contrib/lite/interpreter.h"
 #include "tensorflow/contrib/lite/kernels/register.h"
 #include "tensorflow/contrib/lite/kernels/test_util.h"
diff --git a/tensorflow/contrib/lite/kernels/concatenation.cc b/tensorflow/contrib/lite/kernels/concatenation.cc
index ad211e9..605a20a 100644
--- a/tensorflow/contrib/lite/kernels/concatenation.cc
+++ b/tensorflow/contrib/lite/kernels/concatenation.cc
@@ -57,7 +57,9 @@
   TF_LITE_ENSURE(context, t0->dims->size <= 4);
   TF_LITE_ENSURE_EQ(context, params->activation, kTfLiteActNone);
   TF_LITE_ENSURE(context,
-                 input_type == kTfLiteFloat32 || input_type == kTfLiteUInt8);
+                 input_type == kTfLiteFloat32 || input_type == kTfLiteUInt8 ||
+                     input_type == kTfLiteInt16 || input_type == kTfLiteInt32 ||
+                     input_type == kTfLiteInt64);
 
   // Output dimensions will match input dimensions, except 'axis', which
   // will be the sum of inputs
@@ -121,6 +123,13 @@
         TF_LITE_CONCATENATION(optimized_ops, float);
       }
       break;
+    case kTfLiteInt32:
+      if (kernel_type == kReference) {
+        TF_LITE_CONCATENATION(reference_ops, int32);
+      } else {
+        TF_LITE_CONCATENATION(optimized_ops, int32);
+      }
+      break;
     case kTfLiteUInt8:
       if (kernel_type == kReference) {
         TF_LITE_CONCATENATION_QUANTIZED(reference_ops);
@@ -128,6 +137,14 @@
         TF_LITE_CONCATENATION_QUANTIZED(optimized_ops);
       }
       break;
+    case kTfLiteInt64:
+      if (kernel_type == kReference) {
+        TF_LITE_CONCATENATION(reference_ops, int64_t);
+      } else {
+        TF_LITE_CONCATENATION(optimized_ops, int64_t);
+      }
+      break;
+
     default:
       context->ReportError(context,
                            "Only float32 and uint8 are currently supported.");
diff --git a/tensorflow/contrib/lite/kernels/conv.cc b/tensorflow/contrib/lite/kernels/conv.cc
index 04c0263..50fe5c2 100644
--- a/tensorflow/contrib/lite/kernels/conv.cc
+++ b/tensorflow/contrib/lite/kernels/conv.cc
@@ -334,18 +334,31 @@
   auto filter_offset = -filter->params.zero_point;
   auto output_offset = output->params.zero_point;
 
-  switch (kernel_type) {
+  KernelType effective_kernel_type;
+  if ((kernel_type == kMultithreadOptimized ||
+       kernel_type == kCblasOptimized) &&
+      (params->dilation_width_factor != 1 ||
+       params->dilation_height_factor != 1)) {
+    // kMultithreadOptimized and kCblasOptimized do not support dilation.
+    // Therefore, fallback to optimized.
+    effective_kernel_type = kGenericOptimized;
+  } else {
+    effective_kernel_type = kernel_type;
+  }
+
+  switch (effective_kernel_type) {
     case kReference:
       reference_ops::Conv(
           GetTensorData<uint8_t>(input), GetTensorDims(input), input_offset,
           GetTensorData<uint8_t>(filter), GetTensorDims(filter), filter_offset,
           GetTensorData<int32_t>(bias), GetTensorDims(bias),
-          params->stride_width, params->stride_height, data->padding.width,
-          data->padding.height, output_offset, data->output_multiplier,
-          data->output_shift, data->output_activation_min,
-          data->output_activation_max, GetTensorData<uint8_t>(output),
-          GetTensorDims(output), GetTensorData<uint8_t>(im2col),
-          GetTensorDims(im2col), gemm_context);
+          params->stride_width, params->stride_height,
+          params->dilation_width_factor, params->dilation_height_factor,
+          data->padding.width, data->padding.height, output_offset,
+          data->output_multiplier, data->output_shift,
+          data->output_activation_min, data->output_activation_max,
+          GetTensorData<uint8_t>(output), GetTensorDims(output),
+          GetTensorData<uint8_t>(im2col), GetTensorDims(im2col), gemm_context);
       break;
     case kGenericOptimized:
     case kMultithreadOptimized:
@@ -355,12 +368,13 @@
           GetTensorData<uint8_t>(input), GetTensorDims(input), input_offset,
           GetTensorData<uint8_t>(filter), GetTensorDims(filter), filter_offset,
           GetTensorData<int32_t>(bias), GetTensorDims(bias),
-          params->stride_width, params->stride_height, data->padding.width,
-          data->padding.height, output_offset, data->output_multiplier,
-          data->output_shift, data->output_activation_min,
-          data->output_activation_max, GetTensorData<uint8_t>(output),
-          GetTensorDims(output), GetTensorData<uint8_t>(im2col),
-          GetTensorDims(im2col), gemm_context);
+          params->stride_width, params->stride_height,
+          params->dilation_width_factor, params->dilation_height_factor,
+          data->padding.width, data->padding.height, output_offset,
+          data->output_multiplier, data->output_shift,
+          data->output_activation_min, data->output_activation_max,
+          GetTensorData<uint8_t>(output), GetTensorDims(output),
+          GetTensorData<uint8_t>(im2col), GetTensorDims(im2col), gemm_context);
       break;
   }
 }
@@ -374,10 +388,10 @@
   CalculateActivationRange(params->activation, &output_activation_min,
                            &output_activation_max);
   KernelType effective_kernel_type;
-  if (((kernel_type == kMultithreadOptimized) ||
-       (kernel_type == kCblasOptimized)) &&
-      ((params->dilation_width_factor != 1) ||
-       (params->dilation_height_factor != 1))) {
+  if ((kernel_type == kMultithreadOptimized ||
+       kernel_type == kCblasOptimized) &&
+      (params->dilation_width_factor != 1 ||
+       params->dilation_height_factor != 1)) {
     // kMultithreadOptimized and kCblasOptimized do not support dilation.
     // Therefore, fallback to optimized.
     effective_kernel_type = kGenericOptimized;
diff --git a/tensorflow/contrib/lite/kernels/conv_test.cc b/tensorflow/contrib/lite/kernels/conv_test.cc
index 24633c2..9815204 100644
--- a/tensorflow/contrib/lite/kernels/conv_test.cc
+++ b/tensorflow/contrib/lite/kernels/conv_test.cc
@@ -370,6 +370,65 @@
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({312, 357}));
 }
 
+TEST_P(ConvolutionOpTest, SimpleTestFloatWithDilation) {
+  const int depth = 1;
+  const int image_width = 9;
+  const int image_height = 9;
+  const int image_batch_count = 1;
+  const int filter_size = 3;
+  const int filter_count = 1;
+  const int stride_width = 1;
+  const int stride_height = 1;
+  const int dilation_width_factor = 3;
+  const int dilation_height_factor = 3;
+  const Padding padding = Padding_VALID;
+  ConvolutionOpModel m(
+      GetRegistration(),
+      {TensorType_FLOAT32,
+       {image_batch_count, image_height, image_width, depth}},
+      {TensorType_FLOAT32, {depth, filter_size, filter_size, filter_count}},
+      {TensorType_FLOAT32, {}}, stride_width, stride_height, padding,
+      ActivationFunctionType_NONE, dilation_width_factor,
+      dilation_height_factor);
+
+  // The image matrix is:
+  // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+  // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+  // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+  // | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 |
+  // | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 |
+  // | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 |
+  // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+  // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+  // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+  // clang-format off
+  m.SetInput({0, 0, 0, 0, 0, 0, 0, 0, 0,
+              0, 0, 0, 0, 0, 0, 0, 0, 0,
+              0, 0, 0, 0, 0, 0, 0, 0, 0,
+              0, 0, 0, 1, 1, 1, 0, 0, 0,
+              0, 0, 0, 1, 1, 1, 0, 0, 0,
+              0, 0, 0, 1, 1, 1, 0, 0, 0,
+              0, 0, 0, 0, 0, 0, 0, 0, 0,
+              0, 0, 0, 0, 0, 0, 0, 0, 0,
+              0, 0, 0, 0, 0, 0, 0, 0, 0});
+  // clang-format on
+  // The filter matrix is:
+  // | 1 | 2 | 3 |
+  // | 4 | 5 | 6 |
+  // | 7 | 8 | 9 |
+  m.SetFilter({1, 2, 3, 4, 5, 6, 7, 8, 9});
+  // No bias for this test.
+  m.SetBias({0});
+  m.Invoke();
+
+  // Since the dilation rate is 3 this will reduce the size of the output from
+  // 9x9 to 3x3 of all 5s. Specifically:
+  // | 5 | 5 | 5 |
+  // | 5 | 5 | 5 |
+  // | 5 | 5 | 5 |
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({5, 5, 5, 5, 5, 5, 5, 5, 5}));
+}
+
 class QuantizedConvolutionOpModel : public BaseConvolutionOpModel {
  public:
   using BaseConvolutionOpModel::BaseConvolutionOpModel;
@@ -500,6 +559,71 @@
                              }));
 }
 
+TEST_P(ConvolutionOpTest, SimpleTestQuantizedWithDilation) {
+  const int depth = 1;
+  const int image_width = 9;
+  const int image_height = 9;
+  const int image_batch_count = 1;
+  const int filter_size = 3;
+  const int filter_count = 1;
+  const int stride_width = 1;
+  const int stride_height = 1;
+  const int dilation_width_factor = 3;
+  const int dilation_height_factor = 3;
+  const Padding padding = Padding_VALID;
+  QuantizedConvolutionOpModel m(
+      GetRegistration(),
+      {TensorType_UINT8,
+       {image_batch_count, image_height, image_width, depth},
+       0,
+       255},
+      {TensorType_UINT8,
+       {depth, filter_size, filter_size, filter_count},
+       0,
+       255},
+      {TensorType_UINT8, {}, 0, 255}, stride_width, stride_height, padding,
+      ActivationFunctionType_NONE, dilation_width_factor,
+      dilation_height_factor);
+
+  // The image matrix is:
+  // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+  // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+  // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+  // | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 |
+  // | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 |
+  // | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 |
+  // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+  // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+  // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+  // clang-format off
+  m.SetInput({0, 0, 0, 0, 0, 0, 0, 0, 0,
+              0, 0, 0, 0, 0, 0, 0, 0, 0,
+              0, 0, 0, 0, 0, 0, 0, 0, 0,
+              0, 0, 0, 1, 1, 1, 0, 0, 0,
+              0, 0, 0, 1, 1, 1, 0, 0, 0,
+              0, 0, 0, 1, 1, 1, 0, 0, 0,
+              0, 0, 0, 0, 0, 0, 0, 0, 0,
+              0, 0, 0, 0, 0, 0, 0, 0, 0,
+              0, 0, 0, 0, 0, 0, 0, 0, 0});
+  // clang-format on
+  // The filter matrix is:
+  // | 1 | 2 | 3 |
+  // | 4 | 5 | 6 |
+  // | 7 | 8 | 9 |
+  m.SetFilter({1, 2, 3, 4, 5, 6, 7, 8, 9});
+  // No bias for this test.
+  m.SetBias({0});
+  m.Invoke();
+
+  // Since the dilation rate is 3 this will reduce the size of the output from
+  // 9x9 to 3x3 of all 5s. Specifically:
+  // | 5 | 5 | 5 |
+  // | 5 | 5 | 5 |
+  // | 5 | 5 | 5 |
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray({5, 5, 5, 5, 5, 5, 5, 5, 5}));
+}
+
 INSTANTIATE_TEST_CASE_P(
     ConvolutionOpTest, ConvolutionOpTest,
     ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap)));
diff --git a/tensorflow/contrib/lite/kernels/dequantize.cc b/tensorflow/contrib/lite/kernels/dequantize.cc
index 672b217..2b0f044 100644
--- a/tensorflow/contrib/lite/kernels/dequantize.cc
+++ b/tensorflow/contrib/lite/kernels/dequantize.cc
@@ -36,6 +36,21 @@
   TfLiteTensor* output;
 };
 
+struct OpData {
+  // This boolean value is only used when the input tensor is constant.
+  bool float_dequantized_weights_initialized;
+};
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  auto* op_data = new OpData();
+  op_data->float_dequantized_weights_initialized = false;
+  return op_data;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+  delete reinterpret_cast<OpData*>(buffer);
+}
+
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
@@ -45,12 +60,22 @@
   TF_LITE_ENSURE(context, op_context.input->type == kTfLiteUInt8);
 
   op_context.output->type = kTfLiteFloat32;
+  // If the input tensor is constant, we can persist the dequantized value in
+  // the output tensor. Otherwise we run dequantize upon each eval.
+  if (IsConstantTensor(op_context.input)) {
+    op_context.output->allocation_type = kTfLiteArenaRwPersistent;
+  }
   return context->ResizeTensor(context, op_context.output,
                                TfLiteIntArrayCopy(op_context.input->dims));
 }
 
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  OpData* op_data = reinterpret_cast<OpData*>(node->user_data);
   OpContext op_context(context, node);
+  if (IsConstantTensor(op_context.input) &&
+      op_data->float_dequantized_weights_initialized) {
+    return kTfLiteOk;
+  }
 
   auto zero_point = op_context.input->params.zero_point;
   auto scale = op_context.input->params.scale;
@@ -59,14 +84,19 @@
                             GetTensorDims(op_context.input), zero_point, scale,
                             GetTensorData<float>(op_context.output),
                             GetTensorDims(op_context.output));
+
+  if (IsConstantTensor(op_context.input)) {
+    op_data->float_dequantized_weights_initialized = true;
+  }
+
   return kTfLiteOk;
 }
 
 }  // namespace dequantize
 
 TfLiteRegistration* Register_DEQUANTIZE_OPT() {
-  static TfLiteRegistration r = {nullptr, nullptr, dequantize::Prepare,
-                                 dequantize::Eval};
+  static TfLiteRegistration r = {dequantize::Init, dequantize::Free,
+                                 dequantize::Prepare, dequantize::Eval};
   return &r;
 }
 
diff --git a/tensorflow/contrib/lite/kernels/detection_postprocess.cc b/tensorflow/contrib/lite/kernels/detection_postprocess.cc
index d7bde0f..211d43a 100644
--- a/tensorflow/contrib/lite/kernels/detection_postprocess.cc
+++ b/tensorflow/contrib/lite/kernels/detection_postprocess.cc
@@ -15,7 +15,7 @@
 #include <string.h>
 #include <numeric>
 #include <vector>
-#include "flatbuffers/flexbuffers.h"
+#include "include/flatbuffers/flexbuffers.h"  // flatbuffers
 #include "tensorflow/contrib/lite/builtin_op_data.h"
 #include "tensorflow/contrib/lite/context.h"
 #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
diff --git a/tensorflow/contrib/lite/kernels/detection_postprocess_test.cc b/tensorflow/contrib/lite/kernels/detection_postprocess_test.cc
index 4e0f848..fe90e5d 100644
--- a/tensorflow/contrib/lite/kernels/detection_postprocess_test.cc
+++ b/tensorflow/contrib/lite/kernels/detection_postprocess_test.cc
@@ -17,7 +17,7 @@
 #include <vector>
 
 #include <gtest/gtest.h>
-#include "flatbuffers/flexbuffers.h"
+#include "include/flatbuffers/flexbuffers.h"  // flatbuffers
 #include "tensorflow/contrib/lite/interpreter.h"
 #include "tensorflow/contrib/lite/kernels/register.h"
 #include "tensorflow/contrib/lite/kernels/test_util.h"
diff --git a/tensorflow/contrib/lite/kernels/fully_connected.cc b/tensorflow/contrib/lite/kernels/fully_connected.cc
index bc37060..eaf5a67 100644
--- a/tensorflow/contrib/lite/kernels/fully_connected.cc
+++ b/tensorflow/contrib/lite/kernels/fully_connected.cc
@@ -121,10 +121,9 @@
     double real_multiplier = 0.0;
     TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler(
         context, input, filter, bias, output, &real_multiplier));
-    TF_LITE_ENSURE(context, real_multiplier < 1.0);
-    QuantizeMultiplierSmallerThanOneExp(
-        real_multiplier, &data->output_multiplier, &data->output_shift);
-    data->output_shift *= -1;
+    int exponent;
+    QuantizeMultiplier(real_multiplier, &data->output_multiplier, &exponent);
+    data->output_shift = -exponent;
     TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized(
         context, params->activation, output, &data->output_activation_min,
         &data->output_activation_max));
diff --git a/tensorflow/contrib/lite/kernels/fully_connected_test.cc b/tensorflow/contrib/lite/kernels/fully_connected_test.cc
index ec94905..08b4320 100644
--- a/tensorflow/contrib/lite/kernels/fully_connected_test.cc
+++ b/tensorflow/contrib/lite/kernels/fully_connected_test.cc
@@ -423,6 +423,37 @@
               ElementsAre(151, 152, 153, 185, 186, 187));
 }
 
+TEST_P(QuantizedFullyConnectedOpTest,
+       SimpleTestQuantizedOutputMultiplierGreaterThan1) {
+  // real_multiplier = 2.
+  QuantizedFullyConnectedOpModel m(
+      GetRegistration(), /*units=*/3, /*batches*/ 2,
+      /*input=*/{TensorType_UINT8, {2, 10}, -127, 128},
+      /*output=*/{TensorType_UINT8, {}, -63.5, 64});
+
+  m.SetWeights({
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 2
+  });
+  m.SetBias({1, 2, 3});
+
+  m.SetInput({
+      1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // b = 0
+      1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // b = 1
+  });
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
+              ElementsAreArray(ArrayFloatNear({
+                  24, 25, 26,  // first batch
+                  58, 59, 60,  // second batch
+              })));
+  EXPECT_THAT(m.GetOutput<uint8_t>(),
+              ElementsAre(175, 177, 179, 243, 245, 247));
+}
+
 void SimpleTestQuantizedInt16OutputCase(
     TfLiteRegistration* registration, int input_depth, int output_depth,
     int batches, FullyConnectedOptionsWeightsFormat weights_format) {
@@ -631,6 +662,37 @@
               ElementsAre(151, 152, 153, 185, 186, 187));
 }
 
+TEST_P(QuantizedFullyConnectedOpTest,
+       SimpleTest4dInputQuantizedOutputMultiplierGreaterThan1) {
+  // real_multiplier = 2.
+  QuantizedFullyConnectedOpModel m(
+      GetRegistration(), /*units=*/3, /*batches=*/2,
+      /*input=*/{TensorType_UINT8, {4, 1, 5, 1}, -127, 128},
+      /*output=*/{TensorType_UINT8, {}, -63.5, 64});
+
+  m.SetWeights({
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 2
+  });
+  m.SetBias({1, 2, 3});
+
+  m.SetInput({
+      1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // b = 0
+      1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // b = 1
+  });
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
+              ElementsAreArray(ArrayFloatNear({
+                  24, 25, 26,  // first batch
+                  58, 59, 60,  // second batch
+              })));
+  EXPECT_THAT(m.GetOutput<uint8_t>(),
+              ElementsAre(175, 177, 179, 243, 245, 247));
+}
+
 INSTANTIATE_TEST_CASE_P(
     FloatFullyConnectedOpTest, FloatFullyConnectedOpTest,
     ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap)));
diff --git a/tensorflow/contrib/lite/kernels/internal/BUILD b/tensorflow/contrib/lite/kernels/internal/BUILD
index 0d42407..a97db6c 100644
--- a/tensorflow/contrib/lite/kernels/internal/BUILD
+++ b/tensorflow/contrib/lite/kernels/internal/BUILD
@@ -496,6 +496,7 @@
     hdrs = ["test_util.h"],
     deps = [
         ":types",
+        "//tensorflow/contrib/lite:string",
     ],
 )
 
@@ -538,7 +539,10 @@
 cc_test(
     name = "depthwiseconv_quantized_test",
     srcs = ["depthwiseconv_quantized_test.cc"],
-    tags = ["no_oss"],
+    tags = [
+        "no_oss",
+        "tflite_not_portable_ios",
+    ],
     deps = [
         ":optimized_base",
         ":reference_base",
@@ -576,6 +580,7 @@
         ":quantization_util",
         ":reference_base",
         ":test_util",
+        "//tensorflow/contrib/lite:string",
         "@com_google_googletest//:gtest_main",
     ],
 )
@@ -595,6 +600,7 @@
         ":quantization_util",
         ":reference_base",
         ":test_util",
+        "//tensorflow/contrib/lite:string",
         "@com_google_googletest//:gtest_main",
     ],
 )
@@ -606,6 +612,7 @@
     deps = [
         ":optimized_base",
         ":reference_base",
+        "//tensorflow/contrib/lite:string",
         "@com_google_googletest//:gtest_main",
     ],
 )
diff --git a/tensorflow/contrib/lite/kernels/internal/log_quantized_test.cc b/tensorflow/contrib/lite/kernels/internal/log_quantized_test.cc
index 7e9ff52..8963abb 100644
--- a/tensorflow/contrib/lite/kernels/internal/log_quantized_test.cc
+++ b/tensorflow/contrib/lite/kernels/internal/log_quantized_test.cc
@@ -29,8 +29,9 @@
 #include <gtest/gtest.h>
 #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
 #include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/string.h"
 
-namespace {
+namespace tflite {
 
 class NumberGenerator {
  public:
@@ -330,4 +331,4 @@
                              &generator_);
 }
 
-}  // namespace
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/internal/logsoftmax_quantized_test.cc b/tensorflow/contrib/lite/kernels/internal/logsoftmax_quantized_test.cc
index d2f1103..3624c20 100644
--- a/tensorflow/contrib/lite/kernels/internal/logsoftmax_quantized_test.cc
+++ b/tensorflow/contrib/lite/kernels/internal/logsoftmax_quantized_test.cc
@@ -27,6 +27,7 @@
 #include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
 #include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
 #include "tensorflow/contrib/lite/kernels/internal/test_util.h"
+#include "tensorflow/contrib/lite/string.h"
 
 namespace tflite {
 namespace {
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/legacy_optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/legacy_optimized_ops.h
index d550307..7f0676b 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/legacy_optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/legacy_optimized_ops.h
@@ -30,11 +30,6 @@
 using reference_ops::Relu1;
 using reference_ops::Relu6;
 
-inline RuntimeShape DimsToShape(const tflite::Dims<4>& dims) {
-  return RuntimeShape(
-      {dims.sizes[3], dims.sizes[2], dims.sizes[1], dims.sizes[0]});
-}
-
 template <FusedActivationFunctionType Ac>
 void L2Normalization(const float* input_data, const Dims<4>& input_dims,
                      float* output_data, const Dims<4>& output_dims) {
@@ -294,6 +289,37 @@
       output_data);
 }
 
+inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
+                         int32 input1_offset, const uint8* input2_data,
+                         const Dims<4>& input2_dims, int32 input2_offset,
+                         int32 output_offset, int32 output_multiplier,
+                         int output_shift, int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         const Dims<4>& output_dims) {
+  BroadcastMul4DSlow(
+      input1_data, input1_dims, input1_offset, input2_data, input2_dims,
+      input2_offset, output_offset, output_multiplier,
+      // This legacy version switches the sign of the output shift.
+      kReverseShift * output_shift,
+      // (Break to highlight preceding line.)
+      output_activation_min, output_activation_max, output_data, output_dims);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
+                         int32 input1_offset, const uint8* input2_data,
+                         const Dims<4>& input2_dims, int32 input2_offset,
+                         int32 output_offset, int32 output_multiplier,
+                         int output_shift, int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         const Dims<4>& output_dims) {
+  BroadcastMul(input1_data, input1_dims, input1_offset, input2_data,
+               input2_dims, input2_offset, output_offset, output_multiplier,
+               output_shift, output_activation_min, output_activation_max,
+               output_data, output_dims);
+}
+
 inline void AveragePool(const float* input_data, const Dims<4>& input_dims,
                         int stride_width, int stride_height, int pad_width,
                         int pad_height, int kwidth, int kheight,
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index 6adb879..ca02021 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -47,6 +47,7 @@
 using reference_ops::BroadcastGreaterEqual;
 using reference_ops::BroadcastLess;
 using reference_ops::BroadcastLessEqual;
+using reference_ops::BroadcastMul4DSlow;
 using reference_ops::BroadcastSub4DSlow;
 using reference_ops::Concatenation;
 using reference_ops::DepthConcatenation;
@@ -75,6 +76,11 @@
 // Used mainly to convert from old-style shifts (right) to new-style (left).
 static constexpr int kReverseShift = -1;
 
+inline RuntimeShape DimsToShape(const tflite::Dims<4>& dims) {
+  return RuntimeShape(
+      {dims.sizes[3], dims.sizes[2], dims.sizes[1], dims.sizes[0]});
+}
+
 // Make a local VectorMap typedef allowing to map a float array
 // as a Eigen vector expression. The std::conditional here is to
 // construct the suitable Eigen type for the constness of the
@@ -893,6 +899,7 @@
   const int input_size = FlatSizeSkipDim(input_dims, 3);
   const int output_size = MatchingArraySize(filter_dims, 1, output_dims, 0);
   static constexpr int kPeel = 4;
+  const bool shift_left = (output_shift <= 0);
   for (int k = 0; k < input_size; k += 64) {
     optimized_ops_preload_l1_stream(input_data + k);
   }
@@ -1004,11 +1011,17 @@
     int32x4_t bias_vec = vld1q_s32(bias_ptr);
     bias_ptr += 4;
     reduced = vaddq_s32(reduced, bias_vec);
-    // Multiply by the fixed-point multiplier.
-    reduced = vqrdmulhq_n_s32(reduced, output_multiplier);
-    // Rounding-shift-right.
-    using gemmlowp::RoundingDivideByPOT;
-    reduced = RoundingDivideByPOT(reduced, output_shift);
+    if (shift_left) {
+      const int32 multiplier_power_of_two = 1 << -output_shift;
+      reduced = vmulq_n_s32(reduced, multiplier_power_of_two);
+      reduced = vqrdmulhq_n_s32(reduced, output_multiplier);
+    } else {
+      // Multiply by the fixed-point multiplier.
+      reduced = vqrdmulhq_n_s32(reduced, output_multiplier);
+      // Rounding-shift-right.
+      using gemmlowp::RoundingDivideByPOT;
+      reduced = RoundingDivideByPOT(reduced, output_shift);
+    }
     // Add the output offset.
     const int32x4_t output_offset_vec = vdupq_n_s32(output_offset);
     reduced = vaddq_s32(reduced, output_offset_vec);
@@ -1971,12 +1984,12 @@
                  int32 input_offset, const uint8* filter_data,
                  const Dims<4>& filter_dims, int32 filter_offset,
                  const int32* bias_data, const Dims<4>& bias_dims,
-                 int stride_width, int stride_height, int pad_width,
-                 int pad_height, int32 output_offset, int32 output_multiplier,
-                 int output_shift, int32 output_activation_min,
-                 int32 output_activation_max, uint8* output_data,
-                 const Dims<4>& output_dims, uint8* im2col_data,
-                 const Dims<4>& im2col_dims,
+                 int stride_width, int stride_height, int dilation_width_factor,
+                 int dilation_height_factor, int pad_width, int pad_height,
+                 int32 output_offset, int32 output_multiplier, int output_shift,
+                 int32 output_activation_min, int32 output_activation_max,
+                 uint8* output_data, const Dims<4>& output_dims,
+                 uint8* im2col_data, const Dims<4>& im2col_dims,
                  gemmlowp::GemmContext* gemm_context) {
   gemmlowp::ScopedProfilingLabel label("Conv/8bit");
 
@@ -1988,9 +2001,22 @@
   const Dims<4>* gemm_input_dims = nullptr;
   const int filter_width = ArraySize(filter_dims, 1);
   const int filter_height = ArraySize(filter_dims, 2);
+  const bool need_dilated_im2col =
+      dilation_width_factor != 1 || dilation_height_factor != 1;
   const bool need_im2col = stride_width != 1 || stride_height != 1 ||
                            filter_width != 1 || filter_height != 1;
-  if (need_im2col) {
+  if (need_dilated_im2col) {
+    TFLITE_DCHECK(im2col_data);
+    const int input_zero_point = -input_offset;
+    TFLITE_DCHECK_GE(input_zero_point, 0);
+    TFLITE_DCHECK_LE(input_zero_point, 255);
+    DilatedIm2col(input_data, input_dims, filter_dims, stride_width,
+                  stride_height, dilation_width_factor, dilation_height_factor,
+                  pad_width, pad_height, output_dims, input_zero_point,
+                  im2col_data);
+    gemm_input_data = im2col_data;
+    gemm_input_dims = &im2col_dims;
+  } else if (need_im2col) {
     TFLITE_DCHECK(im2col_data);
     const int input_zero_point = -input_offset;
     TFLITE_DCHECK_GE(input_zero_point, 0);
@@ -2046,6 +2072,24 @@
       input_offset, output_pipeline);
 }
 
+inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
+                 int32 input_offset, const uint8* filter_data,
+                 const Dims<4>& filter_dims, int32 filter_offset,
+                 const int32* bias_data, const Dims<4>& bias_dims,
+                 int stride_width, int stride_height, int pad_width,
+                 int pad_height, int32 output_offset, int32 output_multiplier,
+                 int output_shift, int32 output_activation_min,
+                 int32 output_activation_max, uint8* output_data,
+                 const Dims<4>& output_dims, uint8* im2col_data,
+                 const Dims<4>& im2col_dims,
+                 gemmlowp::GemmContext* gemm_context) {
+  Conv(input_data, input_dims, input_offset, filter_data, filter_dims,
+       filter_offset, bias_data, bias_dims, stride_width, stride_height, 1, 1,
+       pad_width, pad_height, output_offset, output_multiplier, output_shift,
+       output_activation_min, output_activation_max, output_data, output_dims,
+       im2col_data, im2col_dims, gemm_context);
+}
+
 // legacy, for compatibility with old checked-in code
 template <FusedActivationFunctionType Ac>
 inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
@@ -2897,66 +2941,223 @@
                output_dims);
 }
 
-inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
-                         int32 input1_offset, const uint8* input2_data,
-                         const Dims<4>& input2_dims, int32 input2_offset,
-                         int32 output_offset, int32 output_multiplier,
-                         int output_shift, int32 output_activation_min,
-                         int32 output_activation_max, uint8* output_data,
-                         const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("BroadcastMul/8bit");
// Element-wise mul that can often be used for inner loop of broadcast Mul as
// well as the non-broadcast Mul.
//
// Computes, per element i:
//   out[i] = clamp(output_offset +
//                  rescale((in1[i] + input1_offset) * (in2[i] + input2_offset)))
// where rescale applies the quantized output_multiplier / output_shift pair,
// and clamp uses params.quantized_activation_{min,max}.
inline void MulElementwise(int size, const ArithmeticParams& params,
                           const uint8* input1_data, const uint8* input2_data,
                           uint8* output_data) {
  int i = 0;
  // Offsets must fit in int16 arithmetic below: a uint8 value plus an offset
  // in (-256, 256) stays within int16 range after widening.
  TFLITE_DCHECK_GT(params.input1_offset, -256);
  TFLITE_DCHECK_LT(params.input1_offset, 256);
  TFLITE_DCHECK_GT(params.input2_offset, -256);
  TFLITE_DCHECK_LT(params.input2_offset, 256);
  TFLITE_DCHECK_GT(params.output_offset, -256);
  TFLITE_DCHECK_LT(params.output_offset, 256);
#ifdef USE_NEON
  const auto input1_offset_vector = vdupq_n_s16(params.input1_offset);
  const auto input2_offset_vector = vdupq_n_s16(params.input2_offset);
  const auto output_offset_vector = vdupq_n_s16(params.output_offset);
  const auto output_activation_min_vector =
      vdup_n_u8(params.quantized_activation_min);
  const auto output_activation_max_vector =
      vdup_n_u8(params.quantized_activation_max);
  for (; i <= size - 8; i += 8) {
    // We load / store 8 at a time, multiplying as two sets of 4 int32s.
    const auto input1_val_original = vld1_u8(input1_data + i);
    const auto input2_val_original = vld1_u8(input2_data + i);
    // Widen uint8 -> int16 so the offset addition cannot overflow.
    const auto input1_val_s16 =
        vreinterpretq_s16_u16(vmovl_u8(input1_val_original));
    const auto input2_val_s16 =
        vreinterpretq_s16_u16(vmovl_u8(input2_val_original));
    const auto input1_val = vaddq_s16(input1_val_s16, input1_offset_vector);
    const auto input2_val = vaddq_s16(input2_val_s16, input2_offset_vector);

    const auto input1_val_low = vget_low_s16(input1_val);
    const auto input1_val_high = vget_high_s16(input1_val);
    const auto input2_val_low = vget_low_s16(input2_val);
    const auto input2_val_high = vget_high_s16(input2_val);

    // int16 x int16 -> int32 long multiplies, 4 lanes each.
    auto p1 = vmull_s16(input2_val_low, input1_val_low);
    auto p2 = vmull_s16(input2_val_high, input1_val_high);

    // Fixed-point rescale: saturating rounding doubling high-half multiply,
    // then a rounding right shift by -output_shift (output_shift <= 0 here).
    p1 = vqrdmulhq_n_s32(p1, params.output_multiplier);
    p2 = vqrdmulhq_n_s32(p2, params.output_multiplier);
    using gemmlowp::RoundingDivideByPOT;
    p1 = RoundingDivideByPOT(p1, -params.output_shift);
    p2 = RoundingDivideByPOT(p2, -params.output_shift);

    // Narrow back to int16, add the output zero point, saturate to uint8 and
    // clamp to the quantized activation range.
    const auto p1_narrowed = vmovn_s32(p1);
    const auto p2_narrowed = vmovn_s32(p2);
    const auto p =
        vaddq_s16(vcombine_s16(p1_narrowed, p2_narrowed), output_offset_vector);
    const auto clamped =
        vmax_u8(output_activation_min_vector,
                vmin_u8(output_activation_max_vector, vqmovun_s16(p)));
    vst1_u8(output_data + i, clamped);
  }
#endif  // NEON

  // Scalar tail (and full loop when NEON is unavailable); must match the
  // vector path's arithmetic exactly.
  for (; i < size; ++i) {
    const int32 input1_val = params.input1_offset + input1_data[i];
    const int32 input2_val = params.input2_offset + input2_data[i];
    const int32 unclamped_result =
        params.output_offset +
        MultiplyByQuantizedMultiplierSmallerThanOneExp(input1_val * input2_val,
                                                       params.output_multiplier,
                                                       params.output_shift);
    const int32 clamped_output =
        std::min(params.quantized_activation_max,
                 std::max(params.quantized_activation_min, unclamped_result));
    output_data[i] = static_cast<uint8>(clamped_output);
  }
}
 
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac>
-inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
-                         int32 input1_offset, const uint8* input2_data,
-                         const Dims<4>& input2_dims, int32 input2_offset,
-                         int32 output_offset, int32 output_multiplier,
-                         int output_shift, int32 output_activation_min,
-                         int32 output_activation_max, uint8* output_data,
-                         const Dims<4>& output_dims) {
-  BroadcastMul(input1_data, input1_dims, input1_offset, input2_data,
-               input2_dims, input2_offset, output_offset, output_multiplier,
-               output_shift, output_activation_min, output_activation_max,
-               output_data, output_dims);
// Broadcast mul that can often be used for inner loop of broadcast Mul.
// Multiplies a run of input2 values by a single broadcast scalar from input1:
//   out[i] = clamp(output_offset +
//                  rescale((broadcast_value + input1_offset) *
//                          (in2[i] + input2_offset)))
inline void MulSimpleBroadcast(int size, const ArithmeticParams& params,
                               const uint8 broadcast_value,
                               const uint8* input2_data, uint8* output_data) {
  // Offset the scalar once up front; it is loop-invariant.
  const int16 input1_val = params.input1_offset + broadcast_value;

  int i = 0;
  // Offsets must stay in (-256, 256) so the widened int16 math cannot
  // overflow (same invariant as MulElementwise).
  TFLITE_DCHECK_GT(params.input1_offset, -256);
  TFLITE_DCHECK_LT(params.input1_offset, 256);
  TFLITE_DCHECK_GT(params.input2_offset, -256);
  TFLITE_DCHECK_LT(params.input2_offset, 256);
  TFLITE_DCHECK_GT(params.output_offset, -256);
  TFLITE_DCHECK_LT(params.output_offset, 256);
#ifdef USE_NEON
  const auto input2_offset_vector = vdupq_n_s16(params.input2_offset);
  const auto output_offset_vector = vdupq_n_s16(params.output_offset);
  const auto output_activation_min_vector =
      vdup_n_u8(params.quantized_activation_min);
  const auto output_activation_max_vector =
      vdup_n_u8(params.quantized_activation_max);
  for (; i <= size - 8; i += 8) {
    // We load / store 8 at a time, multiplying as two sets of 4 int32s.
    const auto input2_val_original = vld1_u8(input2_data + i);
    const auto input2_val_s16 =
        vreinterpretq_s16_u16(vmovl_u8(input2_val_original));
    const auto input2_val = vaddq_s16(input2_val_s16, input2_offset_vector);

    const auto input2_val_low = vget_low_s16(input2_val);
    const auto input2_val_high = vget_high_s16(input2_val);

    // Vector-by-scalar long multiply: 4 int32 products per half.
    auto p1 = vmull_n_s16(input2_val_low, input1_val);
    auto p2 = vmull_n_s16(input2_val_high, input1_val);

    // Fixed-point rescale, identical to MulElementwise's vector path.
    p1 = vqrdmulhq_n_s32(p1, params.output_multiplier);
    p2 = vqrdmulhq_n_s32(p2, params.output_multiplier);
    using gemmlowp::RoundingDivideByPOT;
    p1 = RoundingDivideByPOT(p1, -params.output_shift);
    p2 = RoundingDivideByPOT(p2, -params.output_shift);

    const auto p1_narrowed = vmovn_s32(p1);
    const auto p2_narrowed = vmovn_s32(p2);
    const auto p =
        vaddq_s16(vcombine_s16(p1_narrowed, p2_narrowed), output_offset_vector);
    const auto clamped =
        vmax_u8(output_activation_min_vector,
                vmin_u8(output_activation_max_vector, vqmovun_s16(p)));
    vst1_u8(output_data + i, clamped);
  }
#endif  // NEON

  // Scalar tail / fallback; mirrors the vector path's arithmetic.
  for (; i < size; ++i) {
    const int32 input2_val = params.input2_offset + input2_data[i];
    const int32 unclamped_result =
        params.output_offset +
        MultiplyByQuantizedMultiplierSmallerThanOneExp(input1_val * input2_val,
                                                       params.output_multiplier,
                                                       params.output_shift);
    const int32 clamped_output =
        std::min(params.quantized_activation_max,
                 std::max(params.quantized_activation_min, unclamped_result));
    output_data[i] = static_cast<uint8>(clamped_output);
  }
}
+
+inline void Mul(const ArithmeticParams& params,
+                const RuntimeShape& input1_shape, const uint8* input1_data,
+                const RuntimeShape& input2_shape, const uint8* input2_data,
+                const RuntimeShape& output_shape, uint8* output_data) {
+  TFLITE_DCHECK_LE(params.quantized_activation_min,
+                   params.quantized_activation_max);
+  gemmlowp::ScopedProfilingLabel label("Mul/8bit");
+  const int flat_size =
+      MatchingFlatSize(input1_shape, input2_shape, output_shape);
+
+  MulElementwise(flat_size, params, input1_data, input2_data, output_data);
+}
+
// Broadcast Mul specialized for the "fivefold" broadcast pattern, where the
// two input shapes interleave as five alternating runs (broadcast_shape[0..4]).
// Because Mul is commutative, when the *second* input is the fast-broadcasting
// one we simply swap the roles of the inputs (and their offsets) and reuse the
// same loop structure.
inline void BroadcastMulFivefold(const ArithmeticParams& unswitched_params,
                                 const RuntimeShape& unswitched_input1_shape,
                                 const uint8* unswitched_input1_data,
                                 const RuntimeShape& unswitched_input2_shape,
                                 const uint8* unswitched_input2_data,
                                 const RuntimeShape& output_shape,
                                 uint8* output_data) {
  gemmlowp::ScopedProfilingLabel label("BroadcastMulFivefold/8bit");

  // Swapped copy of the params: input offsets exchanged, everything else kept.
  ArithmeticParams switched_params = unswitched_params;
  switched_params.input1_offset = unswitched_params.input2_offset;
  switched_params.input2_offset = unswitched_params.input1_offset;

  const bool use_unswitched =
      unswitched_params.broadcast_category ==
      tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast;

  const ArithmeticParams& params =
      use_unswitched ? unswitched_params : switched_params;
  const uint8* input1_data =
      use_unswitched ? unswitched_input1_data : unswitched_input2_data;
  const uint8* input2_data =
      use_unswitched ? unswitched_input2_data : unswitched_input1_data;

  // Fivefold nested loops. The second input resets its position for each
  // iteration of the second loop. The first input resets its position at the
  // beginning of the fourth loop. The innermost loop is an elementwise Mul of
  // sections of the arrays.
  uint8* output_data_ptr = output_data;
  const uint8* input1_data_ptr = input1_data;
  const uint8* input2_data_reset = input2_data;
  int y0 = params.broadcast_shape[0];
  int y1 = params.broadcast_shape[1];
  int y2 = params.broadcast_shape[2];
  int y3 = params.broadcast_shape[3];
  int y4 = params.broadcast_shape[4];
  if (y4 > 1) {
    // General case: the innermost run has length y4 > 1, so each step is a
    // vectorizable elementwise multiply over y4 elements.
    for (int i0 = 0; i0 < y0; ++i0) {
      const uint8* input2_data_ptr;
      for (int i1 = 0; i1 < y1; ++i1) {
        input2_data_ptr = input2_data_reset;
        for (int i2 = 0; i2 < y2; ++i2) {
          for (int i3 = 0; i3 < y3; ++i3) {
            MulElementwise(y4, params, input1_data_ptr, input2_data_ptr,
                           output_data_ptr);
            input2_data_ptr += y4;
            output_data_ptr += y4;
          }
          // input1 advances only once per i2 iteration: its y4-run is reused
          // across all y3 iterations above.
          input1_data_ptr += y4;
        }
      }
      // Keep input2's position for the next i0 iteration.
      input2_data_reset = input2_data_ptr;
    }
  } else {
    // Degenerate case y4 == 1: each input1 element is a scalar broadcast
    // against a y3-long run of input2, handled by the scalar-broadcast kernel.
    for (int i0 = 0; i0 < y0; ++i0) {
      const uint8* input2_data_ptr;
      for (int i1 = 0; i1 < y1; ++i1) {
        input2_data_ptr = input2_data_reset;
        for (int i2 = 0; i2 < y2; ++i2) {
          MulSimpleBroadcast(y3, params, *input1_data_ptr, input2_data_ptr,
                             output_data_ptr);
          input2_data_ptr += y3;
          output_data_ptr += y3;
          ++input1_data_ptr;
        }
      }
      input2_data_reset = input2_data_ptr;
    }
  }
}
 
 // TODO(jiawen): We can implement BroadcastDiv on buffers of arbitrary
@@ -5376,31 +5577,53 @@
   }
 }
 
-template <typename T>
-inline void PadV2(const T* input_data, const Dims<4>& input_dims,
-                  const std::vector<int>& left_paddings,
-                  const std::vector<int>& right_paddings, T* output_data,
-                  const Dims<4>& output_dims, const T pad_value) {
+// There are two versions of pad: Pad and PadV2.  In PadV2 there is a second
+// scalar input that provides the padding value.  Therefore pad_value_ptr can be
+// equivalent to a simple input1_data.  For Pad, it should point to a zero
+// value.
+//
+// Note that two typenames are required, so that T=P=int32 is considered a
+// specialization distinct from P=int32.
+template <typename T, typename P>
+inline void PadImpl(const tflite::PadParams& op_params,
+                    const RuntimeShape& input_shape, const T* input_data,
+                    const P* pad_value_ptr, const RuntimeShape& output_shape,
+                    T* output_data) {
   gemmlowp::ScopedProfilingLabel label("Pad");
-  TFLITE_DCHECK_EQ(left_paddings.size(), 4);
-  TFLITE_DCHECK_EQ(right_paddings.size(), 4);
+  RuntimeShape ext_input_shape = RuntimeShape::ExtendedShape(4, input_shape);
+  RuntimeShape ext_output_shape = RuntimeShape::ExtendedShape(4, output_shape);
+  TFLITE_DCHECK_LE(op_params.left_padding_count, 4);
+  TFLITE_DCHECK_LE(op_params.right_padding_count, 4);
 
-  const int output_batch = ArraySize(output_dims, 3);
-  const int output_height = ArraySize(output_dims, 2);
-  const int output_width = ArraySize(output_dims, 1);
-  const int output_depth = ArraySize(output_dims, 0);
+  // Runtime calls are currently fixed at 4 dimensions. Copy inputs so
+  // we can pad them to 4 dims (yes, we are "padding the padding").
+  std::vector<int> left_padding_copy(4, 0);
+  for (int i = 0; i < op_params.left_padding_count; ++i) {
+    left_padding_copy[i] = op_params.left_padding[i];
+  }
+  std::vector<int> right_padding_copy(4, 0);
+  for (int i = 0; i < op_params.right_padding_count; ++i) {
+    right_padding_copy[i] = op_params.right_padding[i];
+  }
 
-  const int left_b_padding = left_paddings[3];
-  const int left_h_padding = left_paddings[2];
-  const int left_w_padding = left_paddings[1];
-  const int left_d_padding = left_paddings[0];
+  const int output_batch = ext_output_shape.Dims(0);
+  const int output_height = ext_output_shape.Dims(1);
+  const int output_width = ext_output_shape.Dims(2);
+  const int output_depth = ext_output_shape.Dims(3);
 
-  const int right_b_padding = right_paddings[3];
-  const int right_h_padding = right_paddings[2];
-  const int right_w_padding = right_paddings[1];
-  const int right_d_padding = right_paddings[0];
+  const int left_b_padding = left_padding_copy[0];
+  const int left_h_padding = left_padding_copy[1];
+  const int left_w_padding = left_padding_copy[2];
+  const int left_d_padding = left_padding_copy[3];
 
-  const int input_depth = ArraySize(input_dims, 0);
+  const int right_b_padding = right_padding_copy[0];
+  const int right_h_padding = right_padding_copy[1];
+  const int right_w_padding = right_padding_copy[2];
+  const int right_d_padding = right_padding_copy[3];
+
+  const int input_depth = ext_input_shape.Dims(3);
+  // const T pad_value = ExtractFloatOrInt<T>(op_params.pad_value);
+  const T pad_value = *pad_value_ptr;
 
   if (left_b_padding != 0) {
     TypedMemset<T>(
@@ -5410,61 +5633,113 @@
   for (int out_b = left_b_padding; out_b < output_batch - right_b_padding;
        ++out_b) {
     if (left_h_padding != 0) {
-      TypedMemset<T>(output_data + Offset(output_dims, 0, 0, 0, out_b),
+      TypedMemset<T>(output_data + Offset(ext_output_shape, out_b, 0, 0, 0),
                      pad_value, left_h_padding * output_width * output_depth);
     }
     for (int out_h = left_h_padding; out_h < output_height - right_h_padding;
          ++out_h) {
       if (left_w_padding != 0) {
-        TypedMemset<T>(output_data + Offset(output_dims, 0, 0, out_h, out_b),
-                       pad_value, left_w_padding * output_depth);
+        TypedMemset<T>(
+            output_data + Offset(ext_output_shape, out_b, out_h, 0, 0),
+            pad_value, left_w_padding * output_depth);
       }
       for (int out_w = left_w_padding; out_w < output_width - right_w_padding;
            ++out_w) {
         if (left_d_padding != 0) {
           TypedMemset<T>(
-              output_data + Offset(output_dims, 0, out_w, out_h, out_b),
+              output_data + Offset(ext_output_shape, out_b, out_h, out_w, 0),
               pad_value, left_d_padding);
         }
 
         T* out = output_data +
-                 Offset(output_dims, left_d_padding, out_w, out_h, out_b);
-        const T* in =
-            input_data + Offset(input_dims, 0, out_w - left_w_padding,
-                                out_h - left_h_padding, out_b - left_b_padding);
+                 Offset(ext_output_shape, out_b, out_h, out_w, left_d_padding);
+        const T* in = input_data +
+                      Offset(ext_input_shape, out_b - left_b_padding,
+                             out_h - left_h_padding, out_w - left_w_padding, 0);
         memcpy(out, in, input_depth * sizeof(T));
 
         if (right_d_padding != 0) {
           TypedMemset<T>(
-              output_data + Offset(output_dims, output_depth - right_d_padding,
-                                   out_w, out_h, out_b),
+              output_data + Offset(ext_output_shape, out_b, out_h, out_w,
+                                   output_depth - right_d_padding),
               pad_value, right_d_padding);
         }
       }
       if (right_w_padding != 0) {
-        TypedMemset<T>(
-            output_data + Offset(output_dims, 0, output_width - right_w_padding,
-                                 out_h, out_b),
-            pad_value, right_w_padding * output_depth);
+        TypedMemset<T>(output_data + Offset(ext_output_shape, out_b, out_h,
+                                            output_width - right_w_padding, 0),
+                       pad_value, right_w_padding * output_depth);
       }
     }
     if (right_h_padding != 0) {
       TypedMemset<T>(
-          output_data +
-              Offset(output_dims, 0, 0, output_height - right_h_padding, out_b),
+          output_data + Offset(ext_output_shape, out_b,
+                               output_height - right_h_padding, 0, 0),
           pad_value, right_h_padding * output_width * output_depth);
     }
   }
   if (right_b_padding != 0) {
     TypedMemset<T>(
         output_data +
-            Offset(output_dims, 0, 0, 0, output_batch - right_b_padding),
+            Offset(ext_output_shape, output_batch - right_b_padding, 0, 0, 0),
         pad_value,
         right_b_padding * output_height * output_width * output_depth);
   }
 }
 
-// Legacy Pad() method that casts an int32_t to T before padding.
// Public Pad entry point: forwards directly to PadImpl. The pad value is
// passed by pointer so PadV2's scalar second input can be used directly.
template <typename T, typename P>
inline void Pad(const tflite::PadParams& op_params,
                const RuntimeShape& input_shape, const T* input_data,
                const P* pad_value_ptr, const RuntimeShape& output_shape,
                T* output_data) {
  PadImpl(op_params, input_shape, input_data, pad_value_ptr, output_shape,
          output_data);
}
+
// The second (pad-value) input can be int32 when, say, the first is uint8.
// Converts the int32 pad value to T before delegating, so PadImpl always
// memsets with the tensor's own element type.
template <typename T>
inline void Pad(const tflite::PadParams& op_params,
                const RuntimeShape& input_shape, const T* input_data,
                const int32* pad_value_ptr, const RuntimeShape& output_shape,
                T* output_data) {
  const T converted_pad_value = static_cast<T>(*pad_value_ptr);
  PadImpl(op_params, input_shape, input_data, &converted_pad_value,
          output_shape, output_data);
}
+
// This version avoids conflicting template matching: with T = int32 both the
// generic <T, P> overload and the int32-pad-value overload would apply, so an
// explicit full specialization disambiguates and forwards without conversion.
template <>
inline void Pad(const tflite::PadParams& op_params,
                const RuntimeShape& input_shape, const int32* input_data,
                const int32* pad_value_ptr, const RuntimeShape& output_shape,
                int32* output_data) {
  PadImpl(op_params, input_shape, input_data, pad_value_ptr, output_shape,
          output_data);
}
+
// Legacy signature, function covered both Pad and PadV2. Converts the old
// Dims<4> + vector<int> padding representation into PadParams. Note the
// 3 - i index flip: Dims<4> stores dimensions in reverse order relative to
// RuntimeShape, so paddings are reversed to match.
template <typename T>
inline void PadV2(const T* input_data, const Dims<4>& input_dims,
                  const std::vector<int>& left_paddings,
                  const std::vector<int>& right_paddings, T* output_data,
                  const Dims<4>& output_dims, const T pad_value) {
  TFLITE_DCHECK_EQ(left_paddings.size(), 4);
  TFLITE_DCHECK_EQ(right_paddings.size(), 4);
  tflite::PadParams op_params;
  op_params.left_padding_count = 4;
  op_params.right_padding_count = 4;
  for (int i = 0; i < 4; ++i) {
    op_params.left_padding[i] = left_paddings[3 - i];
    op_params.right_padding[i] = right_paddings[3 - i];
  }
  // SetFloatOrInt(pad_value, &op_params.pad_value);
  // The pad value is passed by pointer (see PadImpl), so take a local copy of
  // the by-value parameter to have an address to hand over.
  const T pad_value_copy = pad_value;

  Pad(op_params, DimsToShape(input_dims), input_data, &pad_value_copy,
      DimsToShape(output_dims), output_data);
}
+
+// Old Pad that calls legacy PadV2.
 template <typename T>
 inline void Pad(const T* input_data, const Dims<4>& input_dims,
                 const std::vector<int>& left_paddings,
@@ -5475,34 +5750,45 @@
            output_dims, converted_pad_value);
 }
 
// Old Pad that only padded with 0. Delegates to the legacy PadV2 with a
// zero pad value of the tensor's element type.
template <typename T>
inline void Pad(const T* input_data, const Dims<4>& input_dims,
                const std::vector<int>& left_paddings,
                const std::vector<int>& right_paddings, T* output_data,
                const Dims<4>& output_dims) {
  const T pad_value = static_cast<T>(0);
  PadV2<T>(input_data, input_dims, left_paddings, right_paddings, output_data,
           output_dims, pad_value);
}
 
 template <typename T>
-inline void Slice(const T* input_data, const Dims<4>& input_dims,
-                  const std::vector<int>& begin, const std::vector<int>& size,
-                  T* output_data, const Dims<4>& output_dims) {
-  // TODO(dkalenichenko): This op only supports 4D tensors.
-  TFLITE_DCHECK_EQ(begin.size(), 4);
-  TFLITE_DCHECK_EQ(size.size(), 4);
-  const int start_b = begin[3];
-  const int stop_b =
-      size[3] == -1 ? input_dims.sizes[3] - start_b : start_b + size[3];
-  const int start_h = begin[2];
-  const int stop_h =
-      size[2] == -1 ? input_dims.sizes[2] - start_h : start_h + size[2];
-  const int start_w = begin[1];
-  const int stop_w =
-      size[1] == -1 ? input_dims.sizes[1] - start_w : start_w + size[1];
-  const int start_d = begin[0];
-  const int stop_d =
-      size[0] == -1 ? input_dims.sizes[0] - start_d : start_d + size[0];
+inline void Slice(const tflite::SliceParams& op_params,
+                  const RuntimeShape& input_shape, const T* input_data,
+                  const RuntimeShape& output_shape, T* output_data) {
+  gemmlowp::ScopedProfilingLabel label("Slice");
+  RuntimeShape ext_shape = RuntimeShape::ExtendedShape(4, input_shape);
+  // TODO(dkalenichenko): This op only supports 4D tensors or smaller.
+  TFLITE_DCHECK_LE(op_params.begin_count, 4);
+  TFLITE_DCHECK_LE(op_params.size_count, 4);
+  const int begin_count = op_params.begin_count;
+  const int size_count = op_params.size_count;
+  // We front-pad the begin and size vectors.
+  const int start_b = 4 - begin_count > 0 ? 0 : op_params.begin[0];
+  const int stop_b = (4 - size_count > 0 || op_params.size[0] == -1)
+                         ? ext_shape.Dims(0) - start_b
+                         : start_b + op_params.size[0];
+  const int start_h = begin_count < 3 ? 0 : op_params.begin[begin_count - 3];
+  const int stop_h = (size_count < 3 || op_params.size[size_count - 3] == -1)
+                         ? ext_shape.Dims(1) - start_h
+                         : start_h + op_params.size[size_count - 3];
+  const int start_w = begin_count < 2 ? 0 : op_params.begin[begin_count - 2];
+  const int stop_w = (size_count < 2 || op_params.size[size_count - 2] == -1)
+                         ? ext_shape.Dims(2) - start_w
+                         : start_w + op_params.size[size_count - 2];
+  const int start_d = begin_count < 1 ? 0 : op_params.begin[begin_count - 1];
+  const int stop_d = (size_count < 1 || op_params.size[size_count - 1] == -1)
+                         ? ext_shape.Dims(3) - start_d
+                         : start_d + op_params.size[size_count - 1];
 
   T* out_ptr = output_data;
   for (int in_b = start_b; in_b < stop_b; ++in_b) {
@@ -5510,7 +5796,7 @@
       for (int in_w = start_w; in_w < stop_w; ++in_w) {
         const int len = stop_d - start_d;
         memcpy(out_ptr,
-               input_data + Offset(input_dims, start_d, in_w, in_h, in_b),
+               input_data + Offset(ext_shape, in_b, in_h, in_w, start_d),
                len * sizeof(T));
         out_ptr += len;
       }
@@ -5519,25 +5805,57 @@
 }
 
// Legacy Slice signature: converts Dims<4> + begin/size vectors into
// SliceParams. As with PadV2, the 3 - i index flip reverses the legacy
// dimension order to match RuntimeShape's convention.
template <typename T>
inline void Slice(const T* input_data, const Dims<4>& input_dims,
                  const std::vector<int>& begin, const std::vector<int>& size,
                  T* output_data, const Dims<4>& output_dims) {
  tflite::SliceParams op_params;
  op_params.begin_count = 4;
  op_params.size_count = 4;
  for (int i = 0; i < 4; ++i) {
    op_params.begin[i] = begin[3 - i];
    op_params.size[i] = size[3 - i];
  }

  Slice(op_params, DimsToShape(input_dims), input_data,
        DimsToShape(output_dims), output_data);
}
+
+template <typename T>
+void Minimum(const RuntimeShape& input1_shape, const T* input1_data,
+             const T* input2_data, const RuntimeShape& output_shape,
+             T* output_data) {
+  gemmlowp::ScopedProfilingLabel label("TensorFlowMinimum");
+  auto input1_map = MapAsVector(input1_data, input1_shape);
+  auto output_map = MapAsVector(output_data, output_shape);
+  auto min_value = input2_data[0];
+  output_map.array() = input1_map.array().min(min_value);
+}
+
+template <typename T>
+void Maximum(const RuntimeShape& input1_shape, const T* input1_data,
+             const T* input2_data, const RuntimeShape& output_shape,
+             T* output_data) {
+  gemmlowp::ScopedProfilingLabel label("TensorFlowMaximum");
+  auto input1_map = MapAsVector(input1_data, input1_shape);
+  auto output_map = MapAsVector(output_data, output_shape);
+  auto max_value = input2_data[0];
+  output_map.array() = input1_map.array().max(max_value);
+}
+
+template <typename T>
 void TensorFlowMinimum(const T* input1_data, const Dims<4>& input1_dims,
                        const T* input2_data, T* output_data,
                        const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("TensorFlowMinimum");
-  auto input1_map = MapAsVector(input1_data, input1_dims);
-  auto output_map = MapAsVector(output_data, output_dims);
-  auto min_value = input2_data[0];
-  output_map.array() = input1_map.array().min(min_value);
+  Minimum(DimsToShape(input1_dims), input1_data, input2_data,
+          DimsToShape(output_dims), output_data);
 }
 
// Legacy Dims<4>-based wrapper around Maximum; converts shapes and forwards.
template <typename T>
void TensorFlowMaximum(const T* input1_data, const Dims<4>& input1_dims,
                       const T* input2_data, T* output_data,
                       const Dims<4>& output_dims) {
  Maximum(DimsToShape(input1_dims), input1_data, input2_data,
          DimsToShape(output_dims), output_data);
}
 
 template <typename T>
diff --git a/tensorflow/contrib/lite/kernels/internal/quantization_util_test.cc b/tensorflow/contrib/lite/kernels/internal/quantization_util_test.cc
index 94773b4..00fc3e9 100644
--- a/tensorflow/contrib/lite/kernels/internal/quantization_util_test.cc
+++ b/tensorflow/contrib/lite/kernels/internal/quantization_util_test.cc
@@ -130,22 +130,22 @@
 }
 
// Exercises SafeCast from both float and double to every standard
// fixed-width integer type (the <cstdint> _t spellings rather than the
// TF-style aliases).
TEST(QuantizationUtilTest, SafeCast) {
  RunSafeCastTests<float, int8_t>();
  RunSafeCastTests<double, int8_t>();
  RunSafeCastTests<float, int16_t>();
  RunSafeCastTests<double, int16_t>();
  RunSafeCastTests<float, int32_t>();
  RunSafeCastTests<double, int32_t>();
  RunSafeCastTests<float, int64_t>();
  RunSafeCastTests<double, int64_t>();
  RunSafeCastTests<float, uint8_t>();
  RunSafeCastTests<double, uint8_t>();
  RunSafeCastTests<float, uint16_t>();
  RunSafeCastTests<double, uint16_t>();
  RunSafeCastTests<float, uint32_t>();
  RunSafeCastTests<double, uint32_t>();
  RunSafeCastTests<float, uint64_t>();
  RunSafeCastTests<double, uint64_t>();
}
 
 // Example taken from http://www.tensorflow.org/performance/quantization
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/legacy_reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/legacy_reference_ops.h
index bcf5e4e..b862ae3 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/legacy_reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/legacy_reference_ops.h
@@ -26,11 +26,6 @@
 
 namespace reference_ops {
 
-inline RuntimeShape DimsToShape(const tflite::Dims<4>& dims) {
-  return RuntimeShape(
-      {dims.sizes[3], dims.sizes[2], dims.sizes[1], dims.sizes[0]});
-}
-
 template <FusedActivationFunctionType Ac>
 void L2Normalization(const float* input_data, const Dims<4>& input_dims,
                      float* output_data, const Dims<4>& output_dims) {
@@ -316,6 +311,37 @@
               DimsToShape(output_dims), output_data);
 }
 
+inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
+                         int32 input1_offset, const uint8* input2_data,
+                         const Dims<4>& input2_dims, int32 input2_offset,
+                         int32 output_offset, int32 output_multiplier,
+                         int output_shift, int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         const Dims<4>& output_dims) {
+  BroadcastMul4DSlow(
+      input1_data, input1_dims, input1_offset, input2_data, input2_dims,
+      input2_offset, output_offset, output_multiplier,
+      //
+      kReverseShift * output_shift,
+      //
+      output_activation_min, output_activation_max, output_data, output_dims);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
+                         int32 input1_offset, const uint8* input2_data,
+                         const Dims<4>& input2_dims, int32 input2_offset,
+                         int32 output_offset, int32 output_multiplier,
+                         int output_shift, int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         const Dims<4>& output_dims) {
+  BroadcastMul(input1_data, input1_dims, input1_offset, input2_data,
+               input2_dims, input2_offset, output_offset, output_multiplier,
+               output_shift, output_activation_min, output_activation_max,
+               output_data, output_dims);
+}
+
 // legacy, for compatibility with old checked-in code
 template <FusedActivationFunctionType Ac>
 void AveragePool(const float* input_data, const Dims<4>& input_dims,
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.cc b/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.cc
index a5f4add..aa93e85 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.cc
+++ b/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.cc
@@ -73,10 +73,12 @@
   for (int b = 0; b < n_batch; b++) {
     const float* matrix_ptr = matrix;
     for (int r = 0; r < m_rows; r++) {
+      float dot_prod = 0.0f;
       const float* vector_in_batch = vector + b * m_cols;
       for (int c = 0; c < m_cols; c++) {
-        *result_in_batch += *matrix_ptr++ * *vector_in_batch++;
+        dot_prod += *matrix_ptr++ * *vector_in_batch++;
       }
+      *result_in_batch += dot_prod;
       result_in_batch += result_stride;
     }
   }
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index ace3af2..5634b83 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -105,6 +105,11 @@
 // Used mainly to convert from old-style shifts (right) to new-style (left).
 static constexpr int kReverseShift = -1;
 
+inline RuntimeShape DimsToShape(const tflite::Dims<4>& dims) {
+  return RuntimeShape(
+      {dims.sizes[3], dims.sizes[2], dims.sizes[1], dims.sizes[0]});
+}
+
 template <typename T>
 int CountLeadingZeros(T integer_input) {
   static_assert(std::is_unsigned<T>::value,
@@ -271,12 +276,12 @@
                  int32 input_offset, const uint8* filter_data,
                  const Dims<4>& filter_dims, int32 filter_offset,
                  const int32* bias_data, const Dims<4>& bias_dims,
-                 int stride_width, int stride_height, int pad_width,
-                 int pad_height, int32 output_offset, int32 output_multiplier,
-                 int output_shift, int32 output_activation_min,
-                 int32 output_activation_max, uint8* output_data,
-                 const Dims<4>& output_dims, uint8* im2col_data,
-                 const Dims<4>& im2col_dims,
+                 int stride_width, int stride_height, int dilation_width_factor,
+                 int dilation_height_factor, int pad_width, int pad_height,
+                 int32 output_offset, int32 output_multiplier, int output_shift,
+                 int32 output_activation_min, int32 output_activation_max,
+                 uint8* output_data, const Dims<4>& output_dims,
+                 uint8* im2col_data, const Dims<4>& im2col_dims,
                  gemmlowp::GemmContext* gemm_context) {
   (void)im2col_data;   // only used in optimized code.
   (void)im2col_dims;   // only used in optimized code.
@@ -302,8 +307,9 @@
           for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
             for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
               for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
-                const int in_x = in_x_origin + filter_x;
-                const int in_y = in_y_origin + filter_y;
+                const int in_x = in_x_origin + dilation_width_factor * filter_x;
+                const int in_y =
+                    in_y_origin + dilation_height_factor * filter_y;
                 // If the location is outside the bounds of the input image,
                 // use zero as a default value.
                 if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
@@ -335,6 +341,24 @@
   }
 }
 
+inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
+                 int32 input_offset, const uint8* filter_data,
+                 const Dims<4>& filter_dims, int32 filter_offset,
+                 const int32* bias_data, const Dims<4>& bias_dims,
+                 int stride_width, int stride_height, int pad_width,
+                 int pad_height, int32 output_offset, int32 output_multiplier,
+                 int output_shift, int32 output_activation_min,
+                 int32 output_activation_max, uint8* output_data,
+                 const Dims<4>& output_dims, uint8* im2col_data,
+                 const Dims<4>& im2col_dims,
+                 gemmlowp::GemmContext* gemm_context) {
+  Conv(input_data, input_dims, input_offset, filter_data, filter_dims,
+       filter_offset, bias_data, bias_dims, stride_width, stride_height, 1, 1,
+       pad_width, pad_height, output_offset, output_multiplier, output_shift,
+       output_activation_min, output_activation_max, output_data, output_dims,
+       im2col_data, im2col_dims, gemm_context);
+}
+
 // legacy, for compatibility with old checked-in code
 template <FusedActivationFunctionType Ac>
 inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
@@ -546,8 +570,8 @@
       if (bias_data) {
         acc += bias_data[Offset(bias_dims, out_c, 0, 0, 0)];
       }
-      acc = MultiplyByQuantizedMultiplierSmallerThanOneExp(
-          acc, output_multiplier, kReverseShift * output_shift);
+      acc = MultiplyByQuantizedMultiplier(acc, output_multiplier,
+                                          kReverseShift * output_shift);
       acc += output_offset;
       acc = std::max(acc, output_activation_min);
       acc = std::min(acc, output_activation_max);
@@ -1374,13 +1398,144 @@
                output_dims);
 }
 
-inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
-                         int32 input1_offset, const uint8* input2_data,
-                         const Dims<4>& input2_dims, int32 input2_offset,
-                         int32 output_offset, int32 output_multiplier,
-                         int output_shift, int32 output_activation_min,
-                         int32 output_activation_max, uint8* output_data,
-                         const Dims<4>& output_dims) {
+// Element-wise mul that can often be used for inner loop of broadcast Mul as
+// well as the non-broadcast Mul.
+inline void MulElementwise(int size, const ArithmeticParams& params,
+                           const uint8* input1_data, const uint8* input2_data,
+                           uint8* output_data) {
+  for (int i = 0; i < size; ++i) {
+    const int32 input1_val = params.input1_offset + input1_data[i];
+    const int32 input2_val = params.input2_offset + input2_data[i];
+    const int32 unclamped_result =
+        params.output_offset +
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(input1_val * input2_val,
+                                                       params.output_multiplier,
+                                                       params.output_shift);
+    const int32 clamped_output =
+        std::min(params.quantized_activation_max,
+                 std::max(params.quantized_activation_min, unclamped_result));
+    output_data[i] = static_cast<uint8>(clamped_output);
+  }
+}
+
+inline void Mul(const ArithmeticParams& params,
+                const RuntimeShape& input1_shape, const uint8* input1_data,
+                const RuntimeShape& input2_shape, const uint8* input2_data,
+                const RuntimeShape& output_shape, uint8* output_data) {
+  TFLITE_DCHECK_LE(params.quantized_activation_min,
+                   params.quantized_activation_max);
+  gemmlowp::ScopedProfilingLabel label("Mul/8bit");
+  const int flat_size =
+      MatchingFlatSize(input1_shape, input2_shape, output_shape);
+
+  MulElementwise(flat_size, params, input1_data, input2_data, output_data);
+}
+
+inline void BroadcastMulFivefold(const ArithmeticParams& unswitched_params,
+                                 const RuntimeShape& unswitched_input1_shape,
+                                 const uint8* unswitched_input1_data,
+                                 const RuntimeShape& unswitched_input2_shape,
+                                 const uint8* unswitched_input2_data,
+                                 const RuntimeShape& output_shape,
+                                 uint8* output_data) {
+  ArithmeticParams switched_params = unswitched_params;
+  switched_params.input1_offset = unswitched_params.input2_offset;
+  switched_params.input2_offset = unswitched_params.input1_offset;
+
+  const bool use_unswitched =
+      unswitched_params.broadcast_category ==
+      tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast;
+
+  const ArithmeticParams& params =
+      use_unswitched ? unswitched_params : switched_params;
+  const uint8* input1_data =
+      use_unswitched ? unswitched_input1_data : unswitched_input2_data;
+  const uint8* input2_data =
+      use_unswitched ? unswitched_input2_data : unswitched_input1_data;
+
+  // Fivefold nested loops. The second input resets its position for each
+  // iteration of the second loop. The first input resets its position at the
+  // beginning of the fourth loop. The innermost loop is an elementwise Mul of
+  // sections of the arrays.
+  uint8* output_data_ptr = output_data;
+  const uint8* input1_data_ptr = input1_data;
+  const uint8* input2_data_reset = input2_data;
+  int y0 = params.broadcast_shape[0];
+  int y1 = params.broadcast_shape[1];
+  int y2 = params.broadcast_shape[2];
+  int y3 = params.broadcast_shape[3];
+  int y4 = params.broadcast_shape[4];
+  for (int i0 = 0; i0 < y0; ++i0) {
+    const uint8* input2_data_ptr;
+    for (int i1 = 0; i1 < y1; ++i1) {
+      input2_data_ptr = input2_data_reset;
+      for (int i2 = 0; i2 < y2; ++i2) {
+        for (int i3 = 0; i3 < y3; ++i3) {
+          MulElementwise(y4, params, input1_data_ptr, input2_data_ptr,
+                         output_data_ptr);
+          input2_data_ptr += y4;
+          output_data_ptr += y4;
+        }
+        input1_data_ptr += y4;
+      }
+    }
+    input2_data_reset = input2_data_ptr;
+  }
+}
+
+inline void BroadcastMul4DSlow(const ArithmeticParams& params,
+                               const RuntimeShape& input1_shape,
+                               const uint8* input1_data,
+                               const RuntimeShape& input2_shape,
+                               const uint8* input2_data,
+                               const RuntimeShape& output_shape,
+                               uint8* output_data) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastMul4DSlow/8bit");
+
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  // The input shapes are extended as part of NdArrayDesc initialization.
+  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
+                                      &desc2);
+  RuntimeShape extended_output_shape =
+      RuntimeShape::ExtendedShape(4, output_shape);
+
+  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
+    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
+      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
+        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
+          const int32 input1_val =
+              params.input1_offset +
+              input1_data[SubscriptToIndex(desc1, b, y, x, c)];
+          const int32 input2_val =
+              params.input2_offset +
+              input2_data[SubscriptToIndex(desc2, b, y, x, c)];
+          const int32 unclamped_result =
+              params.output_offset +
+              MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                  input1_val * input2_val, params.output_multiplier,
+                  params.output_shift);
+          const int32 clamped_output = std::min(
+              params.quantized_activation_max,
+              std::max(params.quantized_activation_min, unclamped_result));
+          output_data[Offset(extended_output_shape, b, y, x, c)] =
+              static_cast<uint8>(clamped_output);
+        }
+      }
+    }
+  }
+}
+
+// Transitional version that will be moved shortly to legacy_reference_ops, as
+// part of RuntimeShape revisions.
+inline void BroadcastMul4DSlow(const uint8* input1_data,
+                               const Dims<4>& input1_dims, int32 input1_offset,
+                               const uint8* input2_data,
+                               const Dims<4>& input2_dims, int32 input2_offset,
+                               int32 output_offset, int32 output_multiplier,
+                               int output_shift, int32 output_activation_min,
+                               int32 output_activation_max, uint8* output_data,
+                               const Dims<4>& output_dims) {
   gemmlowp::ScopedProfilingLabel label("BroadcastMul/8bit");
 
   NdArrayDesc<4> desc1;
@@ -1407,9 +1562,9 @@
           const int32 input2_val =
               input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
           const int32 unclamped_result =
-              output_offset + MultiplyByQuantizedMultiplierSmallerThanOneExp(
-                                  input1_val * input2_val, output_multiplier,
-                                  kReverseShift * output_shift);
+              output_offset +
+              MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                  input1_val * input2_val, output_multiplier, output_shift);
           const int32 clamped_output =
               std::min(output_activation_max,
                        std::max(output_activation_min, unclamped_result));
@@ -1464,21 +1619,6 @@
   }
 }
 
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac>
-inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
-                         int32 input1_offset, const uint8* input2_data,
-                         const Dims<4>& input2_dims, int32 input2_offset,
-                         int32 output_offset, int32 output_multiplier,
-                         int output_shift, int32 output_activation_min,
-                         int32 output_activation_max, uint8* output_data,
-                         const Dims<4>& output_dims) {
-  BroadcastMul(input1_data, input1_dims, input1_offset, input2_data,
-               input2_dims, input2_offset, output_offset, output_multiplier,
-               output_shift, output_activation_min, output_activation_max,
-               output_data, output_dims);
-}
-
 // TODO(jiawen): We can implement BroadcastDiv on buffers of arbitrary
 // dimensionality if the runtime code does a single loop over one dimension
 // that handles broadcasting as the base case. The code generator would then
@@ -3370,28 +3510,50 @@
   }
 }
 
-template <typename T>
-inline void PadV2(const T* input_data, const Dims<4>& input_dims,
-                  const std::vector<int>& left_paddings,
-                  const std::vector<int>& right_paddings, T* output_data,
-                  const Dims<4>& output_dims, const T pad_value) {
-  TFLITE_DCHECK_EQ(left_paddings.size(), 4);
-  TFLITE_DCHECK_EQ(right_paddings.size(), 4);
+// There are two versions of pad: Pad and PadV2.  In PadV2 there is a second
+// scalar input that provides the padding value.  Therefore pad_value_ptr can be
+// equivalent to a simple input1_data.  For Pad, it should point to a zero
+// value.
+//
+// Note that two typenames are required, so that T=P=int32 is considered a
+// specialization distinct from P=int32.
+template <typename T, typename P>
+inline void PadImpl(const tflite::PadParams& op_params,
+                    const RuntimeShape& input_shape, const T* input_data,
+                    const P* pad_value_ptr, const RuntimeShape& output_shape,
+                    T* output_data) {
+  RuntimeShape ext_input_shape = RuntimeShape::ExtendedShape(4, input_shape);
+  RuntimeShape ext_output_shape = RuntimeShape::ExtendedShape(4, output_shape);
+  TFLITE_DCHECK_LE(op_params.left_padding_count, 4);
+  TFLITE_DCHECK_LE(op_params.right_padding_count, 4);
 
-  const int output_batch = ArraySize(output_dims, 3);
-  const int output_height = ArraySize(output_dims, 2);
-  const int output_width = ArraySize(output_dims, 1);
-  const int output_depth = ArraySize(output_dims, 0);
+  // Runtime calls are currently fixed at 4 dimensions. Copy inputs so
+  // we can pad them to 4 dims (yes, we are "padding the padding").
+  std::vector<int> left_padding_copy(4, 0);
+  for (int i = 0; i < op_params.left_padding_count; ++i) {
+    left_padding_copy[i] = op_params.left_padding[i];
+  }
+  std::vector<int> right_padding_copy(4, 0);
+  for (int i = 0; i < op_params.right_padding_count; ++i) {
+    right_padding_copy[i] = op_params.right_padding[i];
+  }
 
-  const int left_b_padding = left_paddings[3];
-  const int left_h_padding = left_paddings[2];
-  const int left_w_padding = left_paddings[1];
-  const int left_d_padding = left_paddings[0];
+  const int output_batch = ext_output_shape.Dims(0);
+  const int output_height = ext_output_shape.Dims(1);
+  const int output_width = ext_output_shape.Dims(2);
+  const int output_depth = ext_output_shape.Dims(3);
 
-  const int right_b_padding = right_paddings[3];
-  const int right_h_padding = right_paddings[2];
-  const int right_w_padding = right_paddings[1];
-  const int right_d_padding = right_paddings[0];
+  const int left_b_padding = left_padding_copy[0];
+  const int left_h_padding = left_padding_copy[1];
+  const int left_w_padding = left_padding_copy[2];
+  const int left_d_padding = left_padding_copy[3];
+
+  const int right_b_padding = right_padding_copy[0];
+  const int right_h_padding = right_padding_copy[1];
+  const int right_w_padding = right_padding_copy[2];
+  const int right_d_padding = right_padding_copy[3];
+
+  const T pad_value = *pad_value_ptr;
 
   const T* in_ptr = input_data;
   T* out_ptr = output_data;
@@ -3417,7 +3579,59 @@
   }
 }
 
-// Legacy Pad() method that casts an int32_t to T before padding.
+template <typename T, typename P>
+inline void Pad(const tflite::PadParams& op_params,
+                const RuntimeShape& input_shape, const T* input_data,
+                const P* pad_value_ptr, const RuntimeShape& output_shape,
+                T* output_data) {
+  PadImpl(op_params, input_shape, input_data, pad_value_ptr, output_shape,
+          output_data);
+}
+
+// The second (pad-value) input can be int32 when, say, the first is uint8.
+template <typename T>
+inline void Pad(const tflite::PadParams& op_params,
+                const RuntimeShape& input_shape, const T* input_data,
+                const int32* pad_value_ptr, const RuntimeShape& output_shape,
+                T* output_data) {
+  const T converted_pad_value = static_cast<T>(*pad_value_ptr);
+  PadImpl(op_params, input_shape, input_data, &converted_pad_value,
+          output_shape, output_data);
+}
+
+// This version avoids conflicting template matching.
+template <>
+inline void Pad(const tflite::PadParams& op_params,
+                const RuntimeShape& input_shape, const int32* input_data,
+                const int32* pad_value_ptr, const RuntimeShape& output_shape,
+                int32* output_data) {
+  PadImpl(op_params, input_shape, input_data, pad_value_ptr, output_shape,
+          output_data);
+}
+
+// Legacy signature, function covered both Pad and PadV2.
+template <typename T>
+inline void PadV2(const T* input_data, const Dims<4>& input_dims,
+                  const std::vector<int>& left_paddings,
+                  const std::vector<int>& right_paddings, T* output_data,
+                  const Dims<4>& output_dims, const T pad_value) {
+  TFLITE_DCHECK_EQ(left_paddings.size(), 4);
+  TFLITE_DCHECK_EQ(right_paddings.size(), 4);
+  tflite::PadParams op_params;
+  op_params.left_padding_count = 4;
+  op_params.right_padding_count = 4;
+  for (int i = 0; i < 4; ++i) {
+    op_params.left_padding[i] = left_paddings[3 - i];
+    op_params.right_padding[i] = right_paddings[3 - i];
+  }
+  // SetFloatOrInt(pad_value, &op_params.pad_value);
+  const T pad_value_copy = pad_value;
+
+  Pad(op_params, DimsToShape(input_dims), input_data, &pad_value_copy,
+      DimsToShape(output_dims), output_data);
+}
+
+// Old Pad that calls legacy PadV2.
 template <typename T>
 inline void Pad(const T* input_data, const Dims<4>& input_dims,
                 const std::vector<int>& left_paddings,
@@ -3428,13 +3642,15 @@
            output_dims, converted_pad_value);
 }
 
+// Old Pad that only padded with 0.
 template <typename T>
 inline void Pad(const T* input_data, const Dims<4>& input_dims,
                 const std::vector<int>& left_paddings,
                 const std::vector<int>& right_paddings, T* output_data,
                 const Dims<4>& output_dims) {
-  Pad(input_data, input_dims, left_paddings, right_paddings, output_data,
-      output_dims, 0);
+  const T pad_value = static_cast<T>(0);
+  PadV2<T>(input_data, input_dims, left_paddings, right_paddings, output_data,
+           output_dims, pad_value);
 }
 
 template <typename T>
@@ -3491,31 +3707,39 @@
 }
 
 template <typename T>
-inline void Slice(const T* input_data, const Dims<4>& input_dims,
-                  const std::vector<int>& begin, const std::vector<int>& size,
-                  T* output_data, const Dims<4>& output_dims) {
-  // TODO(dkalenichenko): This op only supports 4D tensors.
-  TFLITE_DCHECK_EQ(begin.size(), 4);
-  TFLITE_DCHECK_EQ(size.size(), 4);
-  const int start_b = begin[3];
-  const int stop_b =
-      size[3] == -1 ? input_dims.sizes[3] - start_b : start_b + size[3];
-  const int start_h = begin[2];
-  const int stop_h =
-      size[2] == -1 ? input_dims.sizes[2] - start_h : start_h + size[2];
-  const int start_w = begin[1];
-  const int stop_w =
-      size[1] == -1 ? input_dims.sizes[1] - start_w : start_w + size[1];
-  const int start_d = begin[0];
-  const int stop_d =
-      size[0] == -1 ? input_dims.sizes[0] - start_d : start_d + size[0];
+inline void Slice(const tflite::SliceParams& op_params,
+                  const RuntimeShape& input_shape, const T* input_data,
+                  const RuntimeShape& output_shape, T* output_data) {
+  RuntimeShape ext_shape = RuntimeShape::ExtendedShape(4, input_shape);
+  // TODO(dkalenichenko): This op only supports 4D tensors or smaller.
+  TFLITE_DCHECK_LE(op_params.begin_count, 4);
+  TFLITE_DCHECK_LE(op_params.size_count, 4);
+  const int begin_count = op_params.begin_count;
+  const int size_count = op_params.size_count;
+  // We front-pad the begin and size vectors.
+  const int start_b = 4 - begin_count > 0 ? 0 : op_params.begin[0];
+  const int stop_b = (4 - size_count > 0 || op_params.size[0] == -1)
+                         ? ext_shape.Dims(0) - start_b
+                         : start_b + op_params.size[0];
+  const int start_h = begin_count < 3 ? 0 : op_params.begin[begin_count - 3];
+  const int stop_h = (size_count < 3 || op_params.size[size_count - 3] == -1)
+                         ? ext_shape.Dims(1) - start_h
+                         : start_h + op_params.size[size_count - 3];
+  const int start_w = begin_count < 2 ? 0 : op_params.begin[begin_count - 2];
+  const int stop_w = (size_count < 2 || op_params.size[size_count - 2] == -1)
+                         ? ext_shape.Dims(2) - start_w
+                         : start_w + op_params.size[size_count - 2];
+  const int start_d = begin_count < 1 ? 0 : op_params.begin[begin_count - 1];
+  const int stop_d = (size_count < 1 || op_params.size[size_count - 1] == -1)
+                         ? ext_shape.Dims(3) - start_d
+                         : start_d + op_params.size[size_count - 1];
 
   T* out_ptr = output_data;
   for (int in_b = start_b; in_b < stop_b; ++in_b) {
     for (int in_h = start_h; in_h < stop_h; ++in_h) {
       for (int in_w = start_w; in_w < stop_w; ++in_w) {
         for (int in_d = start_d; in_d < stop_d; ++in_d) {
-          *out_ptr++ = input_data[Offset(input_dims, in_d, in_w, in_h, in_b)];
+          *out_ptr++ = input_data[Offset(ext_shape, in_b, in_h, in_w, in_d)];
         }
       }
     }
@@ -3523,6 +3747,22 @@
 }
 
 template <typename T>
+inline void Slice(const T* input_data, const Dims<4>& input_dims,
+                  const std::vector<int>& begin, const std::vector<int>& size,
+                  T* output_data, const Dims<4>& output_dims) {
+  tflite::SliceParams op_params;
+  op_params.begin_count = 4;
+  op_params.size_count = 4;
+  for (int i = 0; i < 4; ++i) {
+    op_params.begin[i] = begin[3 - i];
+    op_params.size[i] = size[3 - i];
+  }
+
+  Slice(op_params, DimsToShape(input_dims), input_data,
+        DimsToShape(output_dims), output_data);
+}
+
+template <typename T>
 inline void Exp(const T* input_data, const size_t num_elements,
                 T* output_data) {
   for (size_t idx = 0; idx < num_elements; ++idx) {
@@ -3790,10 +4030,10 @@
 }
 
 template <typename T>
-void TensorFlowMinimum(const T* input1_data, const Dims<4>& input1_dims,
-                       const T* input2_data, T* output_data,
-                       const Dims<4>& output_dims) {
-  const int flat_size = MatchingFlatSize(output_dims, input1_dims);
+void Minimum(const RuntimeShape& input1_shape, const T* input1_data,
+             const T* input2_data, const RuntimeShape& output_shape,
+             T* output_data) {
+  const int flat_size = MatchingFlatSize(input1_shape, output_shape);
 
   auto min_value = input2_data[0];
   for (int i = 0; i < flat_size; i++) {
@@ -3802,10 +4042,10 @@
 }
 
 template <typename T>
-void TensorFlowMaximum(const T* input1_data, const Dims<4>& input1_dims,
-                       const T* input2_data, T* output_data,
-                       const Dims<4>& output_dims) {
-  const int flat_size = MatchingFlatSize(output_dims, input1_dims);
+void Maximum(const RuntimeShape& input1_shape, const T* input1_data,
+             const T* input2_data, const RuntimeShape& output_shape,
+             T* output_data) {
+  const int flat_size = MatchingFlatSize(input1_shape, output_shape);
 
   auto max_value = input2_data[0];
   for (int i = 0; i < flat_size; i++) {
@@ -3813,6 +4053,22 @@
   }
 }
 
+template <typename T>
+void TensorFlowMinimum(const T* input1_data, const Dims<4>& input1_dims,
+                       const T* input2_data, T* output_data,
+                       const Dims<4>& output_dims) {
+  Minimum(DimsToShape(input1_dims), input1_data, input2_data,
+          DimsToShape(output_dims), output_data);
+}
+
+template <typename T>
+void TensorFlowMaximum(const T* input1_data, const Dims<4>& input1_dims,
+                       const T* input2_data, T* output_data,
+                       const Dims<4>& output_dims) {
+  Maximum(DimsToShape(input1_dims), input1_data, input2_data,
+          DimsToShape(output_dims), output_data);
+}
+
 template <typename T, typename Op>
 void TensorFlowMaximumMinimum(const T* input1_data, const Dims<4>& input1_dims,
                               const T* input2_data, const Dims<4>& input2_dims,
diff --git a/tensorflow/contrib/lite/kernels/internal/softmax_quantized_test.cc b/tensorflow/contrib/lite/kernels/internal/softmax_quantized_test.cc
index a7dad3c..ca94e77 100644
--- a/tensorflow/contrib/lite/kernels/internal/softmax_quantized_test.cc
+++ b/tensorflow/contrib/lite/kernels/internal/softmax_quantized_test.cc
@@ -27,6 +27,7 @@
 #include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
 #include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
 #include "tensorflow/contrib/lite/kernels/internal/test_util.h"
+#include "tensorflow/contrib/lite/string.h"
 
 namespace tflite {
 namespace {
diff --git a/tensorflow/contrib/lite/kernels/internal/tensor_utils_test.cc b/tensorflow/contrib/lite/kernels/internal/tensor_utils_test.cc
index 372a6ef..e8343f1 100644
--- a/tensorflow/contrib/lite/kernels/internal/tensor_utils_test.cc
+++ b/tensorflow/contrib/lite/kernels/internal/tensor_utils_test.cc
@@ -72,7 +72,7 @@
   static float input[kVectorSize] = {-640, -635.0, -630, 10.0,  2.0,
                                      -5.0, -10.0,  0.0,  1000.0};
 
-  int8 output[kVectorSize];
+  int8_t output[kVectorSize];
   float min, max, scaling_factor;
   SymmetricQuantizeFloats(input, kVectorSize, output, &min, &max,
                           &scaling_factor);
@@ -89,7 +89,7 @@
   constexpr int kVectorSize = 9;
   static float input[kVectorSize] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
 
-  int8 output[kVectorSize];
+  int8_t output[kVectorSize];
   float min, max, scaling_factor;
   SymmetricQuantizeFloats(input, kVectorSize, output, &min, &max,
                           &scaling_factor);
@@ -105,7 +105,7 @@
   static float input[kVectorSize] = {-1e-5, 3e-5, -7e-6, -9e-5, 1e-6,
                                      4e-5,  9e-6, 2e-4,  0};
 
-  int8 output[kVectorSize];
+  int8_t output[kVectorSize];
   float min, max, scaling_factor;
   SymmetricQuantizeFloats(input, kVectorSize, output, &min, &max,
                           &scaling_factor);
@@ -143,6 +143,7 @@
                                                -1., 3., 7., 3., 23., 3.})));
 }
 
+#ifdef __ANDROID__
 TEST(uKernels, MatrixBatchVectorMultiplyAccumulateSymmetricQuantizedTest) {
   // Note we use 29 columns as this exercises all the neon kernel: the
   // 16-block SIMD code, the 8-block postamble, and the leftover postamble.
@@ -166,13 +167,13 @@
       -13.13, 14.14, -15.15, 16.16, -17.17, 18.18, -19.19, 20.2, -21.21, 22.22,
       -23.23, 24.24, -25.25, 26.26, -27.27, 28.28, 0};
 
-  int8* a_int8_data = reinterpret_cast<int8*>(
+  int8_t* a_int8_data = reinterpret_cast<int8_t*>(
       aligned_malloc(a_rows * a_cols, kWeightsPerUint32));
   float a_min, a_max;
   float scaling_factor_a;
   SymmetricQuantizeFloats(a_float_data, a_rows * a_cols, a_int8_data, &a_min,
                           &a_max, &scaling_factor_a);
-  const int8 expected_a_int8_data[] = {
+  const int8_t expected_a_int8_data[] = {
       /* 1st row */
       5,
       10,
@@ -363,7 +364,7 @@
   };
 
   // Quantized values of B:
-  int8 b_int8_data[b_rows * b_cols * batches];
+  int8_t b_int8_data[b_rows * b_cols * batches];
   float b_min, b_max;
   float scaling_factor_b[batches];
   SymmetricQuantizeFloats(b_float_data, b_rows * b_cols, b_int8_data, &b_min,
@@ -372,7 +373,7 @@
                           &b_int8_data[b_rows * b_cols], &b_min, &b_max,
                           &scaling_factor_b[1]);
 
-  const int8 expected_b_int8_data[] = {
+  const int8_t expected_b_int8_data[] = {
       /* batch 1 */
       127,
       -127,
@@ -465,6 +466,7 @@
 
   aligned_free(a_int8_data);
 }
+#endif  // __ANDROID__
 
 TEST(uKernels, VectorVectorCwiseProductTest) {
   constexpr int kVectorSize = 10;
diff --git a/tensorflow/contrib/lite/kernels/internal/types.h b/tensorflow/contrib/lite/kernels/internal/types.h
index c44698b..204df9a 100644
--- a/tensorflow/contrib/lite/kernels/internal/types.h
+++ b/tensorflow/contrib/lite/kernels/internal/types.h
@@ -129,6 +129,13 @@
     }
   }
 
+  RuntimeShape(int shape_size, int32 value) : size_(0) {
+    Resize(shape_size);
+    for (int i = 0; i < shape_size; ++i) {
+      SetDim(i, value);
+    }
+  }
+
   RuntimeShape(int dimensions_count, const int32* dims_data) : size_(0) {
     ReplaceWith(dimensions_count, dims_data);
   }
@@ -237,7 +244,7 @@
   bool operator!=(const RuntimeShape& comp) const { return !((*this) == comp); }
 
  private:
-  // For use only by ExtendFrom(), written to guarantee (return-value) copy
+  // For use only by ExtendedShape(), written to guarantee (return-value) copy
   // elision in C++17.
   // This creates a shape padded to the desired size with the specified value.
   RuntimeShape(int new_shape_size, const RuntimeShape& shape, int pad_value)
@@ -645,22 +652,6 @@
   }
 }
 
-struct PoolParams {
-  FusedActivationFunctionType activation;
-  PaddingType padding_type;
-  PaddingValues padding_values;
-  int stride_height;
-  int stride_width;
-  int filter_height;
-  int filter_width;
-  // uint8, etc, activation params.
-  int32 quantized_activation_min;
-  int32 quantized_activation_max;
-  // float activation params.
-  float float_activation_min;
-  float float_activation_max;
-};
-
 enum class BroadcastableOpCategory : uint8 {
   kNone,
   kNonBroadcast,               // Matching input shapes.
@@ -669,6 +660,19 @@
   kGenericBroadcast,           // Fall-back.
 };
 
+struct MinMax {
+  float min;
+  float max;
+};
+static_assert(sizeof(MinMax) == 8, "");
+
+struct ActivationParams {
+  FusedActivationFunctionType activation_type;
+  // Quantized inference params.
+  int32 activation_min;
+  int32 activation_max;
+};
+
 // For Add, Sub, Mul ops.
 struct ArithmeticParams {
   // Shape dependent / common to data / op types.
@@ -704,6 +708,206 @@
   int broadcast_shape[5];
 };
 
+struct ConcatenationParams {
+  int8 axis;
+};
+
+struct ComparisonParams {
+  // uint8 inference params.
+  int left_shift;
+  int32 input0_offset;
+  int32 input0_multiplier;
+  int input0_shift;
+  int32 input1_offset;
+  int32 input1_multiplier;
+  int input1_shift;
+  // Shape dependent / common to inference types.
+  bool is_broadcast;
+};
+
+struct ConvParams {
+  PaddingType padding_type;
+  PaddingValues padding_values;
+  // TODO(starka): This was just "stride", so check that width+height is OK.
+  int8 stride_width;
+  int8 stride_height;
+  int8 dilation_width_factor;
+  int8 dilation_height_factor;
+  // uint8 inference params.
+  // TODO(b/65838351): Use smaller types if appropriate.
+  int32 input_offset;
+  int32 weights_offset;
+  int32 output_offset;
+  int32 output_multiplier;
+  int output_shift;
+  int32 output_activation_min;
+  int32 output_activation_max;
+};
+
+struct DepthToSpaceParams {
+  int16 block_size;
+};
+
+struct DepthwiseParams {
+  PaddingType padding_type;
+  PaddingValues padding_values;
+  int8 stride;
+  int8 depth_multiplier;
+  // uint8 inference params.
+  // TODO(b/65838351): Use smaller types if appropriate.
+  int32 input_offset;
+  int32 weights_offset;
+  int32 output_offset;
+  int32 output_multiplier;
+  int output_shift;
+  int32 output_activation_min;
+  int32 output_activation_max;
+};
+
+struct FakeQuantParams {
+  MinMax minmax;
+  int32 num_bits;
+};
+
+struct FullyConnectedParams {
+  // uint8 inference params.
+  // TODO(b/65838351): Use smaller types if appropriate.
+  int32 input_offset;
+  int32 weights_offset;
+  int32 output_offset;
+  int32 output_multiplier;
+  int output_shift;
+  int32 output_activation_min;
+  int32 output_activation_max;
+  FullyConnectedWeightsFormat weights_format;
+};
+
+struct GatherParams {
+  int8 input_rank;
+  int16 axis;
+};
+
+struct L2NormalizationParams {
+  // uint8 inference params.
+  int32 input_zero_point;
+};
+
+struct LocalResponseNormalizationParams {
+  int32 range;
+  double bias;
+  double alpha;
+  double beta;
+};
+
+struct LogisticParams {
+  // uint8 inference params.
+  int32 input_zero_point;
+  int32 input_range_radius;
+  int32 input_multiplier;
+  int input_left_shift;
+};
+
+struct LstmCellParams {
+  int32 weights_zero_point;
+  int32 accum_multiplier;
+  int accum_shift;
+  int state_integer_bits;
+};
+
+struct MeanParams {
+  int8 axis_count;
+  int16 axis[4];
+};
+
+struct PadParams {
+  int8 left_padding_count;
+  int32 left_padding[4];
+  int8 right_padding_count;
+  int32 right_padding[4];
+};
+
+struct PoolParams {
+  FusedActivationFunctionType activation;
+  PaddingType padding_type;
+  PaddingValues padding_values;
+  int stride_height;
+  int stride_width;
+  int filter_height;
+  int filter_width;
+  // uint8, etc, activation params.
+  int32 quantized_activation_min;
+  int32 quantized_activation_max;
+  // float activation params.
+  float float_activation_min;
+  float float_activation_max;
+};
+
+struct ReshapeParams {
+  int8 shape_count;
+  int32 shape[4];
+};
+
+struct ResizeBilinearParams {
+  bool align_corners;
+};
+
+struct SliceParams {
+  int8 begin_count;
+  int32 begin[4];
+  int8 size_count;
+  int32 size[4];
+};
+
+struct SoftmaxParams {
+  // beta is not really used (not a Tensorflow parameter) and not implemented
+  // for LogSoftmax.
+  double beta;
+  // uint8 inference params.  Used even when beta defaults to 1.0.
+  int32 input_beta_multiplier;
+  int32 input_beta_left_shift;
+  // Reverse scaling is only used by LogSoftmax.
+  int32 reverse_scaling_divisor;
+  int32 reverse_scaling_right_shift;
+  int diff_min;
+};
+
+struct SpaceToDepthParams {
+  int16 block_size;
+};
+
+struct SplitParams {
+  // Graphs that split into, say, 2000 nodes are encountered.  The indices in
+  // OperatorEdges are of type uint16.
+  uint16 num_split;
+};
+
+struct SqueezeParams {
+  int8 squeeze_dims_count;
+  int32 squeeze_dims[4];
+};
+
+struct StridedSliceParams {
+  int8 start_indices_count;
+  int16 start_indices[4];
+  int8 stop_indices_count;
+  int16 stop_indices[4];
+  int8 strides_count;
+  int16 strides[4];
+
+  int16 begin_mask;
+  int16 ellipsis_mask;
+  int16 end_mask;
+  int16 new_axis_mask;
+  int16 shrink_axis_mask;
+};
+
+struct TanhParams {
+  int32 input_zero_point;
+  int32 input_range_radius;
+  int32 input_multiplier;
+  int input_left_shift;
+};
+
 template <typename T>
 inline void SetActivationParams(T min, T max, ArithmeticParams* params);
 
diff --git a/tensorflow/contrib/lite/kernels/mfcc.cc b/tensorflow/contrib/lite/kernels/mfcc.cc
index 3f5bc4d..dd388df 100644
--- a/tensorflow/contrib/lite/kernels/mfcc.cc
+++ b/tensorflow/contrib/lite/kernels/mfcc.cc
@@ -13,7 +13,7 @@
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/contrib/lite/kernels/internal/mfcc.h"
-#include "flatbuffers/flexbuffers.h"
+#include "include/flatbuffers/flexbuffers.h"  // flatbuffers
 #include "tensorflow/contrib/lite/builtin_op_data.h"
 #include "tensorflow/contrib/lite/context.h"
 #include "tensorflow/contrib/lite/kernels/internal/mfcc_dct.h"
diff --git a/tensorflow/contrib/lite/kernels/mfcc_test.cc b/tensorflow/contrib/lite/kernels/mfcc_test.cc
index 0291ca8..69aa196 100644
--- a/tensorflow/contrib/lite/kernels/mfcc_test.cc
+++ b/tensorflow/contrib/lite/kernels/mfcc_test.cc
@@ -18,7 +18,7 @@
 #include <vector>
 
 #include <gtest/gtest.h>
-#include "flatbuffers/flexbuffers.h"
+#include "include/flatbuffers/flexbuffers.h"  // flatbuffers
 #include "tensorflow/contrib/lite/interpreter.h"
 #include "tensorflow/contrib/lite/kernels/register.h"
 #include "tensorflow/contrib/lite/kernels/test_util.h"
diff --git a/tensorflow/contrib/lite/kernels/mul.cc b/tensorflow/contrib/lite/kernels/mul.cc
index 349f3e6..561e39c 100644
--- a/tensorflow/contrib/lite/kernels/mul.cc
+++ b/tensorflow/contrib/lite/kernels/mul.cc
@@ -93,7 +93,6 @@
         input1->params.scale * input2->params.scale / output->params.scale;
     QuantizeMultiplierSmallerThanOneExp(
         real_multiplier, &data->output_multiplier, &data->output_shift);
-    data->output_shift *= -1;
   }
 
   return context->ResizeTensor(context, output, output_size);
@@ -161,9 +160,9 @@
     // The quantized version of Mul doesn't support activations, so we
     // always use BroadcastMul.
     if (kernel_type == kReference) {
-      TF_LITE_MUL(reference_ops, BroadcastMul);
+      TF_LITE_MUL(reference_ops, BroadcastMul4DSlow);
     } else {
-      TF_LITE_MUL(optimized_ops, BroadcastMul);
+      TF_LITE_MUL(optimized_ops, BroadcastMul4DSlow);
     }
 #undef TF_LITE_MUL
   } else if (input1->type == kTfLiteInt16 && input2->type == kTfLiteInt16 &&
diff --git a/tensorflow/contrib/lite/kernels/register.cc b/tensorflow/contrib/lite/kernels/register.cc
index 8d2c108..9681b90 100644
--- a/tensorflow/contrib/lite/kernels/register.cc
+++ b/tensorflow/contrib/lite/kernels/register.cc
@@ -14,6 +14,7 @@
 ==============================================================================*/
 
 #include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/util.h"
 
 namespace tflite {
 namespace ops {
@@ -127,9 +128,9 @@
 
 const TfLiteRegistration* BuiltinOpResolver::FindOp(const char* op,
                                                     int version) const {
-  // Return the NULL Op for all ops whose name start with "Eager:", allowing
+  // Return the NULL Op for all ops whose names start with "Eager", allowing
   // the interpreter to delegate their execution.
-  if (string(op).find("Eager:") == 0) {
+  if (IsEagerOp(op)) {
     static TfLiteRegistration null_op{
         nullptr, nullptr, &UnsupportedTensorFlowOp,
         nullptr, nullptr, BuiltinOperator_CUSTOM,
diff --git a/tensorflow/contrib/lite/model.cc b/tensorflow/contrib/lite/model.cc
index 9edf5ba..7b9413c 100644
--- a/tensorflow/contrib/lite/model.cc
+++ b/tensorflow/contrib/lite/model.cc
@@ -26,6 +26,9 @@
 #ifndef TFLITE_MCU
 #include "tensorflow/contrib/lite/nnapi_delegate.h"
 #endif
+#if defined(TFLITE_EXTENDED)
+#include "tensorflow/contrib/lite/delegates/eager/delegate.h"
+#endif
 #include "tensorflow/contrib/lite/version.h"
 
 namespace tflite {
@@ -1040,6 +1043,14 @@
   }
   (**interpreter).SetVariables(std::move(variables));
 
+#if defined(TFLITE_EXTENDED)
+  if (auto delegate = EagerDelegate::Create()) {
+    (**interpreter)
+        .ModifyGraphWithDelegate(std::move(delegate),
+                                 /*allow_dynamic_tensors=*/true);
+  }
+#endif
+
   return kTfLiteOk;
 }
 
diff --git a/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h b/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
index becd1f6..42b8163 100644
--- a/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
+++ b/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
@@ -44,6 +44,19 @@
   return handle;
 }
 
+typedef int (*ASharedMemory_create_fn)(const char* name, size_t size);
+
+// ASharedMemory_create was added in Android 8.0, so safe to use with NNAPI
+// which was added in 8.1.
+inline int ASharedMemory_create(const char* name, size_t size) {
+  static void* handle = loadLibrary("libandroid.so");
+  static ASharedMemory_create_fn fn =
+      handle != nullptr ? reinterpret_cast<ASharedMemory_create_fn>(
+                              dlsym(handle, "ASharedMemory_create"))
+                        : nullptr;
+  return fn(name, size);
+}
+
 inline void* getLibraryHandle() {
   static void* handle = loadLibrary("libneuralnetworks.so");
   return handle;
diff --git a/tensorflow/contrib/lite/nnapi_delegate.cc b/tensorflow/contrib/lite/nnapi_delegate.cc
index 13325a8..45c92a8 100644
--- a/tensorflow/contrib/lite/nnapi_delegate.cc
+++ b/tensorflow/contrib/lite/nnapi_delegate.cc
@@ -24,20 +24,27 @@
 #include "tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h"
 
 #ifdef __ANDROID__
+#include <android/log.h>
 #include <sys/system_properties.h>
 #endif
 
 namespace tflite {
 
 void logError(const char* format, ...) {
-  // TODO(mikie): use android logging, stderr is not captured for Java
-  // applications
-  va_list args;
-  va_start(args, format);
-  vfprintf(stderr, format, args);
-  va_end(args);
+  // stderr is convenient for native tests, but is not captured for apps
+  va_list args_for_stderr;
+  va_start(args_for_stderr, format);
+  vfprintf(stderr, format, args_for_stderr);
+  va_end(args_for_stderr);
   fprintf(stderr, "\n");
   fflush(stderr);
+#ifdef __ANDROID__
+  // produce logcat output for general consumption
+  va_list args_for_log;
+  va_start(args_for_log, format);
+  __android_log_vprint(ANDROID_LOG_ERROR, "tflite", format, args_for_log);
+  va_end(args_for_log);
+#endif
 }
 
 #define FATAL(...)       \
@@ -564,8 +571,14 @@
         nn_op_type = ANEURALNETWORKS_L2_NORMALIZATION;
         if (reinterpret_cast<TfLiteL2NormParams*>(node.builtin_data)
                 ->activation != kTfLiteActNone) {
-          FATAL(
+          logError(
               "NNAPI does not support L2Normalization with fused activations");
+          return kTfLiteError;
+        }
+        if ((node.inputs->size > 0) &&
+            (interpreter->tensor(node.inputs->data[0])->dims->size != 4)) {
+          logError("NNAPI only supports input rank 4 for L2Normalization");
+          return kTfLiteError;
         }
         break;
       case tflite::BuiltinOperator_HASHTABLE_LOOKUP:
diff --git a/tensorflow/contrib/lite/python/BUILD b/tensorflow/contrib/lite/python/BUILD
index 860aff9..47f0c8e 100644
--- a/tensorflow/contrib/lite/python/BUILD
+++ b/tensorflow/contrib/lite/python/BUILD
@@ -112,8 +112,11 @@
     visibility = ["//visibility:public"],
     deps = [
         "//tensorflow/contrib/framework:framework_py",
+        "//tensorflow/contrib/graph_editor:graph_editor_py",
         "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:framework",
         "//tensorflow/python:platform",
+        "//tensorflow/python:util",
     ],
 )
 
diff --git a/tensorflow/contrib/lite/python/convert.py b/tensorflow/contrib/lite/python/convert.py
index ec49738..11d4bdb 100644
--- a/tensorflow/contrib/lite/python/convert.py
+++ b/tensorflow/contrib/lite/python/convert.py
@@ -54,7 +54,7 @@
   """Convert `input_data_str` according to model and toco parameters.
 
   Unless you know what you are doing consider using
-  the more friendly @{tf.contrib.lite.toco_convert}}.
+  the more friendly `tf.contrib.lite.toco_convert`.
 
   Args:
     model_flags_str: Serialized proto describing model properties, see
diff --git a/tensorflow/contrib/lite/python/convert_test.py b/tensorflow/contrib/lite/python/convert_test.py
index dc21a9b..bc05514 100644
--- a/tensorflow/contrib/lite/python/convert_test.py
+++ b/tensorflow/contrib/lite/python/convert_test.py
@@ -113,12 +113,13 @@
       # and 1 final output).
       self.assertEqual(self._countIdentities(sess.graph_def.node), 4)
 
-      stubbed_graphdef = op_hint.convert_op_hints_to_stubs(sess)
+      stubbed_graphdef = op_hint.convert_op_hints_to_stubs(
+          graph_def=sess.graph_def)
 
       self.assertCountEqual(
           self._getGraphOpTypes(
               stubbed_graphdef,
-              output_nodes=[op_hint._tensor_name_base(output)]),
+              output_nodes=[op_hint._tensor_name_base(output.name)]),
           ["cool_activation", "Const", "Identity"])
 
   def testScaleAndBiasAndIdentity(self):
@@ -139,12 +140,13 @@
       # +1 for the final output
       self.assertEqual(self._countIdentities(sess.graph_def.node), 6)
 
-      stubbed_graphdef = op_hint.convert_op_hints_to_stubs(sess)
+      stubbed_graphdef = op_hint.convert_op_hints_to_stubs(
+          graph_def=sess.graph_def)
 
       self.assertCountEqual(
           self._getGraphOpTypes(
               stubbed_graphdef,
-              output_nodes=[op_hint._tensor_name_base(output)]),
+              output_nodes=[op_hint._tensor_name_base(output.name)]),
           ["scale_and_bias_and_identity", "Const", "Identity", "Pack"])
 
   def testTwoFunctions(self):
@@ -153,7 +155,7 @@
     b = array_ops.constant([1.])
     def _double_values(x):
       custom = op_hint.OpHint("add_test")
-      x = custom.add_inputs(x)
+      x, = custom.add_inputs(x)
       output = math_ops.multiply(x, x)
       output, = custom.add_outputs(output)
       return output
@@ -164,13 +166,90 @@
       # make sure one identity for each input (2) and output (2) => 2 + 2
       # +1 for the final output
       self.assertEqual(self._countIdentities(sess.graph_def.node), 5)
-      stubbed_graphdef = op_hint.convert_op_hints_to_stubs(sess)
+      stubbed_graphdef = op_hint.convert_op_hints_to_stubs(
+          graph_def=sess.graph_def)
       self.assertCountEqual(
           self._getGraphOpTypes(
               stubbed_graphdef,
-              output_nodes=[op_hint._tensor_name_base(output)]),
+              output_nodes=[op_hint._tensor_name_base(output.name)]),
           ["add_test", "Const", "Identity", "Add"])
 
+  def _get_input_index(self, x):
+    return x.op.node_def.attr[op_hint.OpHint.FUNCTION_INPUT_INDEX_ATTR].i
+
+  def _get_output_index(self, x):
+    return x.op.node_def.attr[op_hint.OpHint.FUNCTION_OUTPUT_INDEX_ATTR].i
+
+  def _get_sort_index(self, x):
+    return x.op.node_def.attr[op_hint.OpHint.FUNCTION_SORT_INDEX_ATTR].i
+
+  def testTags(self):
+    """Test if multiple args with the same tag are grouped."""
+    a = array_ops.constant([1.])
+    b = array_ops.constant([2.])
+    c = array_ops.constant([3.])
+    d = array_ops.constant([4.])
+    custom = op_hint.OpHint("test_tag")
+    a = custom.add_input(a, tag="mytag",
+                         aggregate=op_hint.OpHint.AGGREGATE_STACK)
+    b, = custom.add_inputs(b)
+    c = custom.add_input(c, tag="mytag",
+                         aggregate=op_hint.OpHint.AGGREGATE_STACK)
+    d = custom.add_input(d, tag="mytag2",
+                         aggregate=op_hint.OpHint.AGGREGATE_STACK)
+    res = math_ops.add(math_ops.mul(a, b), math_ops.mul(c, b))
+    custom.add_outputs([res])
+    with self.test_session():
+      self.assertEqual(self._get_input_index(a), 0)
+      self.assertEqual(self._get_sort_index(a), 0)
+      self.assertEqual(self._get_input_index(b), 1)
+      self.assertEqual(self._get_input_index(c), 0)
+      self.assertEqual(self._get_sort_index(c), 1)
+
+  def testOverrideIndex(self):
+    a = array_ops.constant([1.])
+    b = array_ops.constant([2.])
+    c = array_ops.constant([3.])
+    custom = op_hint.OpHint("test_override")
+    b = custom.add_input(b)  # should auto assign 0
+    a = custom.add_input(a, index_override=1)
+    c = custom.add_input(c)  # should auto assign 2
+    with self.test_session():
+      self.assertEqual(self._get_input_index(a), 1)
+      self.assertEqual(self._get_input_index(b), 0)
+      self.assertEqual(self._get_input_index(c), 2)
+
+  def testAggregate(self):
+    a = array_ops.constant([3., 4.])
+    b = array_ops.constant([5., 6.])
+    hint = op_hint.OpHint("agg")
+    a0, a1 = array_ops.unstack(a)
+    b0, b1 = array_ops.unstack(b)
+
+    a0 = hint.add_input(a0, tag="c", aggregate=op_hint.OpHint.AGGREGATE_STACK)
+    b0 = hint.add_input(b0, tag="n", aggregate=op_hint.OpHint.AGGREGATE_STACK)
+    a1 = hint.add_input(a1, tag="c", aggregate=op_hint.OpHint.AGGREGATE_STACK)
+    b1 = hint.add_input(b1, tag="n", aggregate=op_hint.OpHint.AGGREGATE_STACK)
+
+    c0 = math_ops.add(a0, b0, name="addleft")
+    c1 = math_ops.add(a1, b1, name="addright")
+    c0 = hint.add_output(
+        c0, tag="out", aggregate=op_hint.OpHint.AGGREGATE_STACK)
+    c1 = hint.add_output(
+        c1, tag="out", aggregate=op_hint.OpHint.AGGREGATE_STACK)
+
+    curr = array_ops.stack([c0, c1])
+    output = array_ops.identity(curr, name="FINAL_OUTPUT")
+    with self.test_session() as sess:
+      stubbed_graphdef = op_hint.convert_op_hints_to_stubs(
+          graph_def=sess.graph_def)
+      print(stubbed_graphdef)
+      self.assertCountEqual(
+          self._getGraphOpTypes(
+              stubbed_graphdef,
+              output_nodes=[op_hint._tensor_name_base(output.name)]),
+          ["agg", "Const", "Identity"])
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/lite/python/interpreter.py b/tensorflow/contrib/lite/python/interpreter.py
index 3243bdd..1be61fe 100644
--- a/tensorflow/contrib/lite/python/interpreter.py
+++ b/tensorflow/contrib/lite/python/interpreter.py
@@ -54,6 +54,10 @@
       if not self._interpreter:
         raise ValueError('Failed to open {}'.format(model_path))
     elif model_content and not model_path:
+      # Take a reference, so the pointer remains valid.
+      # Since Python strings are immutable, the PyString_XX functions
+      # will always return the same pointer.
+      self._model_content = model_content
       self._interpreter = (
           _interpreter_wrapper.InterpreterWrapper_CreateWrapperCPPFromBuffer(
               model_content))
diff --git a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h
index 3e03751..641dd93 100644
--- a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h
+++ b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h
@@ -15,12 +15,15 @@
 #ifndef TENSORFLOW_CONTRIB_LITE_PYTHON_INTERPRETER_WRAPPER_INTERPRETER_WRAPPER_H_
 #define TENSORFLOW_CONTRIB_LITE_PYTHON_INTERPRETER_WRAPPER_INTERPRETER_WRAPPER_H_
 
-// Place `<locale>` before <Python.h> to avoid build failures in macOS.
-#include <locale>
 #include <memory>
 #include <string>
 #include <vector>
 
+// Place `<locale>` before <Python.h> to avoid build failures in macOS.
+#include <locale>
+
+// The empty line above is on purpose as otherwise clang-format will
+// automatically move <Python.h> before <locale>.
 #include <Python.h>
 
 // We forward declare TFLite classes here to avoid exposing them to SWIG.
diff --git a/tensorflow/contrib/lite/python/lite.py b/tensorflow/contrib/lite/python/lite.py
index 2f9b9d4..5ec5203 100644
--- a/tensorflow/contrib/lite/python/lite.py
+++ b/tensorflow/contrib/lite/python/lite.py
@@ -53,8 +53,8 @@
 from tensorflow.python import keras as _keras
 from tensorflow.python.client import session as _session
 from tensorflow.python.framework import graph_util as _tf_graph_util
+from tensorflow.python.framework import ops as _ops
 from tensorflow.python.framework.importer import import_graph_def as _import_graph_def
-from tensorflow.python.ops.variables import global_variables_initializer as _global_variables_initializer
 from tensorflow.python.saved_model import signature_constants as _signature_constants
 from tensorflow.python.saved_model import tag_constants as _tag_constants
 
@@ -194,42 +194,41 @@
         The graph is not frozen.
         input_arrays or output_arrays contains an invalid tensor name.
     """
-    with _session.Session() as sess:
-      sess.run(_global_variables_initializer())
-
-      # Read GraphDef from file.
-      graph_def = _graph_pb2.GraphDef()
-      with open(graph_def_file, "rb") as f:
-        file_content = f.read()
-      try:
-        graph_def.ParseFromString(file_content)
-      except (_text_format.ParseError, DecodeError):
+    with _ops.Graph().as_default():
+      with _session.Session() as sess:
+        # Read GraphDef from file.
+        graph_def = _graph_pb2.GraphDef()
+        with open(graph_def_file, "rb") as f:
+          file_content = f.read()
         try:
-          print("Ignore 'tcmalloc: large alloc' warnings.")
-
-          if not isinstance(file_content, str):
-            if PY3:
-              file_content = file_content.decode('utf-8')
-            else:
-              file_content = file_content.encode('utf-8')
-          _text_format.Merge(file_content, graph_def)
+          graph_def.ParseFromString(file_content)
         except (_text_format.ParseError, DecodeError):
-          raise ValueError(
-              "Unable to parse input file '{}'.".format(graph_def_file))
-      sess.graph.as_default()
-      _import_graph_def(graph_def, name="")
+          try:
+            print("Ignore 'tcmalloc: large alloc' warnings.")
 
-      # Get input and output tensors.
-      input_tensors = _get_tensors_from_tensor_names(sess.graph, input_arrays)
-      output_tensors = _get_tensors_from_tensor_names(sess.graph, output_arrays)
-      _set_tensor_shapes(input_tensors, input_shapes)
+            if not isinstance(file_content, str):
+              if PY3:
+                file_content = file_content.decode("utf-8")
+              else:
+                file_content = file_content.encode("utf-8")
+            _text_format.Merge(file_content, graph_def)
+          except (_text_format.ParseError, DecodeError):
+            raise ValueError(
+                "Unable to parse input file '{}'.".format(graph_def_file))
+        _import_graph_def(graph_def, name="")
 
-      # Check if graph is frozen.
-      if not _is_frozen_graph(sess):
-        raise ValueError("Please freeze the graph using freeze_graph.py.")
+        # Get input and output tensors.
+        input_tensors = _get_tensors_from_tensor_names(sess.graph, input_arrays)
+        output_tensors = _get_tensors_from_tensor_names(sess.graph,
+                                                        output_arrays)
+        _set_tensor_shapes(input_tensors, input_shapes)
 
-      # Create TocoConverter class.
-      return cls(sess.graph_def, input_tensors, output_tensors)
+        # Check if graph is frozen.
+        if not _is_frozen_graph(sess):
+          raise ValueError("Please freeze the graph using freeze_graph.py.")
+
+        # Create TocoConverter class.
+        return cls(sess.graph_def, input_tensors, output_tensors)
 
   @classmethod
   def from_saved_model(cls,
@@ -427,7 +426,6 @@
     Frozen GraphDef.
   """
   if not _is_frozen_graph(sess):
-    sess.run(_global_variables_initializer())
     output_arrays = [_tensor_name(tensor) for tensor in output_tensors]
     return _tf_graph_util.convert_variables_to_constants(
         sess, sess.graph_def, output_arrays)
diff --git a/tensorflow/contrib/lite/python/lite_test.py b/tensorflow/contrib/lite/python/lite_test.py
index ca2af5a..2f13684 100644
--- a/tensorflow/contrib/lite/python/lite_test.py
+++ b/tensorflow/contrib/lite/python/lite_test.py
@@ -33,6 +33,7 @@
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops.variables import global_variables_initializer as _global_variables_initializer
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.saved_model import saved_model
@@ -198,6 +199,7 @@
         'weights', shape=[1, 16, 16, 3], dtype=dtypes.float32)
     out_tensor = in_tensor + var
     sess = session.Session()
+    sess.run(_global_variables_initializer())
 
     # Convert model and ensure model is not None.
     converter = lite.TocoConverter.from_session(sess, [in_tensor], [out_tensor])
@@ -655,9 +657,7 @@
     tflite_model = converter.convert()
     self.assertTrue(tflite_model)
 
-    os.remove(keras_file)
-
-    # Check values from converted model.
+    # Check tensor details of converted model.
     interpreter = Interpreter(model_content=tflite_model)
     interpreter.allocate_tensors()
 
@@ -675,6 +675,18 @@
     self.assertTrue(([1, 3, 3] == output_details[0]['shape']).all())
     self.assertEqual((0., 0.), output_details[0]['quantization'])
 
+    # Check inference of converted model.
+    input_data = np.array([[1, 2, 3]], dtype=np.float32)
+    interpreter.set_tensor(input_details[0]['index'], input_data)
+    interpreter.invoke()
+    tflite_result = interpreter.get_tensor(output_details[0]['index'])
+
+    keras_model = keras.models.load_model(keras_file)
+    keras_result = keras_model.predict(input_data)
+
+    np.testing.assert_almost_equal(tflite_result, keras_result, 5)
+    os.remove(keras_file)
+
   def testSequentialModelInputArray(self):
     """Test a Sequential tf.keras model testing input arrays argument."""
     keras_file = self._getSequentialModel()
@@ -755,17 +767,17 @@
 
     model.predict(x)
     fd, keras_file = tempfile.mkstemp('.h5')
-    keras.models.save_model(model, keras_file)
+    try:
+      keras.models.save_model(model, keras_file)
+    finally:
+      os.close(fd)
 
     # Convert to TFLite model.
     converter = lite.TocoConverter.from_keras_model_file(keras_file)
     tflite_model = converter.convert()
     self.assertTrue(tflite_model)
 
-    os.close(fd)
-    os.remove(keras_file)
-
-    # Check values from converted model.
+    # Check tensor details of converted model.
     interpreter = Interpreter(model_content=tflite_model)
     interpreter.allocate_tensors()
 
@@ -783,6 +795,18 @@
     self.assertTrue(([1, 3] == output_details[0]['shape']).all())
     self.assertEqual((0., 0.), output_details[0]['quantization'])
 
+    # Check inference of converted model.
+    input_data = np.array([[1, 2, 3]], dtype=np.float32)
+    interpreter.set_tensor(input_details[0]['index'], input_data)
+    interpreter.invoke()
+    tflite_result = interpreter.get_tensor(output_details[0]['index'])
+
+    keras_model = keras.models.load_model(keras_file)
+    keras_result = keras_model.predict(input_data)
+
+    np.testing.assert_almost_equal(tflite_result, keras_result, 5)
+    os.remove(keras_file)
+
   def testFunctionalModelMultipleInputs(self):
     """Test a Functional tf.keras model with multiple inputs and outputs."""
     a = keras.layers.Input(shape=(3,), name='input_a')
@@ -865,17 +889,17 @@
 
     model.predict(x)
     fd, keras_file = tempfile.mkstemp('.h5')
-    keras.models.save_model(model, keras_file)
+    try:
+      keras.models.save_model(model, keras_file)
+    finally:
+      os.close(fd)
 
     # Convert to TFLite model.
     converter = lite.TocoConverter.from_keras_model_file(keras_file)
     tflite_model = converter.convert()
     self.assertTrue(tflite_model)
 
-    os.close(fd)
-    os.remove(keras_file)
-
-    # Check values from converted model.
+    # Check tensor details of converted model.
     interpreter = Interpreter(model_content=tflite_model)
     interpreter.allocate_tensors()
 
@@ -893,6 +917,18 @@
     self.assertTrue(([1, 3, 3] == output_details[0]['shape']).all())
     self.assertEqual((0., 0.), output_details[0]['quantization'])
 
+    # Check inference of converted model.
+    input_data = np.array([[1, 2, 3]], dtype=np.float32)
+    interpreter.set_tensor(input_details[0]['index'], input_data)
+    interpreter.invoke()
+    tflite_result = interpreter.get_tensor(output_details[0]['index'])
+
+    keras_model = keras.models.load_model(keras_file)
+    keras_result = keras_model.predict(input_data)
+
+    np.testing.assert_almost_equal(tflite_result, keras_result, 5)
+    os.remove(keras_file)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/lite/python/op_hint.py b/tensorflow/contrib/lite/python/op_hint.py
index 7908689..8c92013 100644
--- a/tensorflow/contrib/lite/python/op_hint.py
+++ b/tensorflow/contrib/lite/python/op_hint.py
@@ -25,9 +25,9 @@
   def tflite_cool_activation(input):
     # A cool activation function.
     custom = tf.contrib.lite.OpHint("cool_activation")
-    input = custom.add_inputs(input)
+    input, = custom.add_inputs(input)
     output = tf.sigmoid(input) * input
-    custom.add_outputs(output)
+    output, = custom.add_outputs(output)
     return output
 
   image = tf.placeholder(tf.float32, (1, 16, 16, 1))
@@ -64,18 +64,27 @@
 understood by toco later.
 """
 
+# TODO(aselle): Make this use generic graph transformations.
+# TODO(aselle): _tensor_name_base should be called _tensor_name_to_op_name.
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 import collections as _collections
-import itertools as _itertools
+import copy as _copy
 import uuid as _uuid
+import six as _six
 
-from tensorflow.contrib import framework as _framework
 from tensorflow.core.framework import attr_value_pb2 as _attr_value_pb2
+from tensorflow.core.framework import graph_pb2 as _graph_pb2
+from tensorflow.core.framework import node_def_pb2 as _node_def_pb2
 from tensorflow.python.framework import ops as _ops
+# TODO(aselle): publicize these apis if we continue to use these.
+from tensorflow.python.framework.graph_util_impl import _bfs_for_reachable_nodes
+from tensorflow.python.framework.graph_util_impl import _extract_graph_summary
 from tensorflow.python.ops import array_ops as _array_ops
+from tensorflow.python.util import compat as _compat
 from tensorflow.python.util.all_util import remove_undocumented
 
 
@@ -97,11 +106,174 @@
   constructs, this mechanism can be retired and changed to use python defun's.
   """
 
-  # Attr constants that are used for representation in the GraphDef
+  # Attr constants that are used for representation in the GraphDef. These
+  # will be used on every Identity op that is involved in a total OpHint.
+
+  # Name of the OpHint function (cosmetic).
   FUNCTION_NAME_ATTR = "_tflite_function_name"
+  # UUID of the function (each OpHint gets a new uuid).
   FUNCTION_UUID_ATTR = "_tflite_function_uuid"
+  # The input index of the input (or nothing if it is an output).
   FUNCTION_INPUT_INDEX_ATTR = "_tflite_function_input_index"
+  # The output index of the output (or nothing if it is an input).
   FUNCTION_OUTPUT_INDEX_ATTR = "_tflite_function_output_index"
+  # An index that orders aggregate arguments. Aggregate arguments are ones
+  # that are separate but will be fused horizontally. For example a static LSTM
+  # has a lstm cell for each time step. Each one has a separate opHint, but a
+  # fused SequentialLSTM will treat this as a single tensor.
+  FUNCTION_SORT_INDEX_ATTR = "_tflite_function_sort_index"
+  # The way in which multiple parts of the aggregate argument will be joined
+  # into a fused operand. Valid options are OpHint.AGGREGATE_FIRST,
+  # OpHint.AGGREGATE_LAST, OpHint.AGGREGATE_STACK.
+  FUNCTION_AGGREGATE_ATTR = "_tflite_function_aggregate"
+  # On fused OpHint stub, the order of inputs that the final LSTM call will
+  # have. What this means is that the TensorFlow order might be
+  # "foo", "bar", "stuff" and you might want the TF lite op order to be
+  # "stuff", "foo", "bar", -1 (where -1 is unused). So you would set this
+  # attribute to [2, 0, 1, -1].
+  TFLITE_INPUT_INDICES = "_tflite_input_indices"
+
+  # Types of aggregations
+  #  stack: stacks all ophints with matching tags. i.e. for a static rnn.
+  #   specifically, this is good for an input or output to a static rnn cell.
+  AGGREGATE_STACK = _compat.as_bytes("stack")
+  # first: only takes the first output (one with lowest sort index)
+  # of matching tags. This is good for the input state to an RNN.
+  AGGREGATE_FIRST = _compat.as_bytes("first")
+  # aggregation last takes only the last tag (one with highest sort index).
+  # This is good for an output value on the last stack item of a
+  # static rnn.
+  AGGREGATE_LAST = _compat.as_bytes("last")
+
+  class OpHintArgumentTracker(object):
+    """Conceptually tracks indices of arguments of "OpHint functions".
+
+    The inputs and arguments of these functions both use an instance
+    of the class so they can have independent numbering."""
+
+    def __init__(self, function_name, unique_function_id, node_name_prefix,
+                 attr_name):
+      """Initialize ophint argument.
+
+      Args:
+        function_name: Name of the function that this tracks arguments for.
+        unique_function_id: UUID of function that this tracks arguments for.
+        node_name_prefix: How identities that are created are named.
+        attr_name: Name of attribute to use to store the index for this hint.
+          i.e. FUNCTION_INPUT_INDEX or FUNCTION_OUTPUT_INDEX
+      """
+
+      # The global index is the argument index of the op. This is in contrast
+      # to the sort index which is the sequence number of a particular instance
+      # of a given global index. For example, you may have called add hint
+      # twice with the tag "foo". Then the global index will be 0 for both
+      # and the sort index will be 0 for the first added and 1 for the second.
+      self._function_name = function_name
+      self._unique_function_id = unique_function_id
+      self._next_global_index = 0  # The absolute global index
+      self._used_global_indices = set()
+      self._tag_to_global_index = {}  # The argument index a given tag maps to
+      self._tag_to_next_sort_index = {}  # The current index for each tag
+      self._node_name_prefix = node_name_prefix
+      self._attr_name = attr_name
+
+    def _get_new_global_index(self, index_override):
+      """Return the next unused argument index in order or use an override.
+
+      Args:
+        index_override: An index to use instead of the next available or None
+          to use the next available.
+
+      Returns:
+        A valid global_index to use for the next hint argument.
+
+      Raises:
+        ValueError: If the index_override is already used by another hint.
+      """
+      if index_override is None:
+        global_index = self._next_global_index
+      else:
+        if index_override in self._used_global_indices:
+          raise ValueError("Index %d was already used by another call to add"
+                           % index_override)
+        global_index = index_override
+      # Make next_global_index valid
+      self._used_global_indices.add(global_index)
+      while self._next_global_index in self._used_global_indices:
+        self._next_global_index += 1
+      return global_index
+
+    def add(self, arg, tag=None, name=None, aggregate=None,
+            index_override=None):
+      """Return a wrapped tensor of an input tensor as an argument.
+
+      Args:
+        arg: A TensorFlow tensor that should be considered an argument.
+        tag: String tag to identify arguments that should be packed.
+        name: Name of argument. This is included in the Identity hint op names.
+        aggregate: Strategy to aggregate.
+        Acceptable values are OpHint.AGGREGATE_FIRST, OpHint.AGGREGATE_LAST,
+          and OpHint.AGGREGATE_STACK.
+          Note, aggregate is only valid if tag is specified.
+        index_override: Specify what input/output index should this be in the
+          final stub. i.e. add(arg0, index=1); add(arg1, index=0) will make the
+          final stub be as stub_func(inputs[arg1, arg0], outputs=[]) rather than
+          the default call order based ordering.
+
+      Returns:
+        A tensor representing the wrapped argument.
+
+      Raises:
+        ValueError: When indices are not consistent.
+      """
+
+      # Find the appropriate index
+      if tag is None:
+        if aggregate is not None:
+          raise ValueError("You must specify `tag` if using aggregate.")
+        global_index = self._get_new_global_index(index_override)
+        sort_index = None
+      else:
+        if aggregate is None:
+          raise ValueError("You must specify `aggregate` if using tag.")
+        if tag not in self._tag_to_global_index:
+          self._tag_to_global_index[tag] = (
+              self._get_new_global_index(index_override))
+          self._tag_to_next_sort_index[tag] = 0
+        elif (index_override and
+              index_override != self._tag_to_global_index[tag]):
+          raise ValueError(
+              "Tag %r was called with two indices %r and %r" %
+              (tag, index_override, self._tag_to_global_index[tag]))
+        global_index = self._tag_to_global_index[tag]
+        sort_index = self._tag_to_next_sort_index[tag]
+        self._tag_to_next_sort_index[tag] += 1
+
+      uuid = self._unique_function_id
+      name = "%s-%s-%s-%r-%r-%s" % (self._node_name_prefix, self._function_name,
+                                    uuid, global_index, sort_index, name)
+      identity_op = _array_ops.identity(arg, name=name)
+
+      # pylint: disable=protected-access
+      identity_op.op._set_attr(
+          OpHint.FUNCTION_NAME_ATTR,
+          _attr_value_pb2.AttrValue(
+              s=_compat.as_bytes(self._function_name)))
+      identity_op.op._set_attr(
+          OpHint.FUNCTION_UUID_ATTR,
+          _attr_value_pb2.AttrValue(
+              s=_compat.as_bytes(self._unique_function_id)))
+      identity_op.op._set_attr(
+          self._attr_name, _attr_value_pb2.AttrValue(i=global_index))
+      if sort_index is not None:
+        identity_op.op._set_attr(
+            OpHint.FUNCTION_SORT_INDEX_ATTR,
+            _attr_value_pb2.AttrValue(i=sort_index))
+      if aggregate is not None:
+        identity_op.op._set_attr(
+            OpHint.FUNCTION_AGGREGATE_ATTR,
+            _attr_value_pb2.AttrValue(s=_compat.as_bytes((aggregate))))
+      # pylint: enable=protected-access
+      return identity_op
 
   def __init__(self, function_name, **kwargs):
     """Create a OpHint.
@@ -112,10 +284,14 @@
     """
     self._function_name = function_name
     self._unique_function_id = _uuid.uuid1().hex  # TODO(aselle): Unique enough?
-    self._curr_input_index = 0
-    self._curr_output_index = 0
     self._attrs_to_store_later = kwargs
     self._stored_attrs = False
+    self._inputs = OpHint.OpHintArgumentTracker(
+        self._function_name, self._unique_function_id, "InputHint",
+        OpHint.FUNCTION_INPUT_INDEX_ATTR)
+    self._outputs = OpHint.OpHintArgumentTracker(
+        self._function_name, self._unique_function_id, "OutputHint",
+        OpHint.FUNCTION_OUTPUT_INDEX_ATTR)
 
   def _setattr(self, dest_op, name, value):
     tensor_value = _ops.convert_to_tensor(value)
@@ -124,68 +300,278 @@
         tensor=tensor_value.op.node_def.attr["value"].tensor))
     # pylint: enable=protected-access
 
-  def add_inputs(self, *args):
+  def add_input(self, *args, **kwargs):
+    """Add a wrapped input argument to the hint.
+
+    Args:
+      *args: The input tensor.
+      **kwargs:
+        "name" label
+        "tag" a tag to group multiple arguments that will be aggregated. I.e.
+          a string like 'cool_input'. Basically multiple inputs can be added
+          to the same hint for parallel operations that will eventually be
+          combined. An example would be static_rnn which creates multiple copies
+          of state or inputs.
+        "aggregate" aggregation strategy that is valid only for tag non None.
+          Acceptable values are OpHint.AGGREGATE_FIRST, OpHint.AGGREGATE_LAST,
+          and OpHint.AGGREGATE_STACK.
+        "index_override" The global index to use. This corresponds to the
+          argument order in the final stub that will be generated.
+    Returns:
+      The wrapped input tensor.
+    """
+    return self._inputs.add(*args, **kwargs)
+
+  def add_output(self, *args, **kwargs):
+    """Add a wrapped output argument to the hint.
+
+    Args:
+      *args: The output tensor.
+      **kwargs:
+        "name" label
+        "tag" a tag to group multiple arguments that will be aggregated. I.e.
+          a string like 'cool_input'. Basically multiple inputs can be added
+          to the same hint for parallel operations that will eventually be
+          combined. An example would be static_rnn which creates multiple copies
+          of state or inputs.
+        "aggregate" aggregation strategy that is valid only for tag non None.
+          Acceptable values are OpHint.AGGREGATE_FIRST, OpHint.AGGREGATE_LAST,
+          and OpHint.AGGREGATE_STACK.
+        "index_override" The global index to use. This corresponds to the
+          argument order in the final stub that will be generated.
+    Returns:
+      The wrapped output tensor.
+    """
+    return self._outputs.add(*args, **kwargs)
+
+  def add_inputs(self, *args, **kwargs):
     """Add a sequence of inputs to the function invocation.
 
     Args:
       *args: List of inputs to be converted (should be Tf.Tensor).
+      **kwargs: This allows 'names' which should be a list of names.
     Returns:
       Wrapped inputs (identity standins that have additional metadata). These
       are also are also tf.Tensor's.
     """
+    if "names" in kwargs:
+      return [
+          self._inputs.add(arg, name=name)
+          for arg, name in zip(args, kwargs["names"])
+      ]
+    else:
+      return [self._inputs.add(arg) for arg in args]
 
-    def augmented_identity(arg):
-      identity_op = _array_ops.identity(arg)
-      # pylint: disable=protected-access
-      identity_op.op._set_attr(
-          OpHint.FUNCTION_NAME_ATTR,
-          _attr_value_pb2.AttrValue(s=self._function_name))
-      identity_op.op._set_attr(
-          OpHint.FUNCTION_UUID_ATTR,
-          _attr_value_pb2.AttrValue(s=self._unique_function_id))
-      identity_op.op._set_attr(
-          OpHint.FUNCTION_INPUT_INDEX_ATTR,
-          _attr_value_pb2.AttrValue(i=self._curr_input_index))
-      # pylint: enable=protected-access
-      self._curr_input_index += 1
-      return identity_op
-
-    return [augmented_identity(arg) for arg in args]
-
-  def add_outputs(self, *args):
+  def add_outputs(self, *args, **kwargs):
     """Add a sequence of outputs to the function invocation.
 
     Args:
       *args: List of outputs to be converted (should be tf.Tensor).
+      **kwargs: This allows 'names' which should be a list of names.
     Returns:
       Wrapped outputs (identity standins that have additional metadata). These
       are also tf.Tensor's.
     """
+    if "names" in kwargs:
+      return [
+          self._outputs.add(arg, name=name)
+          for arg, name in zip(args, kwargs["names"])
+      ]
+    else:
+      return [self._outputs.add(arg) for arg in args]
 
-    def augmented_identity(arg):
-      identity_op = _array_ops.identity(arg)
-      # pylint: disable=protected-access
-      identity_op.op._set_attr(
-          OpHint.FUNCTION_NAME_ATTR,
-          _attr_value_pb2.AttrValue(s=self._function_name))
-      identity_op.op._set_attr(
-          OpHint.FUNCTION_UUID_ATTR,
-          _attr_value_pb2.AttrValue(s=self._unique_function_id))
-      identity_op.op._set_attr(
-          OpHint.FUNCTION_OUTPUT_INDEX_ATTR,
-          _attr_value_pb2.AttrValue(i=self._curr_output_index))
-      # pylint: enable=protected-access
-      self._curr_output_index += 1
-      return identity_op
 
-    wrapped_outputs = [augmented_identity(arg) for arg in args]
+class _LiteOperand(object):
+  """Abstract operand for a tflite hint function.
 
-    if not self._stored_attrs:
-      for key, value in self._attrs_to_store_later.iteritems():
-        self._setattr(wrapped_outputs[0], "_tflite_attr_" + key, value)
-      self._stored_attrs = True
+  This is a base class that handles representing arguments to an OpHint.
+  It also is able to serialize operands to the stubbed graph_def.
+  Child classes are responsible for being able to
+  store information about the hint identity operators. They are also responsible
+  for knowing how to serialize to output graphdefs.
 
-    return wrapped_outputs
+  Typically this will be implemented by holding one or more identity nodes
+  that were previously discovered as hints.
+  """
+
+  def aggregate_and_return_name_for_input(self, out_graphdef):
+    """This adds the node(s) to out_graphdef and returns the input node name.
+
+    Args:
+      out_graphdef: A graphdef that is ready to have this input added.
+
+    Returns:
+      The output that the stub should use as an input for this operand.
+
+    Raises:
+      RuntimeError: if the method is not implemented.
+    """
+    del out_graphdef
+    raise RuntimeError("Unimplemented abstract method.")
+
+  def aggregate_and_return_name_for_output(self, fused_op_name, output_index,
+                                           out_graphdef):
+    """Add node(s) to graph representing output operands and returns type.
+
+    Args:
+      fused_op_name: name of the fused op stub name.
+      output_index: Output index that we are currently processing from stub.
+      out_graphdef: The destination graphdef we are currently building up.
+
+    Returns:
+      The datatype of this identity.
+
+    Raises:
+      RuntimeError: if the method is not implemented.
+    """
+    del fused_op_name, output_index, out_graphdef
+    raise RuntimeError("Unimplemented abstract method.")
+
+
+class _LiteSingleOperand(_LiteOperand):
+  """A simple operand that is non-aggregated (i.e. most hints)."""
+
+  def __init__(self, node):
+    _LiteOperand.__init__(self)
+    self.node = node
+    self.name = _tensor_name_base(node.name)
+
+  def flatten(self):
+    return [self.name]
+
+  def aggregate_and_return_name_for_input(self, out_graphdef):
+    return self.name
+
+  def aggregate_and_return_name_for_output(self, fused_op_name, index,
+                                           out_graphdef):
+    output_node = _copy.deepcopy(self.node)
+    del output_node.input[:]
+    output_node.input.append(_tensorflow_output_name(fused_op_name, index))
+    out_graphdef.node.extend([output_node])
+    return self.node.attr["type"].i
+
+  def __str__(self):
+    return str(self.name)
+
+
+class _LiteAggregateOperand(_LiteOperand):
+  """An operand for a tflite hint function that is aggregated from many.
+
+  For example, an LSTM is a grid of operators that are all related. Inputs
+  going into them may need to be fused, so they should all be tracked as
+  related arguments.
+  """
+
+  def __init__(self, aggregation):
+    _LiteOperand.__init__(self)
+    self.aggregation = aggregation
+    self.names = {}
+    self.nodes = {}
+    self.flattened = None
+
+  def add(self, sort, node):
+    self.names[sort] = _tensor_name_base(node.name)
+    self.nodes[sort] = node
+
+  def flatten_nodes(self):
+    """Return a list of all the node protos in aggregation sorted order."""
+    if not self.flattened:
+      self.flattened = [None] * len(self.nodes)
+      for idx, node in _six.iteritems(self.nodes):
+        self.flattened[idx] = node
+      for n in self.flattened:
+        if n is None:
+          raise RuntimeError("Aggregate was missing argument.")
+      if self.aggregation == OpHint.AGGREGATE_FIRST:
+        self.flattened = self.flattened[:1]
+      elif self.aggregation == OpHint.AGGREGATE_LAST:
+        self.flattened = self.flattened[-1:]
+      elif self.aggregation == OpHint.AGGREGATE_STACK:
+        pass
+      else:
+        raise ValueError(
+            "Invalid aggregation type %r specified" % self.aggregation)
+    return self.flattened
+
+  def flatten(self):
+    """Return a list of all node names in aggregation sorted order."""
+    return [_tensor_name_base(x.name) for x in self.flatten_nodes()]
+
+  def aggregate_and_return_name_for_input(self, out_graphdef):
+    """This adds the nodes to out_graphdef and returns an aggregated output.
+
+    In particular, if you have 4 inputs to a hint stub, this will be the
+    node that you can use as an output. I.e. you have 4 timesteps from a
+    static rnn, then a fused UnidirectionalLSTM will expect 1 input with
+    all 4 time steps. So here we make a pack and return the output name of
+    that pack.
+
+    Args:
+      out_graphdef: A graphdef that is ready to have this input added.
+
+    Returns:
+      The name of a pack that aggregates this node.
+    """
+    flattened = self.flatten_nodes()
+    if len(flattened) == 1:
+      return _tensor_name_base(flattened[0].name)
+    else:
+      new_node = _node_def_pb2.NodeDef()
+      new_node.op = "Pack"
+      new_node.name = "OpHintStack-%s" % flattened[0].name
+      new_node.attr["N"].i = len(flattened)
+      new_node.attr["T"].type = flattened[0].attr["T"].type
+      for discrete in flattened:
+        new_node.input.append(_tensor_name_base(discrete.name))
+      out_graphdef.node.extend([new_node])
+      return new_node.name
+
+  def aggregate_and_return_name_for_output(self, fused_op_name, output_index,
+                                           out_graphdef):
+    """This adds to `out_graphdef` all the unaggregated outputs.
+
+    I.e. we are outputting from a fused stub, but we need to make it compatible
+    with the unfused original graph so we insert an unpack. Ideally in a later
+    stage the unpack -> pack sequences will be removed.
+
+    Args:
+      fused_op_name: The name of the stub we are in the process of fusing.
+      output_index: The output output_index this object represents.
+      out_graphdef: The graphdef we are in the process of building.
+
+    Returns:
+      The type of the aggregated output (so we can finish building the stub
+      op).
+    """
+    flattened = self.flatten_nodes()
+    if len(flattened) == 1:
+      temp_op = _LiteSingleOperand(flattened[0])
+      return temp_op.aggregate_and_return_name_for_output(
+          fused_op_name, output_index, out_graphdef)
+    else:
+      stack_node = _node_def_pb2.NodeDef()
+      stack_node.op = "Unpack"
+      stack_node.name = "OpHintUnstack-%s" % flattened[0].name
+      stack_node.attr["num"].i = len(flattened)
+      output_type = flattened[0].attr["T"].type
+      stack_node.attr["T"].type = output_type
+      stack_node.input.append(_tensorflow_output_name(
+          fused_op_name, output_index))
+      out_graphdef.node.extend([stack_node])
+
+      for idx, discrete in enumerate(flattened):
+        output_node = _copy.deepcopy(discrete)
+        del output_node.input[:]
+        output_node.input.append(_tensorflow_output_name(stack_node.name, idx))
+        out_graphdef.node.extend([output_node])
+
+      return output_type
+
+  def __str__(self):
+    s = "\t\t\tAGGREGATE %s\n" % self.aggregation
+    for sort, val in _six.iteritems(self.names):
+      s += "\t\t\t%d: %s\n" % (sort, val)
+    return s
 
 
 class _LiteFuncCall(object):
@@ -212,46 +598,87 @@
     self.uuid = None
     self.params = {}
 
+  def flattened_inputs_and_outputs(self):
+    """Return a list of inputs and outputs in a flattened format.
+
+    Returns:
+      Tuple of (inputs, outputs), where inputs and outputs are lists of names.
+    """
+    def _flatten(input_or_output_dict):
+      flattened_items = []
+      for item in input_or_output_dict.values():
+        flattened_items.extend(item.flatten())
+      return flattened_items
+
+    return _flatten(self.inputs), _flatten(self.outputs)
+
   def __str__(self):
-    return "tflite function %s call %s\n\tinputs: %r\n\toutputs: %r" % (
-        self.function_name, self.uuid, self.inputs, self.outputs)
+    def format_args(items):
+      s = ""
+      for idx, item in _six.iteritems(items):
+        s += ("\t\t%d:\n" % idx) + str(item)
+      return s
+
+    inputs_str = "\tInputs\n" + format_args(self.inputs)
+    outputs_str = "\tOutputs\n" + format_args(self.outputs)
+
+    return ("tflite function %s call %s\n\tinputs:\n\t\t%s\n\toutputs:\n\t\t%s"
+            % (self.function_name, self.uuid, inputs_str, outputs_str))
 
 
-def _find_all_hints_in_graph_def(session):
+def _find_all_hints_in_graph_def(graphdef):
   """Look at the current default graph and return a list of LiteFuncCall objs.
 
   Args:
-    session: A TensorFlow session that contains the graph to convert.
+    graphdef: A TensorFlow graph_def to look for LiteFuncCalls.
   Returns:
     a list of `LifeFuncCall` objects in the form
 
   """
   func_calls = _collections.defaultdict(_LiteFuncCall)
-  seen_ops = set()
 
-  for op in session.graph.get_operations():
-    for operand in _itertools.chain(op.inputs, op.outputs):
-      if operand in seen_ops:
-        continue
-      seen_ops.add(operand)
-      attr = operand.op.node_def.attr
-      uuid = attr[OpHint.FUNCTION_UUID_ATTR].s
-      if OpHint.FUNCTION_UUID_ATTR not in attr:
-        continue
-      call_def = func_calls[uuid]
-      call_def.uuid = uuid
-      if OpHint.FUNCTION_UUID_ATTR in attr:
-        call_def.function_name = attr[OpHint.FUNCTION_NAME_ATTR].s
-        if OpHint.FUNCTION_INPUT_INDEX_ATTR in attr:
-          call_def.inputs[attr[OpHint.FUNCTION_INPUT_INDEX_ATTR].i] = operand
-        if OpHint.FUNCTION_OUTPUT_INDEX_ATTR in attr:
-          call_def.outputs[attr[OpHint.FUNCTION_OUTPUT_INDEX_ATTR].i] = operand
+  for node in graphdef.node:
+    attr = node.attr
+    # This is an op hint if it has a FUNCTION_UUID_ATTR, otherwise skip
+    uuid = attr[OpHint.FUNCTION_UUID_ATTR].s
+    if (OpHint.FUNCTION_UUID_ATTR not in attr
+        or not attr[OpHint.FUNCTION_UUID_ATTR].s):
+      continue
 
-      for a in attr:
-        if a.startswith("_tflite_attr_"):
-          # TODO(aselle): Remember the attribute tensors so we can put them
-          # in collapse.
-          call_def.params[a.replace("_tflite_attr_,", "")] = attr[a].tensor
+    # Start building function
+    call_def = func_calls[uuid]
+    call_def.uuid = uuid
+    call_def.function_name = attr[OpHint.FUNCTION_NAME_ATTR].s
+    # Get sorting and aggregation information
+
+    sort = (attr[OpHint.FUNCTION_SORT_INDEX_ATTR].i
+            if OpHint.FUNCTION_SORT_INDEX_ATTR in attr else None)
+    if sort == -1: sort = None
+    aggregation = None
+    if OpHint.FUNCTION_AGGREGATE_ATTR in attr:
+      aggregation = attr[OpHint.FUNCTION_AGGREGATE_ATTR].s
+
+    # Add the input or output
+    def put_operand(stuff, index, sort, operand, aggregation):
+      """Add a given index into the function structure."""
+      if sort is None:
+        stuff[index] = _LiteSingleOperand(operand)
+      else:
+        if index not in stuff:
+          stuff[index] = _LiteAggregateOperand(aggregation)
+        stuff[index].add(sort, operand)
+
+    if OpHint.FUNCTION_INPUT_INDEX_ATTR in attr:
+      put_operand(call_def.inputs, attr[OpHint.FUNCTION_INPUT_INDEX_ATTR].i,
+                  sort, node, aggregation)
+    if OpHint.FUNCTION_OUTPUT_INDEX_ATTR in attr:
+      put_operand(call_def.outputs, attr[OpHint.FUNCTION_OUTPUT_INDEX_ATTR].i,
+                  sort, node, aggregation)
+
+    # Remember attributes
+    for a in attr:
+      if a.startswith("_tflite_attr_"):
+        call_def.params[a.replace("_tflite_attr_,", "")] = attr[a].tensor
 
   return func_calls
 
@@ -267,42 +694,305 @@
   Returns:
     A name without any device assignment.
   """
-  return full_tensor_name.name.split(":")[0]
+  if full_tensor_name.startswith("^"):
+    return full_tensor_name[1:]
+  return full_tensor_name.split(":")[0]
 
 
-def convert_op_hints_to_stubs(session):
+def _tensorflow_output_name(tensor_name, output_index):
+  return tensor_name if output_index == 0 else "%s:%d" % (tensor_name,
+                                                          output_index)
+
+
+# TODO(aselle): This should be converted to grappler in the future.
+def _check_subgraph_closed(n, reachable_by_input, input_nodes_set,
+                           name_to_input_name):
+  """Checks to make sure node only connects to predecessor graph through inputs.
+
+  Args:
+    n: Node to check
+    reachable_by_input: Nodes that are reachable by all inputs of subgraph
+    input_nodes_set: The set of nodes that are "inputs".
+    name_to_input_name: Maps from name to the list of inputs.
+
+  Raises:
+    TypeError: If the given node uses items past inputs directly.
+  """
+  next_to_visit = [n]
+  visited = set()
+  while next_to_visit:
+    current_node = next_to_visit.pop()
+    visited.add(current_node)
+    if (current_node in reachable_by_input
+        and current_node not in input_nodes_set):
+      raise TypeError(
+          "Node %s uses input %s not in input_nodes." % (n, current_node))
+    if current_node not in input_nodes_set:
+      next_to_visit += [
+          input_node for input_node in name_to_input_name[current_node]
+          if input_node not in visited
+      ]
+
+
+# TODO(aselle): This should be converted to grappler in the future.
+def _convert_single_op_hint_to_stub(call, graph_def):
+  """Given a graph_def, converts `call` into a stub and returns a new graph_def.
+
+  Args:
+    call: A single function call to be converted.
+    graph_def: A graph_def to use as input (one that contains `call`).
+  Returns:
+    A new transformed graph-def that has call as a stub (single op).
+
+  Note: after this process, the graph_def can no longer be loaded into
+      the tensorflow runtime, so all future manipulations are done at the
+      graph_def level.
+  """
+  name_to_input_name, name_to_node, name_to_seq_num = _extract_graph_summary(
+      graph_def)
+  input_names, output_names = call.flattened_inputs_and_outputs()
+
+  reachable_by_input = _bfs_for_reachable_nodes(input_names, name_to_input_name)
+  reachable_by_output = _bfs_for_reachable_nodes(output_names,
+                                                 name_to_input_name)
+  input_nodes_set = set(input_names)
+  output_nodes_set = set(output_names)
+  nodes_after_fuse = []
+  nodes_deleted_by_fuse = set()
+  # Classify each node. We want to keep everything reachable by input.
+  # Nodes reachable by neither input nor output come after the fused op,
+  # so they must be preserved as well.
+  for node in graph_def.node:
+    n = _tensor_name_base(node.name)
+    if n in reachable_by_output:
+      if n not in reachable_by_input and n not in output_nodes_set:
+        # n is an internal node. Check to make sure it is really internal.
+        # TODO(aselle): this could be done more efficiently by flooding
+        # the graph first.
+        _check_subgraph_closed(n, reachable_by_input, input_nodes_set,
+                               name_to_input_name)
+        nodes_deleted_by_fuse.add(n)
+    elif n not in reachable_by_input:
+      # n is a node that comes after all the fusings, so keep it.
+      nodes_after_fuse.append(n)
+    else:
+      # n is a node that is randomly in the graph but not connected to
+      # the chain of dependencies.
+      pass
+
+  # Make a new graphdef with all the pre-input and input nodes
+  out = _graph_pb2.GraphDef()
+  reachable_by_input_sorted = sorted(
+      list(reachable_by_input), key=lambda n: name_to_seq_num[n])
+  for node in reachable_by_input_sorted:
+    out.node.extend([_copy.deepcopy(name_to_node[node])])
+
+  # Create any stacks to aggregate arguments into a single input
+  # i.e. for static_rnn's.
+  # TODO(aselle): Check that the inputs are complete i.e. 0 to n-1
+  sorted_input_indices = list(call.inputs.keys())
+  sorted_input_indices.sort()
+  sorted_output_indices = list(call.outputs.keys())
+  sorted_output_indices.sort()
+  new_node = _node_def_pb2.NodeDef()
+  # Delegate to each operand to produce the proper new input for this stub node.
+  # In particular, an aggregate input will now be a Pack of some previously
+  # non-fused things.
+  for input_index in sorted_input_indices:
+    inputs = call.inputs[input_index]
+    new_node.input.append(inputs.aggregate_and_return_name_for_input(out))
+  new_node.attr[OpHint.TFLITE_INPUT_INDICES].list.i.extend(sorted_input_indices)
+
+  # Create the function
+  new_node.op = call.function_name
+  new_node.name = call.uuid
+  out.node.extend([new_node])
+
+  # Now call each output argument to give them a chance to make the proper
+  # output type and add it to our new_node.
+  output_dtypes = []
+  for output_index in sorted_output_indices:
+    output = call.outputs[output_index]
+    output_dtype = (
+        output.aggregate_and_return_name_for_output(new_node.name, output_index,
+                                                    out))
+    output_dtypes.append(output_dtype)
+  new_node.attr["_output_types"].list.type[:] = output_dtypes
+  # TODO(aselle): what is right here?
+  new_node.attr["_output_quantized"].b = False
+
+  # Add post output nodes that do not depend on the outputs
+  for n in nodes_after_fuse:
+    should_keep = True
+    for input_name in name_to_input_name[n]:
+      if input_name in nodes_deleted_by_fuse:
+        should_keep = False
+    if should_keep:
+      out.node.extend([_copy.deepcopy(name_to_node[n])])
+
+  # Misc. graph_def data that needs copying.
+  out.library.CopyFrom(graph_def.library)
+  out.versions.CopyFrom(graph_def.versions)
+
+  return out
+
+
+# TODO(aselle): This should be converted to grappler in the future.
+def _remove_one_redundant_stack_unstack(in_graph_def):
+  """Removes a stack->unstack pattern from in_graph_def in a returned graph.
+
+  Args:
+    in_graph_def: Graph def to use as input.
+  Returns:
+    Simplified tuple (graph_def, changed_something) where changed_something
+    is true if anything was done.
+  """
+  name_to_input_name, name_to_node, name_to_seq_num = _extract_graph_summary(
+      in_graph_def)
+  del name_to_seq_num
+
+  # TODO(aselle): Make this not hardcoded.
+  do_generic_pack_unpack = True
+
+  out = _graph_pb2.GraphDef()
+  out.library.CopyFrom(in_graph_def.library)
+  out.versions.CopyFrom(in_graph_def.versions)
+  for n in in_graph_def.node:
+    node_name = _tensor_name_base(n.name)
+    if not node_name.startswith("OpHintStack") and not n.op.startswith("Pack"):
+      continue
+    next_to_visit = [node_name]
+    visited = set()
+
+    unpack_nodes = set()
+    pack_node = node_name
+
+    # Find a pattern of unstack connected to a stack (with identities
+    # in between).
+    matches_pattern = True
+    is_hint_created_stack = False
+    while next_to_visit:
+      current_node_name = next_to_visit[0]
+      visited.add(current_node_name)
+      del next_to_visit[0]
+      node = name_to_node[current_node_name]
+      is_op_hint_stack = node.name.startswith("OpHintStack")
+      is_op_hint_unstack = node.name.startswith("OpHintUnstack")
+      if (node.op == "Identity" or is_op_hint_stack
+          or (do_generic_pack_unpack and node.op == "Pack")):
+        is_hint_created_stack |= is_op_hint_stack
+        next_to_visit += [
+            input_node for input_node in name_to_input_name[current_node_name]
+            if input_node not in visited
+        ]
+      elif (is_op_hint_unstack
+            or (do_generic_pack_unpack and node.op == "Unpack")):
+        unpack_nodes.add(node.name)
+        is_hint_created_stack &= is_op_hint_unstack
+      else:
+        matches_pattern = False
+        break
+      visited.add(node.name)
+
+    if matches_pattern and len(unpack_nodes) == 1:
+      pack_node = node_name
+
+      # Check to see if anyone depends on the intermediate identity or the
+      # Unstacked form
+      no_external_dependency = True
+      for other_n in in_graph_def.node:
+        if other_n.name in visited: continue
+        for input_tensor in name_to_input_name[other_n.name]:
+          input_op = _tensor_name_base(input_tensor)
+          if input_op in visited and input_op != pack_node:
+            no_external_dependency = False
+      # Proceed with the substitution if the stack/unstack pair was created
+      # through hints, or that it was not, but nobody is consuming things
+      # between the stack and unstack.
+      if is_hint_created_stack or no_external_dependency:
+        end = unpack_nodes.pop()
+        end_input = name_to_node[end].input[0]
+        # Rewire all nodes that depend on the final stack to use end_input.
+        for other_n in in_graph_def.node:
+          node_name = _tensor_name_base(other_n.name)
+          if node_name not in visited:
+            new_node = _copy.deepcopy(other_n)
+            new_node.input[:] = [
+                (end_input if stripped == pack_node else
+                 non_stripped) for stripped, non_stripped in zip(
+                     name_to_input_name[node_name], new_node.input[:])
+            ]
+            out.node.extend([new_node])
+        return out, True
+  return in_graph_def, False
+
+
+def _remove_redundant_stack_unstack(graph_def):
+  curr = graph_def
+  del graph_def
+  changed_stuff = True
+  while changed_stuff:
+    curr, changed_stuff = _remove_one_redundant_stack_unstack(curr)
+  return curr
+
+
+def _convert_op_hints_to_stubs_helper(
+    graph_def, write_callback=lambda sess, graph_def: None):
+  """Converts a graph_def to a new graph_def where all op hints are stubbed.
+
+  Args:
+    graph_def: A graph def that we should convert.
+    write_callback: A function pointer that can be used to write intermediate
+      steps of graph transformation (optional).
+  Returns:
+    A new stubbed graph_def.
+  """
+
+  hints = _find_all_hints_in_graph_def(graph_def)
+  curr_graph_def = graph_def
+  del graph_def  # prevent using graph_def again (common source of error)
+  for hint in _six.itervalues(hints):
+    curr_graph_def = _convert_single_op_hint_to_stub(
+        hint, curr_graph_def)
+    write_callback(curr_graph_def, "initial")
+  # The stubbing process can create stacks/unstacks in the case of LSTMs
+  # remove them.
+  curr_graph_def = _remove_redundant_stack_unstack(curr_graph_def)
+  return curr_graph_def
+
+
+def convert_op_hints_to_stubs(session=None,
+                              graph_def=None,
+                              write_callback=lambda graph_def, comments: None):
   """Converts a graphdef with LiteOp hints into stub operations.
 
   This is used to prepare for toco conversion of complex intrinsic usages.
+  Note: only one of session or graph_def should be used, not both.
 
   Args:
     session: A TensorFlow session that contains the graph to convert.
+    graph_def: A graph def that we should convert.
+    write_callback: A function pointer that can be used to write intermediate
+      steps of graph transformation (optional).
   Returns:
     A new graphdef with all ops contained in OpHints being replaced by
     a single op call with the right parameters.
+  Raises:
+    ValueError: If both session and graph_def are provided.
   """
-  hints = _find_all_hints_in_graph_def(session)
-  current_graph_def = session.graph_def
-  for call in hints.values():
-    input_names = [None] * len(call.inputs)
-    output_names = [None] * len(call.outputs)
-    output_dtypes = [None] * len(call.outputs)
-    output_quantized = False
-    for input_index, tensor in call.inputs.items():
-      input_names[input_index] = _tensor_name_base(tensor)
-    for output_index, tensor in call.outputs.items():
-      output_names[output_index] = _tensor_name_base(tensor)
-      output_dtypes[output_index] = tensor.dtype.as_datatype_enum
-    # TODO(aselle): Support quantized flag properly
-    current_graph_def = _framework.fuse_op(
-        current_graph_def, input_names, output_names, output_dtypes,
-        output_quantized, call.uuid, call.function_name)
-    for node in current_graph_def.node:
-      if node.name == call.uuid:
-        for param, tensor in call.params.items():
-          node.attr[param].tensor.CopyFrom(tensor)
-  return current_graph_def
+
+  if session is not None and graph_def is not None:
+    raise ValueError("Provide only one of session and graph_def.")
+
+  if session is not None:
+    return _convert_op_hints_to_stubs_helper(session.graph_def, write_callback)
+  elif graph_def is not None:
+    return _convert_op_hints_to_stubs_helper(graph_def, write_callback)
+  else:
+    raise ValueError("Must specify session or graph_def as input.")
 
 
-_allowed_symbols = ["OpHint", "convert_op_hints_to_stubs"]
+_allowed_symbols = [
+    "OpHint", "convert_op_hints_to_stubs", "convert_op_hints_to_stubs_new"
+]
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/lite/python/tflite_convert.py b/tensorflow/contrib/lite/python/tflite_convert.py
index d17482e..a76cc39 100644
--- a/tensorflow/contrib/lite/python/tflite_convert.py
+++ b/tensorflow/contrib/lite/python/tflite_convert.py
@@ -203,8 +203,9 @@
     raise ValueError("--default_ranges_min and --default_ranges_max must be "
                      "used together")
 
-  if flags.dump_graphviz_video and not flags.dump_graphviz:
-    raise ValueError("--dump_graphviz_video must be used with --dump_graphviz")
+  if flags.dump_graphviz_video and not flags.dump_graphviz_dir:
+    raise ValueError("--dump_graphviz_video must be used with "
+                     "--dump_graphviz_dir")
 
 
 def run_main(_):
diff --git a/tensorflow/contrib/lite/rpi_makefile.inc b/tensorflow/contrib/lite/rpi_makefile.inc
deleted file mode 100644
index 832ef58..0000000
--- a/tensorflow/contrib/lite/rpi_makefile.inc
+++ /dev/null
@@ -1,33 +0,0 @@
-# Settings for Raspberry Pi.
-ifeq ($(TARGET), RPI)
-	ifeq ($(TARGET_ARCH), armv7)
-		CXXFLAGS += \
-			-march=armv7-a \
-			-mfpu=neon-vfpv4 \
-			-funsafe-math-optimizations \
-			-ftree-vectorize
-
-		CCFLAGS += \
-			-march=armv7-a \
-			-mfpu=neon-vfpv4 \
-			-funsafe-math-optimizations \
-			-ftree-vectorize
-
-		LDFLAGS := \
-			-Wl,--no-export-dynamic \
-			-Wl,--exclude-libs,ALL \
-			-Wl,--gc-sections \
-			-Wl,--as-needed
-	endif
-
-	LIBS := \
-	-lstdc++ \
-	-lpthread \
-	-lm \
-	-ldl
-
-	OBJDIR := $(OBJDIR)rpi_$(TARGET_ARCH)/
-	LIBDIR := $(LIBDIR)rpi_$(TARGET_ARCH)/
-	BINDIR := $(BINDIR)rpi_$(TARGET_ARCH)/
-	DEPDIR := $(DEPDIR)rpi_$(TARGET_ARCH)/
-endif
diff --git a/tensorflow/contrib/lite/schema/BUILD b/tensorflow/contrib/lite/schema/BUILD
index b616e44..28a7e50 100644
--- a/tensorflow/contrib/lite/schema/BUILD
+++ b/tensorflow/contrib/lite/schema/BUILD
@@ -48,7 +48,7 @@
     "schema_v3.fbs",
 ])
 
-load("//third_party/flatbuffers:build_defs.bzl", "flatbuffer_cc_library")
+load("@flatbuffers//:build_defs.bzl", "flatbuffer_cc_library")
 
 # Generic schema for inference on device.
 flatbuffer_cc_library(
diff --git a/tensorflow/contrib/lite/schema/flatbuffer_compatibility_test.cc b/tensorflow/contrib/lite/schema/flatbuffer_compatibility_test.cc
index cd46a06..4af6925 100644
--- a/tensorflow/contrib/lite/schema/flatbuffer_compatibility_test.cc
+++ b/tensorflow/contrib/lite/schema/flatbuffer_compatibility_test.cc
@@ -15,7 +15,7 @@
 
 #include <fstream>
 #include <gtest/gtest.h>
-#include "flatbuffers/flatc.h"
+#include "include/flatbuffers/flatc.h"  // flatbuffers
 #include "tensorflow/core/platform/platform.h"
 
 #ifdef PLATFORM_GOOGLE
diff --git a/tensorflow/contrib/lite/schema/upgrade_schema.py b/tensorflow/contrib/lite/schema/upgrade_schema.py
index e0b36d3..a2ddf62 100644
--- a/tensorflow/contrib/lite/schema/upgrade_schema.py
+++ b/tensorflow/contrib/lite/schema/upgrade_schema.py
@@ -99,9 +99,9 @@
     # dispatch function table.
     self._schemas.sort()
     self._new_version, self._new_schema = self._schemas[-1][:2]
-    self._upgrade_dispatch = dict(
-        (version, dispatch)
-        for version, unused1, unused2, dispatch in self._schemas)
+    self._upgrade_dispatch = {
+        version: dispatch
+        for version, unused1, unused2, dispatch in self._schemas}
 
   def _Read(self, input_file, schema, raw_binary=False):
     """Read a tflite model assuming the given flatbuffer schema.
diff --git a/tensorflow/contrib/lite/string.h b/tensorflow/contrib/lite/string.h
index 7f8f4e8..af3fadf 100644
--- a/tensorflow/contrib/lite/string.h
+++ b/tensorflow/contrib/lite/string.h
@@ -13,8 +13,8 @@
 limitations under the License.
 ==============================================================================*/
 // Abstract string. We don't want even absl at this level.
-#ifndef _THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_STRING_H_
-#define _THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_STRING_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_STRING_H_
+#define TENSORFLOW_CONTRIB_LITE_STRING_H_
 
 #include <string>
 
@@ -26,4 +26,4 @@
 
 }  // namespace tflite
 
-#endif  // _THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_STRING_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_STRING_H_
diff --git a/tensorflow/contrib/lite/testing/BUILD b/tensorflow/contrib/lite/testing/BUILD
index a788d41..89912fd 100644
--- a/tensorflow/contrib/lite/testing/BUILD
+++ b/tensorflow/contrib/lite/testing/BUILD
@@ -162,11 +162,12 @@
         ":test_runner",
         "//tensorflow/contrib/lite:builtin_op_data",
         "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/delegates/eager:delegate",
         "//tensorflow/contrib/lite/kernels:builtin_ops",
     ],
 )
 
-cc_test(
+tf_cc_test(
     name = "tflite_driver_test",
     size = "small",
     srcs = ["tflite_driver_test.cc"],
diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py
index 3d1f8c0..597ee8f 100644
--- a/tensorflow/contrib/lite/testing/generate_examples.py
+++ b/tensorflow/contrib/lite/testing/generate_examples.py
@@ -90,8 +90,6 @@
 # matching the expression will be considered due to the corresponding bug.
 KNOWN_BUGS = {
     # TOCO doesn't support scalars as input.
-    r"relu.*input_shape=\[\]": "67587484",
-    r"sigmoid.*input_shape=\[\]": "67645668",
     # Concat doesn't work with a single input tensor
     r"concat.*num_tensors=1": "67378344",
     # Transposition in MatMul is not fully supported.
@@ -104,8 +102,6 @@
     r"div.*int32": "72051395",
     # No support for SplitV
     r"split.*num_or_size_splits=\[2,2\]": "73377559",
-    # Scalar constants don't work.
-    r"constant.*shape=\[\]": "109811500",
 }
 
 
@@ -230,6 +226,7 @@
     tf.float16: (np.float16, "FLOAT"),
     tf.int32: (np.int32, "INT32"),
     tf.uint8: (np.uint8, "QUANTIZED_UINT8"),
+    tf.int16: (np.int16, "QUANTIZED_INT16"),
     tf.int64: (np.int64, "INT64"),
     tf.bool: (np.bool, "BOOL"),
 }
@@ -243,7 +240,7 @@
 
   if dtype in (tf.float32, tf.float16):
     value = (max_value-min_value)*np.random.random_sample(shape)+min_value
-  elif dtype in (tf.int32, tf.uint8, tf.int64):
+  elif dtype in (tf.int32, tf.uint8, tf.int64, tf.int16):
     value = np.random.randint(min_value, max_value+1, shape)
   elif dtype == tf.bool:
     value = np.random.choice([True, False], size=shape)
@@ -259,7 +256,7 @@
 
   if dtype in (tf.float32, tf.float16):
     value = (max_value - min_value) * np.random.random() + min_value
-  elif dtype in (tf.int32, tf.uint8, tf.int64):
+  elif dtype in (tf.int32, tf.uint8, tf.int64, tf.int16):
     value = np.random.randint(min_value, max_value + 1)
   return np.array(value, dtype=dtype)
 
@@ -824,11 +821,13 @@
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
-def make_reduce_tests(reduce_op):
+def make_reduce_tests(reduce_op, min_value=-10, max_value=10):
   """Make a set of tests to do reduce operation.
 
   Args:
     reduce_op: TensorFlow reduce operation to test, i.e. `tf.reduce_mean`.
+    min_value: min value for created tensor data.
+    max_value: max value for created tensor data.
 
   Returns:
     a function representing the true generator with `reduce_op_in` curried.
@@ -891,10 +890,12 @@
 
     def build_inputs(parameters, sess, inputs, outputs):
       values = [
-          create_tensor_data(parameters["input_dtype"],
-                             parameters["input_shape"],
-                             min_value=-10,
-                             max_value=10)]
+          create_tensor_data(
+              parameters["input_dtype"],
+              parameters["input_shape"],
+              min_value=min_value,
+              max_value=max_value)
+      ]
       if not parameters["const_axis"]:
         values.append(np.array(parameters["axis"]))
       return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
@@ -916,7 +917,8 @@
 
 def make_reduce_prod_tests(zip_path):
   """Make a set of tests to do prod."""
-  return make_reduce_tests(tf.reduce_prod)(zip_path)
+  # set min max value to be -2, 2 to avoid overflow.
+  return make_reduce_tests(tf.reduce_prod, -2, 2)(zip_path)
 
 
 def make_reduce_max_tests(zip_path):
@@ -1253,6 +1255,140 @@
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
+# Note: This is a regression test for a bug (b/112436267) that Toco incorrectly
+# fuses weights when multiple Conv2D/FULLY_CONNECTED ops share the same constant
+# weight tensor.
+def make_conv_with_shared_weights_tests(zip_path):
+  """Make a test where 2 Conv ops shared the same constant weight tensor."""
+
+  test_parameters = [{
+      "input_shape": [[1, 10, 10, 3]],
+      "filter_shape": [[3, 3]],
+      "strides": [[1, 1, 1, 1]],
+      "dilations": [[1, 1, 1, 1]],
+      "padding": ["SAME"],
+      "data_format": ["NHWC"],
+      "channel_multiplier": [1],
+  }]
+
+  def get_tensor_shapes(parameters):
+    input_shape = parameters["input_shape"]
+    filter_size = parameters["filter_shape"]
+    filter_shape = filter_size + [
+        input_shape[3], parameters["channel_multiplier"]
+    ]
+    return [input_shape, filter_shape]
+
+  def build_graph(parameters):
+    """Build a conv graph given `parameters`."""
+    input_shape, filter_shape = get_tensor_shapes(parameters)
+    input_tensor = tf.placeholder(
+        dtype=tf.float32, name="input", shape=input_shape)
+
+    # Construct a constant weights tensor which will be used by both Conv2D.
+    filter_tensor = tf.constant(
+        create_tensor_data(np.float32, filter_shape), dtype=tf.float32)
+    input_tensors = [input_tensor]
+
+    # Construct 2 Conv2D operations which use exactly the same input and
+    # weights.
+    result1 = tf.nn.conv2d(
+        input_tensor,
+        filter_tensor,
+        strides=parameters["strides"],
+        dilations=parameters["dilations"],
+        padding=parameters["padding"],
+        data_format=parameters["data_format"])
+    result2 = tf.nn.conv2d(
+        input_tensor,
+        filter_tensor,
+        strides=parameters["strides"],
+        dilations=parameters["dilations"],
+        padding=parameters["padding"],
+        data_format=parameters["data_format"])
+    # Add MUL ops after Conv2D ops. These MUL ops should be fused into the
+    # weights of Conv2D.
+    result1 = result1 * 2
+    result2 = result2 * 3
+    # Add the 2 results up.
+    out = result1 + result2
+    return input_tensors, [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    # Build list of input values either containing 1 tensor (input) or 2 tensors
+    # (input, filter) based on whether filter is constant or variable input.
+    input_shape, unused_filter_shape = get_tensor_shapes(parameters)
+    values = [create_tensor_data(np.float32, input_shape)]
+    return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
+# Note: This is a regression test for a bug (b/112303004) that Toco incorrectly
+# transforms Conv into DepthwiseConv when two Conv ops share the same constant
+# weight tensor.
+def make_conv_to_depthwiseconv_with_shared_weights_tests(zip_path):
+  """Make a test where 2 Conv ops shared the same constant weight tensor."""
+
+  test_parameters = [{
+      "input_shape": [[1, 10, 10, 1]],
+      "filter_shape": [[3, 3]],
+      "strides": [[1, 1, 1, 1]],
+      "dilations": [[1, 1, 1, 1]],
+      "padding": ["SAME"],
+      "data_format": ["NHWC"],
+      "channel_multiplier": [3],
+  }]
+
+  def get_tensor_shapes(parameters):
+    input_shape = parameters["input_shape"]
+    filter_size = parameters["filter_shape"]
+    filter_shape = filter_size + [
+        input_shape[3], parameters["channel_multiplier"]
+    ]
+    return [input_shape, filter_shape]
+
+  def build_graph(parameters):
+    """Build a conv graph given `parameters`."""
+    input_shape, filter_shape = get_tensor_shapes(parameters)
+    input_tensor = tf.placeholder(
+        dtype=tf.float32, name="input", shape=input_shape)
+
+    # Construct a constant weights tensor which will be used by both Conv2D.
+    filter_tensor = tf.constant(
+        create_tensor_data(np.float32, filter_shape), dtype=tf.float32)
+    input_tensors = [input_tensor]
+
+    # Construct 2 Conv2D operations which use exactly the same input and
+    # weights.
+    result1 = tf.nn.conv2d(
+        input_tensor,
+        filter_tensor,
+        strides=parameters["strides"],
+        dilations=parameters["dilations"],
+        padding=parameters["padding"],
+        data_format=parameters["data_format"])
+    result2 = tf.nn.conv2d(
+        input_tensor,
+        filter_tensor,
+        strides=parameters["strides"],
+        dilations=parameters["dilations"],
+        padding=parameters["padding"],
+        data_format=parameters["data_format"])
+    # Add the 2 results up.
+    out = result1 + result2
+    return input_tensors, [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    # Build list of input values either containing 1 tensor (input) or 2 tensors
+    # (input, filter) based on whether filter is constant or variable input.
+    input_shape, unused_filter_shape = get_tensor_shapes(parameters)
+    values = [create_tensor_data(np.float32, input_shape)]
+    return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
 def make_depthwiseconv_tests(zip_path):
   """Make a set of tests to do convolution."""
 
@@ -1355,6 +1491,7 @@
       "base_shape": [[1, 3, 4, 3], [3, 4]],
       "num_tensors": [1, 2, 3, 4, 5, 6],
       "axis": [0, 1, 2, 3, -3, -2, -1],
+      "type": [tf.float32, tf.uint8, tf.int32, tf.int64],
   }]
 
   def get_shape(parameters, delta):
@@ -1370,7 +1507,8 @@
   def build_graph(parameters):
     all_tensors = []
     for n in range(0, parameters["num_tensors"]):
-      input_tensor = tf.placeholder(dtype=tf.float32, name=("input%d" % n),
+      input_tensor = tf.placeholder(dtype=parameters["type"],
+                                    name=("input%d" % n),
                                     shape=get_shape(parameters, n))
       all_tensors.append(input_tensor)
     out = tf.concat(all_tensors, parameters["axis"])
@@ -1379,8 +1517,8 @@
   def build_inputs(parameters, sess, inputs, outputs):
     all_values = []
     for n in range(0, parameters["num_tensors"]):
-      input_values = create_tensor_data(np.float32,
-                                        get_shape(parameters, n))
+      input_values = create_tensor_data(
+          parameters["type"], get_shape(parameters, n))
       all_values.append(input_values)
     return all_values, sess.run(
         outputs, feed_dict=dict(zip(inputs, all_values)))
@@ -1669,7 +1807,7 @@
   }]
 
   def build_graph(parameters):
-    """Build the topk op testing graph."""
+    """Build the shape op testing graph."""
     # Note that we intentionally leave out the shape from the input placeholder
     # to prevent the Shape operation from being optimized out during conversion.
     input_value = tf.placeholder(dtype=parameters["input_dtype"], name="input")
@@ -2317,6 +2455,7 @@
   test_parameters = [{
       "input_dtype": [tf.float32, tf.int32],
       "input_shape": [[10], [5, 20]],
+      "input_k": [None, 1, 3],
   }]
 
   def build_graph(parameters):
@@ -2325,15 +2464,23 @@
         dtype=parameters["input_dtype"],
         name="input",
         shape=parameters["input_shape"])
-    k = tf.constant(3, name="k")
+    if parameters["input_k"] is not None:
+      k = tf.placeholder(dtype=tf.int32, name="input_k", shape=[])
+    else:
+      k = tf.constant(3, name="k")
     out = tf.nn.top_k(input_value, k)
-    return [input_value], [out[1]]
+    return [input_value, k], [out[1]]
 
   def build_inputs(parameters, sess, inputs, outputs):
     input_value = create_tensor_data(parameters["input_dtype"],
                                      parameters["input_shape"])
-    return [input_value], sess.run(
-        outputs, feed_dict=dict(zip(inputs, [input_value])))
+    if parameters["input_k"] is not None:
+      k = np.array(parameters["input_k"], dtype=np.int32)
+      return [input_value, k], sess.run(
+          outputs, feed_dict=dict(zip(inputs, [input_value, k])))
+    else:
+      return [input_value], sess.run(
+          outputs, feed_dict=dict(zip(inputs, [input_value])))
 
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
diff --git a/tensorflow/contrib/lite/testing/generate_testspec.cc b/tensorflow/contrib/lite/testing/generate_testspec.cc
index f29c188..62cbecc 100644
--- a/tensorflow/contrib/lite/testing/generate_testspec.cc
+++ b/tensorflow/contrib/lite/testing/generate_testspec.cc
@@ -114,7 +114,13 @@
     // different set.
     std::vector<string> input_values =
         GenerateInputValues(input_layer, input_layer_type, input_layer_shape);
-    if (input_values.empty()) return false;
+    if (input_values.empty()) {
+      std::cerr << "Unable to generate input values for the TensorFlow model. "
+                   "Make sure the correct values are defined for "
+                   "input_layer, input_layer_type, and input_layer_shape."
+                << std::endl;
+      return false;
+    }
 
     // Run TensorFlow.
     for (int j = 0; j < input_values.size(); j++) {
diff --git a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
index e475f25..e67fee2 100644
--- a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
+++ b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
@@ -33,13 +33,18 @@
 
 namespace {
 bool FLAGS_ignore_known_bugs = true;
-// TODO(b/71769302) zip_files_dir should have a more accurate default, if
-// possible
-string* FLAGS_zip_file_path = new string("./");
+// As archive file names are test-specific, no default is possible.
+//
+// This test supports input as both zip and tar, as a stock android image does
+// not have unzip but does have tar.
+string* FLAGS_zip_file_path = new string;
+string* FLAGS_tar_file_path = new string;
 #ifndef __ANDROID__
 string* FLAGS_unzip_binary_path = new string("/usr/bin/unzip");
+string* FLAGS_tar_binary_path = new string("/bin/tar");
 #else
 string* FLAGS_unzip_binary_path = new string("/system/bin/unzip");
+string* FLAGS_tar_binary_path = new string("/system/bin/tar");
 #endif
 bool FLAGS_use_nnapi = false;
 bool FLAGS_ignore_unsupported_nnapi = false;
@@ -98,11 +103,11 @@
      "77546240"},
 };
 
-// Allows test data to be unzipped into a temporary directory and makes
+// Allows test data to be unarchived into a temporary directory and makes
 // sure those temporary directories are removed later.
-class ZipEnvironment : public ::testing::Environment {
+class ArchiveEnvironment : public ::testing::Environment {
  public:
-  ~ZipEnvironment() override {}
+  ~ArchiveEnvironment() override {}
 
   // Delete all temporary directories on teardown.
   void TearDown() override {
@@ -114,15 +119,26 @@
     temporary_directories_.clear();
   }
 
-  // Unzip `zip` file into a new temporary directory  `out_dir`.
-  tensorflow::Status UnZip(const string& zip, string* out_dir) {
+  // Unarchive the given `zip` or `tar` file into a new temporary directory `out_dir`.
+  tensorflow::Status UnArchive(const string& zip, const string& tar,
+                               string* out_dir) {
     string dir;
     TF_CHECK_OK(MakeTemporaryDirectory(&dir));
     tensorflow::SubProcess proc;
-    string unzip_binary = *FLAGS_unzip_binary_path;
-    TF_CHECK_OK(env->FileExists(unzip_binary));
-    TF_CHECK_OK(env->FileExists(zip));
-    proc.SetProgram(unzip_binary, {"unzip", "-d", dir, zip});
+    if (!zip.empty()) {
+      string unzip_binary = *FLAGS_unzip_binary_path;
+      TF_CHECK_OK(env->FileExists(unzip_binary));
+      TF_CHECK_OK(env->FileExists(zip));
+      proc.SetProgram(unzip_binary, {"unzip", "-d", dir, zip});
+    } else {
+      string tar_binary = *FLAGS_tar_binary_path;
+      TF_CHECK_OK(env->FileExists(tar_binary));
+      TF_CHECK_OK(env->FileExists(tar));
+      // 'o' needs to be explicitly set on Android so that
+      // untarring works as non-root (otherwise tries to chown
+      // files, which fails)
+      proc.SetProgram(tar_binary, {"tar", "xfo", tar, "-C", dir});
+    }
     proc.SetChannelAction(tensorflow::CHAN_STDOUT, tensorflow::ACTION_PIPE);
     proc.SetChannelAction(tensorflow::CHAN_STDERR, tensorflow::ACTION_PIPE);
     if (!proc.Start())
@@ -156,15 +172,15 @@
   std::vector<string> temporary_directories_;
 };
 
-// Return the singleton zip_environment.
-ZipEnvironment* zip_environment() {
-  static ZipEnvironment* env = new ZipEnvironment;
+// Return the singleton archive_environment.
+ArchiveEnvironment* archive_environment() {
+  static ArchiveEnvironment* env = new ArchiveEnvironment;
   return env;
 }
 
-// Read the manifest.txt out of the unarchived zip file. Specifically
+// Read the manifest.txt out of the extracted archive. Specifically
 // `original_file` is the original zip file for error messages. `dir` is
-// the temporary directory where the zip file has been unarchived and
+// the temporary directory where the archive file has been unarchived and
 // `test_paths` is the list of test prefixes that were in the manifest.
 // Note, it is an error for a manifest to contain no tests.
 tensorflow::Status ReadManifest(const string& original_file, const string& dir,
@@ -190,12 +206,22 @@
   return tensorflow::Status::OK();
 }
 
-// Get a list of tests from a zip file `zip_file_name`.
-std::vector<string> UnarchiveZipAndFindTestNames(const string& zip_file) {
+// Get a list of tests from either a zip or a tar file.
+std::vector<string> UnarchiveAndFindTestNames(const string& zip_file,
+                                              const string& tar_file) {
+  if (zip_file.empty() && tar_file.empty()) {
+    TF_CHECK_OK(tensorflow::Status(tensorflow::error::UNKNOWN,
+                                   "Neither zip_file nor tar_file was given"));
+  }
   string decompress_tmp_dir;
-  TF_CHECK_OK(zip_environment()->UnZip(zip_file, &decompress_tmp_dir));
+  TF_CHECK_OK(archive_environment()->UnArchive(zip_file, tar_file,
+                                               &decompress_tmp_dir));
   std::vector<string> stuff;
-  TF_CHECK_OK(ReadManifest(zip_file, decompress_tmp_dir, &stuff));
+  if (!zip_file.empty()) {
+    TF_CHECK_OK(ReadManifest(zip_file, decompress_tmp_dir, &stuff));
+  } else {
+    TF_CHECK_OK(ReadManifest(tar_file, decompress_tmp_dir, &stuff));
+  }
   return stuff;
 }
 
@@ -223,8 +249,7 @@
   string message = test_driver.GetErrorMessage();
   if (bug_number.empty()) {
     if (FLAGS_use_nnapi && FLAGS_ignore_unsupported_nnapi && !result) {
-      EXPECT_EQ(message, string("Failed to invoke NNAPI interpreter"))
-          << message;
+      EXPECT_EQ(message, string("Failed to invoke interpreter")) << message;
     } else {
       EXPECT_TRUE(result) << message;
     }
@@ -256,27 +281,34 @@
   }
 };
 
-INSTANTIATE_TEST_CASE_P(
-    tests, OpsTest,
-    ::testing::ValuesIn(UnarchiveZipAndFindTestNames(*FLAGS_zip_file_path)),
-    ZipPathParamName());
+INSTANTIATE_TEST_CASE_P(tests, OpsTest,
+                        ::testing::ValuesIn(UnarchiveAndFindTestNames(
+                            *FLAGS_zip_file_path, *FLAGS_tar_file_path)),
+                        ZipPathParamName());
 
 }  // namespace testing
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  ::testing::AddGlobalTestEnvironment(tflite::testing::zip_environment());
+  ::testing::AddGlobalTestEnvironment(tflite::testing::archive_environment());
 
   std::vector<tensorflow::Flag> flags = {
       tensorflow::Flag(
           "ignore_known_bugs", &tflite::testing::FLAGS_ignore_known_bugs,
           "If a particular model is affected by a known bug, the "
           "corresponding test should expect the outputs to not match."),
-      tensorflow::Flag("zip_file_path", tflite::testing::FLAGS_zip_file_path,
-                       "Required: Location of the test zip file."),
+      tensorflow::Flag(
+          "tar_file_path", tflite::testing::FLAGS_tar_file_path,
+          "Required (or zip_file_path): Location of the test tar file."),
+      tensorflow::Flag(
+          "zip_file_path", tflite::testing::FLAGS_zip_file_path,
+          "Required (or tar_file_path): Location of the test zip file."),
       tensorflow::Flag("unzip_binary_path",
                        tflite::testing::FLAGS_unzip_binary_path,
-                       "Required: Location of a suitable unzip binary."),
+                       "Location of a suitable unzip binary."),
+      tensorflow::Flag("tar_binary_path",
+                       tflite::testing::FLAGS_tar_binary_path,
+                       "Location of a suitable tar binary."),
       tensorflow::Flag("use_nnapi", &tflite::testing::FLAGS_use_nnapi,
                        "Whether to enable the NNAPI delegate"),
       tensorflow::Flag("ignore_unsupported_nnapi",
diff --git a/tensorflow/contrib/lite/testing/tf_driver.cc b/tensorflow/contrib/lite/testing/tf_driver.cc
index ec435ca..30381ba 100644
--- a/tensorflow/contrib/lite/testing/tf_driver.cc
+++ b/tensorflow/contrib/lite/testing/tf_driver.cc
@@ -179,7 +179,9 @@
   auto status = session_->Run({input_tensors_.begin(), input_tensors_.end()},
                               output_names_, {}, &output_tensors_);
   if (!status.ok()) {
-    Invalidate("Failed to run input data on graph");
+    Invalidate(
+        "Failed to run input data on graph. Make sure the correct value is "
+        "defined for the input and output arrays.");
   }
 }
 
diff --git a/tensorflow/contrib/lite/testing/tflite_diff_flags.h b/tensorflow/contrib/lite/testing/tflite_diff_flags.h
index 695c2a3..3874bc3 100644
--- a/tensorflow/contrib/lite/testing/tflite_diff_flags.h
+++ b/tensorflow/contrib/lite/testing/tflite_diff_flags.h
@@ -33,6 +33,7 @@
     string input_layer_shape;
     string output_layer;
     int32_t num_runs_per_pass = 100;
+    string delegate;
   } values;
 
   std::vector<tensorflow::Flag> flags = {
@@ -42,18 +43,21 @@
                        "Path of tensorflow lite model."),
       tensorflow::Flag("input_layer", &values.input_layer,
                        "Names of input tensors, separated by comma. Example: "
-                       "input_1,input_2"),
+                       "input_1,input_2."),
       tensorflow::Flag("input_layer_type", &values.input_layer_type,
                        "Data types of input tensors, separated by comma. "
-                       "Example: float,int"),
+                       "Example: float,int."),
       tensorflow::Flag(
           "input_layer_shape", &values.input_layer_shape,
-          "Shapes of input tensors, separated by colon. Example: 1,3,4,1:2"),
+          "Shapes of input tensors, separated by colon. Example: 1,3,4,1:2."),
       tensorflow::Flag("output_layer", &values.output_layer,
-                       "Names of output tensors, separated by comma. Example "
-                       "output_1,output_2"),
+                       "Names of output tensors, separated by comma. Example: "
+                       "output_1,output_2."),
       tensorflow::Flag("num_runs_per_pass", &values.num_runs_per_pass,
-                       "Number of full runs in each pass."),
+                       "[optional] Number of full runs in each pass."),
+      tensorflow::Flag("delegate", &values.delegate,
+                       "[optional] Delegate to use for executing ops. Must be "
+                       "`{\"\", EAGER}`"),
   };
 
   bool no_inputs = *argc == 1;
@@ -61,6 +65,14 @@
   if (!success || no_inputs || (*argc == 2 && !strcmp(argv[1], "--helpfull"))) {
     fprintf(stderr, "%s", tensorflow::Flags::Usage(argv[0], flags).c_str());
     return {};
+  } else if (values.tensorflow_model.empty() || values.tflite_model.empty() ||
+             values.input_layer.empty() || values.input_layer_type.empty() ||
+             values.input_layer_shape.empty() || values.output_layer.empty()) {
+    fprintf(stderr, "%s", tensorflow::Flags::Usage(argv[0], flags).c_str());
+    return {};
+  } else if (!(values.delegate == "" || values.delegate == "EAGER")) {
+    fprintf(stderr, "%s", tensorflow::Flags::Usage(argv[0], flags).c_str());
+    return {};
   }
 
   return {values.tensorflow_model,
@@ -69,7 +81,8 @@
           Split<string>(values.input_layer_type, ","),
           Split<string>(values.input_layer_shape, ":"),
           Split<string>(values.output_layer, ","),
-          values.num_runs_per_pass};
+          values.num_runs_per_pass,
+          values.delegate};
 }
 
 }  // namespace testing
diff --git a/tensorflow/contrib/lite/testing/tflite_diff_util.cc b/tensorflow/contrib/lite/testing/tflite_diff_util.cc
index 19f34c0..c6ca796 100644
--- a/tensorflow/contrib/lite/testing/tflite_diff_util.cc
+++ b/tensorflow/contrib/lite/testing/tflite_diff_util.cc
@@ -33,7 +33,7 @@
           options.input_layer_shape, options.output_layer)) {
     return false;
   }
-  TfLiteDriver tflite_driver(/*use_nnapi=*/true);
+  TfLiteDriver tflite_driver(/*use_nnapi=*/true, options.delegate);
   tflite_driver.LoadModel(options.tflite_model);
   return tflite::testing::ParseAndRunTests(&tflite_stream, &tflite_driver);
 }
diff --git a/tensorflow/contrib/lite/testing/tflite_diff_util.h b/tensorflow/contrib/lite/testing/tflite_diff_util.h
index 4ab2f23..f679921 100644
--- a/tensorflow/contrib/lite/testing/tflite_diff_util.h
+++ b/tensorflow/contrib/lite/testing/tflite_diff_util.h
@@ -44,6 +44,9 @@
   // each of the passes. The first pass has a single inference, while the
   // second pass does multiple inferences back to back.
   int num_runs_per_pass;
+  // Name of the delegate to use for executing ops. Must be one of
+  // `{"", EAGER}`.
+  string delegate;
 };
 
 // Run a single TensorFLow Lite diff test with a given options.
diff --git a/tensorflow/contrib/lite/testing/tflite_driver.cc b/tensorflow/contrib/lite/testing/tflite_driver.cc
index 4d08fb5..4dacf9c 100644
--- a/tensorflow/contrib/lite/testing/tflite_driver.cc
+++ b/tensorflow/contrib/lite/testing/tflite_driver.cc
@@ -17,6 +17,7 @@
 #include <iostream>
 
 #include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/delegates/eager/delegate.h"
 #include "tensorflow/contrib/lite/testing/split.h"
 
 namespace tflite {
@@ -135,7 +136,13 @@
   size_t num_elements_;
 };
 
-TfLiteDriver::TfLiteDriver(bool use_nnapi) : use_nnapi_(use_nnapi) {}
+TfLiteDriver::TfLiteDriver(bool use_nnapi, const string& delegate_name)
+    : use_nnapi_(use_nnapi) {
+  if (delegate_name == "EAGER") {
+    delegate_ = EagerDelegate::Create();
+  }
+}
+
 TfLiteDriver::~TfLiteDriver() {}
 
 void TfLiteDriver::AllocateTensors() {
@@ -165,6 +172,15 @@
   }
   interpreter_->UseNNAPI(use_nnapi_);
 
+  if (delegate_) {
+    if (interpreter_->ModifyGraphWithDelegate(delegate_.get(),
+                                              /*allow_dynamic_tensors=*/true) !=
+        kTfLiteOk) {
+      Invalidate("Unable to build the graph using the delegate");
+      return;
+    }
+  }
+
   must_allocate_tensors_ = true;
 }
 
diff --git a/tensorflow/contrib/lite/testing/tflite_driver.h b/tensorflow/contrib/lite/testing/tflite_driver.h
index 5493ba3..aed35f8 100644
--- a/tensorflow/contrib/lite/testing/tflite_driver.h
+++ b/tensorflow/contrib/lite/testing/tflite_driver.h
@@ -17,6 +17,7 @@
 
 #include <map>
 
+#include "tensorflow/contrib/lite/delegates/eager/delegate.h"
 #include "tensorflow/contrib/lite/interpreter.h"
 #include "tensorflow/contrib/lite/kernels/register.h"
 #include "tensorflow/contrib/lite/model.h"
@@ -28,7 +29,7 @@
 // A test runner that feeds inputs into TF Lite and verifies its outputs.
 class TfLiteDriver : public TestRunner {
  public:
-  explicit TfLiteDriver(bool use_nnapi);
+  explicit TfLiteDriver(bool use_nnapi, const string& delegate = "");
   ~TfLiteDriver() override;
 
   void LoadModel(const string& bin_file_path) override;
@@ -52,6 +53,7 @@
 
   class Expectation;
 
+  std::unique_ptr<EagerDelegate> delegate_;
   bool use_nnapi_ = false;
   std::unique_ptr<FlatBufferModel> model_;
   std::unique_ptr<Interpreter> interpreter_;
diff --git a/tensorflow/contrib/lite/toco/BUILD b/tensorflow/contrib/lite/toco/BUILD
index c880797..02d0890 100644
--- a/tensorflow/contrib/lite/toco/BUILD
+++ b/tensorflow/contrib/lite/toco/BUILD
@@ -11,6 +11,7 @@
     "//tensorflow:tensorflow.bzl",
     "tf_cc_binary",
     "tf_cc_test",
+    "tf_copts",
 )
 
 tf_proto_library_cc(
@@ -241,9 +242,11 @@
         "graph_transformations/resolve_constant_random_uniform.cc",
         "graph_transformations/resolve_constant_range.cc",
         "graph_transformations/resolve_constant_reshape.cc",
+        "graph_transformations/resolve_constant_select.cc",
         "graph_transformations/resolve_constant_shape_or_rank.cc",
         "graph_transformations/resolve_constant_slice.cc",
         "graph_transformations/resolve_constant_strided_slice.cc",
+        "graph_transformations/resolve_constant_tile.cc",
         "graph_transformations/resolve_constant_transpose.cc",
         "graph_transformations/resolve_constant_unary.cc",
         "graph_transformations/resolve_fake_quant_args_from_vars.cc",
@@ -305,7 +308,7 @@
         "tensorflow_util.h",
         "toco_tooling.h",
     ],
-    copts = select({
+    copts = tf_copts() + select({
         "//tensorflow:darwin": ["-DTOCO_SUPPORT_PORTABLE_PROTOS=0"],
         "//conditions:default": [],
     }),
@@ -360,6 +363,7 @@
         "dump_graphviz.h",
         "tooling_util.h",
     ],
+    copts = tf_copts(),
     visibility = ["//visibility:public"],
     deps = [
         ":model",
diff --git a/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc b/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc
index 1f3ea2e..18c904c 100644
--- a/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc
+++ b/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc
@@ -106,6 +106,17 @@
 
   // Core allocation routine.
   void Allocate(std::size_t size, Alloc* result) {
+    if (size == 0) {
+      // zero-sized arrays get a dummy alloc of (0, 0) that does not
+      // need to be kept in the books (no need to insert that into
+      // live_allocs_).
+      // Note: zero-sized arrays shouldn't exist, but handling that case
+      // here allows such pathological cases to get a cleaner error message
+      // later instead of generating spurious allocator failures.
+      result->start = 0;
+      result->end = 0;
+      return;
+    }
     // Naive algorithm: pick the first gap between live allocations,
     // that is wide enough for the new array.
     std::size_t pos = 0;
@@ -128,6 +139,11 @@
   }
 
   void Deallocate(const Alloc& a) {
+    // Special-case dummy allocs for zero-sized arrays.
+    if (a.start == 0 && a.end == 0) {
+      // Nothing needs to be done, these aren't kept in the books.
+      return;
+    }
     auto iter = std::lower_bound(live_allocs_.begin(), live_allocs_.end(), a);
     CHECK(iter != live_allocs_.end());
     CHECK(*iter == a);
diff --git a/tensorflow/contrib/lite/toco/dump_graphviz.cc b/tensorflow/contrib/lite/toco/dump_graphviz.cc
index 6877fb2..30525ef 100644
--- a/tensorflow/contrib/lite/toco/dump_graphviz.cc
+++ b/tensorflow/contrib/lite/toco/dump_graphviz.cc
@@ -167,7 +167,7 @@
     node_properties.label += "]";
 
     int buffer_size = 0;
-    if (IsValid(array.shape())) {
+    if (IsNonEmpty(array.shape())) {
       buffer_size = RequiredBufferSizeForShape(array.shape());
       node_properties.log2_buffer_size =
           std::log2(static_cast<float>(buffer_size));
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/convert_pure_conv_to_depthwise.cc b/tensorflow/contrib/lite/toco/graph_transformations/convert_pure_conv_to_depthwise.cc
index 1ea83ab..e88839b 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/convert_pure_conv_to_depthwise.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/convert_pure_conv_to_depthwise.cc
@@ -48,7 +48,17 @@
     // dimension.
     return false;
   }
-  auto& weights_array = model->GetArray(conv_op->inputs[1]);
+
+  const auto& weights_name = conv_op->inputs[1];
+  if (CountOpsWithInput(*model, weights_name) > 1) {
+    // TODO(yunluli): Come up with a way to do the weights shuffling only once.
+    AddMessageF(
+        "Not changing %s to DepthwiseConv because the weights are consumed by "
+        "another op.",
+        LogName(*conv_op));
+    return false;
+  }
+  auto& weights_array = model->GetArray(weights_name);
   if (!weights_array.buffer) {
     // Yield until the weights are resolved as a constant array.
     return false;
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/fuse_binary_into_preceding_affine.cc b/tensorflow/contrib/lite/toco/graph_transformations/fuse_binary_into_preceding_affine.cc
index 76c6be0..b324631 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/fuse_binary_into_preceding_affine.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/fuse_binary_into_preceding_affine.cc
@@ -274,8 +274,14 @@
     return false;
   }
 
-  const auto& weights = model->GetArray(preceding_op->inputs[1]);
-  const auto& bias = model->GetArray(preceding_op->inputs[2]);
+  const auto& weights_name = preceding_op->inputs[1];
+  const auto& bias_name = preceding_op->inputs[2];
+  const auto& weights = model->GetArray(weights_name);
+  const auto& bias = model->GetArray(bias_name);
+  const int count_ops_consuming_bias = CountOpsWithInput(*model, bias_name);
+  const int count_ops_consuming_weights =
+      CountOpsWithInput(*model, weights_name);
+
   if (binary_op->type == OperatorType::kAdd ||
       binary_op->type == OperatorType::kSub) {
     if (!bias.buffer) {
@@ -285,6 +291,13 @@
           LogName(*binary_op), LogName(*preceding_op));
       return false;
     }
+    if (count_ops_consuming_bias > 1) {
+      AddMessageF(
+          "Not fusing %s because the bias of the preceding %s is consumed by "
+          "another op",
+          LogName(*binary_op), LogName(*preceding_op));
+      return false;
+    }
   } else {
     if (!weights.buffer || !bias.buffer) {
       AddMessageF(
@@ -293,6 +306,13 @@
           LogName(*binary_op), LogName(*preceding_op));
       return false;
     }
+    if (count_ops_consuming_weights > 1 || count_ops_consuming_bias > 1) {
+      AddMessageF(
+          "Not fusing %s because the weights or bias of the preceding %s is "
+          "consumed by another op",
+          LogName(*binary_op), LogName(*preceding_op));
+      return false;
+    }
   }
 
   int count_ops_consuming_output =
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
index 8d9a4c4..99f4a7d 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
+++ b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
@@ -190,6 +190,8 @@
 DECLARE_GRAPH_TRANSFORMATION(ResolveConstantStridedSlice)
 DECLARE_GRAPH_TRANSFORMATION(ResolveConstantFill)
 DECLARE_GRAPH_TRANSFORMATION(ResolveConstantGather)
+DECLARE_GRAPH_TRANSFORMATION(ResolveConstantSelect)
+DECLARE_GRAPH_TRANSFORMATION(ResolveConstantTile)
 DECLARE_GRAPH_TRANSFORMATION(ResolveMultiplyByZero)
 DECLARE_GRAPH_TRANSFORMATION(Dequantize)
 DECLARE_GRAPH_TRANSFORMATION(UnpartitionEmbeddingLookup)
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc b/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc
index 527013b..502de88 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc
@@ -274,6 +274,19 @@
   return changed;
 }
 
+bool HardcodeMinMaxForReshape(Model* model, Operator* op) {
+  Array& input = model->GetArray(op->inputs[0]);
+  Array& output = model->GetArray(op->outputs[0]);
+
+  // If input and output both exist or do not exist, do nothing.
+  if ((!input.minmax && !output.minmax) || (input.minmax && output.minmax)) {
+    return false;
+  }
+
+  // Otherwise propagate info amongst the input and output array.
+  return PropagateMinMaxAmongArrays(model, {op->inputs[0], op->outputs[0]});
+}
+
 bool HardcodeMinMaxForLstmCell(Model* model, Operator* op) {
   CHECK_EQ(op->inputs.size(), LstmCellOperator::NUM_INPUTS);
   CHECK_EQ(op->outputs.size(), LstmCellOperator::NUM_OUTPUTS);
@@ -370,7 +383,7 @@
     case OperatorType::kSlice:
     case OperatorType::kStridedSlice:
     case OperatorType::kSqueeze:
-    case OperatorType::kReshape:
+    case OperatorType::kExpandDims:
     case OperatorType::kPad:
     case OperatorType::kGather:
     case OperatorType::kTranspose:
@@ -415,6 +428,10 @@
       changed = HardcodeMinMaxForLstmCell(model, op);
       break;
 
+    case OperatorType::kReshape:
+      changed = HardcodeMinMaxForReshape(model, op);
+      break;
+
     default:
       break;
   }
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
index 3c9379f..91e2904 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
@@ -1082,27 +1082,23 @@
   }
 
   // Yield until input dims have been resolved.
-  if (!input_values.has_shape()) {
+  if (!input_values.has_shape() || !input_k.has_shape()) {
     return;
   }
 
-  const auto& input_values_shape = input_values.shape();
-  auto output_indexes_dims = output_indexes.mutable_shape()->mutable_dims();
-  auto output_values_dims = output_values.mutable_shape()->mutable_dims();
-  for (int dim = 0; dim < input_values_shape.dimensions_count() - 1; dim++) {
-    output_indexes_dims->push_back(input_values_shape.dims(dim));
-    output_values_dims->push_back(input_values_shape.dims(dim));
-  }
   // If the value is initialized, we can specify the last dimension, otherwise
   // unknown.
   if (input_k.buffer) {
+    const auto& input_values_shape = input_values.shape();
+    auto output_indexes_dims = output_indexes.mutable_shape()->mutable_dims();
+    auto output_values_dims = output_values.mutable_shape()->mutable_dims();
+    for (int dim = 0; dim < input_values_shape.dimensions_count() - 1; dim++) {
+      output_indexes_dims->push_back(input_values_shape.dims(dim));
+      output_values_dims->push_back(input_values_shape.dims(dim));
+    }
     const int32_t k_value = input_k.GetBuffer<ArrayDataType::kInt32>().data[0];
     output_indexes_dims->push_back(k_value);
     output_values_dims->push_back(k_value);
-
-  } else {
-    output_indexes_dims->push_back(0);
-    output_values_dims->push_back(0);
   }
 }
 
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc b/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
index b5a6554..8d22ae2 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
@@ -62,7 +62,7 @@
          type == OperatorType::kLessEqual || type == OperatorType::kSelect ||
          type == OperatorType::kArgMax || type == OperatorType::kRelu ||
          type == OperatorType::kRelu1 || type == OperatorType::kRelu6 ||
-         type == OperatorType::kShape;
+         type == OperatorType::kShape || type == OperatorType::kExpandDims;
 }
 
 // The quantized op allows output arrays of type float using
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.cc b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.cc
index 9f5d8b9..fc49fbd 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.cc
@@ -48,20 +48,26 @@
 }  // namespace
 
 bool RemoveTrivialPassthroughOp(GraphTransformation* transformation,
-                                Model* model, std::size_t op_index) {
+                                Model* model, std::size_t op_index,
+                                int input_index) {
   const auto passthru_it = model->operators.begin() + op_index;
   auto* passthru_op = passthru_it->get();
   CHECK_EQ(passthru_op->outputs.size(), 1);
   CHECK_GE(passthru_op->inputs.size(), 1);
-  int count_nonconstant_input_arrays = 0;
-  // We call 'main input' the unique nonconstant input array if there is one,
-  // or else the 0-th input.
+
   int main_input_array_index = 0;
-  for (int i = 0; i < passthru_op->inputs.size(); i++) {
-    if (!model->GetArray(passthru_op->inputs[i]).buffer) {
-      count_nonconstant_input_arrays++;
-      if (count_nonconstant_input_arrays == 1) {
-        main_input_array_index = i;
+  if (input_index != -1) {
+    main_input_array_index = input_index;
+  } else {
+    // We call 'main input' the unique nonconstant input array if there is one,
+    // or else the 0-th input.
+    int count_nonconstant_input_arrays = 0;
+    for (int i = 0; i < passthru_op->inputs.size(); i++) {
+      if (!model->GetArray(passthru_op->inputs[i]).buffer) {
+        count_nonconstant_input_arrays++;
+        if (count_nonconstant_input_arrays == 1) {
+          main_input_array_index = i;
+        }
       }
     }
   }
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h
index 9d448c3..663704e 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h
+++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h
@@ -50,7 +50,8 @@
 // and then discards it and returns true, or, if it's not trivial (if neither
 // the input nor the output may be discarded), returns false.
 bool RemoveTrivialPassthroughOp(GraphTransformation* transformation,
-                                Model* model, std::size_t op_index);
+                                Model* model, std::size_t op_index,
+                                int input_index = -1);
 
 }  // namespace toco
 
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fake_quant.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fake_quant.cc
index d395d7a..f5f2f77 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fake_quant.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fake_quant.cc
@@ -117,6 +117,7 @@
                                 &quantized_max);
   if (fakequant_op->narrow_range) {
     quantized_min++;
+    output_array.narrow_range = true;
   }
 
   // It is important for matching accuracy between TF training and TFLite
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_reshape.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_reshape.cc
index 41562ab..a6f665b 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_reshape.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_reshape.cc
@@ -100,13 +100,7 @@
 
   AddMessageF("Resolving constant reshape of %s", LogName(*op));
 
-  if (input_array.minmax) {
-    output_array.GetOrCreateMinMax() = input_array.GetMinMax();
-  }
-  if (input_array.quantization_params) {
-    output_array.GetOrCreateQuantizationParams() =
-        input_array.GetQuantizationParams();
-  }
+  CopyMinMaxAndQuantizationRelatedFields(input_array, &output_array);
 
   // Erase input arrays if no longer used.
   for (const auto& input : op->inputs) {
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_select.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_select.cc
new file mode 100644
index 0000000..e880a3f
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_select.cc
@@ -0,0 +1,78 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+// Resolves a constant Select operation.
+//
+// This implementation is looking strictly for all-or-nothing on the select
+// condition. It's possible to enhance this by looking per-element and possibly
+// producing a Mul op.
+bool ResolveConstantSelect::Run(Model* model, std::size_t op_index) {
+  auto it = model->operators.begin() + op_index;
+  const auto* base_op = it->get();
+  if (base_op->type != OperatorType::kSelect) {
+    return false;
+  }
+  const auto* op = static_cast<const SelectOperator*>(base_op);
+
+  CHECK_GE(op->inputs.size(), 3);
+  CHECK_EQ(op->outputs.size(), 1);
+  auto& output_array = model->GetArray(op->outputs[0]);
+  if (output_array.data_type == ArrayDataType::kNone) {
+    // Yield until the output type has been set by PropagateArrayDataTypes.
+    return false;
+  }
+  if (!output_array.has_shape()) {
+    // Yield until the output shape has been set by PropagateFixedShapes.
+    return false;
+  }
+
+  // We require the cond input to be constant.
+  if (!IsConstantParameterArray(*model, op->inputs[0])) {
+    return false;
+  }
+  const Array& cond_array = model->GetArray(op->inputs[0]);
+  CHECK(cond_array.data_type == ArrayDataType::kBool)
+      << "Only bool conditions are supported";
+  const auto& cond_data = cond_array.GetBuffer<ArrayDataType::kBool>().data;
+  if (cond_data.empty()) {
+    return false;
+  }
+
+  // Check if the condition is the same for all elements.
+  bool cond_value = cond_data[0];
+  for (size_t i = 1; i < cond_data.size(); ++i) {
+    if (cond_data[i] != cond_value) {
+      AddMessageF(
+          "Cannot resolve %s as constant; cond_array has differing "
+          "per-element values",
+          LogName(*op));
+      return false;
+    }
+  }
+
+  // Pass-through the selected input.
+  return RemoveTrivialPassthroughOp(this, model, op_index, cond_value ? 1 : 2);
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_tile.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_tile.cc
new file mode 100644
index 0000000..5cfa1a5
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_tile.cc
@@ -0,0 +1,165 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+namespace {
+
+// NOTE: the Tile implementation here is taken from tflite's Tile kernel.
+
+template <typename T>
+void CopyMultipleTimes(const T* in_data, int32_t in_size, int32_t multiplier,
+                       T* out_data) {
+  for (int i = 0; i < multiplier; ++i) {
+    const T* in_end = in_data + in_size;
+    T* new_out_data = std::copy(in_data, in_end, out_data);
+    in_data = out_data;
+    out_data = new_out_data;
+  }
+}
+
+template <typename T, typename M>
+std::pair<int, int> TileOneDimension(const Shape& in_dimensions,
+                                     const T* in_data, const M* multipliers,
+                                     T* out_data, int dimension) {
+  const int dimension_size = in_dimensions.dims(dimension);
+  if (dimension == in_dimensions.dimensions_count() - 1) {
+    CopyMultipleTimes(in_data, dimension_size, multipliers[dimension],
+                      out_data);
+    return std::make_pair(
+        dimension_size,
+        dimension_size * static_cast<int>(multipliers[dimension]));
+  }
+  int total_stride_size = 0, total_tiled_stride_size = 0;
+  const T* copy_from_data = in_data;
+  T* copy_to_data = out_data;
+  for (int i = 0; i < dimension_size; ++i) {
+    int stride_size = 0, tiled_stride_size = 0;
+    std::tie(stride_size, tiled_stride_size) =
+        TileOneDimension(in_dimensions, copy_from_data, multipliers,
+                         copy_to_data, dimension + 1);
+    copy_from_data += stride_size;
+    copy_to_data += tiled_stride_size;
+    total_stride_size += stride_size;
+    total_tiled_stride_size += tiled_stride_size;
+  }
+  CopyMultipleTimes(out_data, total_tiled_stride_size,
+                    multipliers[dimension] - 1,
+                    out_data + total_tiled_stride_size);
+  return std::make_pair(total_stride_size,
+                        total_tiled_stride_size * multipliers[dimension]);
+}
+
+template <ArrayDataType Type>
+inline void Tile(const Array& input_array, const Array& multiples_array,
+                 Array* output_array) {
+  // Allocate output storage.
+  auto& output_data = output_array->GetMutableBuffer<Type>().data;
+  output_data.resize(RequiredBufferSizeForShape(output_array->shape()));
+
+  switch (multiples_array.data_type) {
+    case ArrayDataType::kInt32:
+      TileOneDimension(
+          input_array.shape(), input_array.GetBuffer<Type>().data.data(),
+          multiples_array.GetBuffer<ArrayDataType::kInt32>().data.data(),
+          output_array->GetMutableBuffer<Type>().data.data(), 0);
+      break;
+    case ArrayDataType::kInt64:
+      TileOneDimension(
+          input_array.shape(), input_array.GetBuffer<Type>().data.data(),
+          multiples_array.GetBuffer<ArrayDataType::kInt64>().data.data(),
+          output_array->GetMutableBuffer<Type>().data.data(), 0);
+      break;
+    default:
+      CHECK(false);
+      break;
+  }
+}
+
+}  // namespace
+
+// Resolves a constant Tile operation.
+bool ResolveConstantTile::Run(Model* model, std::size_t op_index) {
+  auto it = model->operators.begin() + op_index;
+  const auto* base_op = it->get();
+  if (base_op->type != OperatorType::kTile) {
+    return false;
+  }
+  const auto* op = static_cast<const TensorFlowTileOperator*>(base_op);
+
+  CHECK_GE(op->inputs.size(), 2);
+  CHECK_EQ(op->outputs.size(), 1);
+  auto& output_array = model->GetArray(op->outputs[0]);
+  if (output_array.data_type == ArrayDataType::kNone) {
+    // Yield until the output type has been set by PropagateArrayDataTypes.
+    return false;
+  }
+  if (!output_array.has_shape()) {
+    // Yield until the output shape has been set by PropagateFixedShapes.
+    return false;
+  }
+
+  // We require constant inputs.
+  if (!IsConstantParameterArray(*model, op->inputs[0]) ||
+      !IsConstantParameterArray(*model, op->inputs[1])) {
+    return false;
+  }
+  const Array& input_array = model->GetArray(op->inputs[0]);
+  const Array& multiples_array = model->GetArray(op->inputs[1]);
+  CHECK(multiples_array.data_type == ArrayDataType::kInt32 ||
+        multiples_array.data_type == ArrayDataType::kInt64)
+      << "Only int32/int64 indices are supported";
+
+  CopyMinMaxAndQuantizationRelatedFields(input_array, &output_array);
+
+  CHECK(!output_array.buffer);
+  switch (output_array.data_type) {
+    case ArrayDataType::kFloat:
+      Tile<ArrayDataType::kFloat>(input_array, multiples_array, &output_array);
+      break;
+    case ArrayDataType::kUint8:
+      Tile<ArrayDataType::kUint8>(input_array, multiples_array, &output_array);
+      break;
+    case ArrayDataType::kInt16:
+      Tile<ArrayDataType::kInt16>(input_array, multiples_array, &output_array);
+      break;
+    case ArrayDataType::kInt32:
+      Tile<ArrayDataType::kInt32>(input_array, multiples_array, &output_array);
+      break;
+    case ArrayDataType::kInt64:
+      Tile<ArrayDataType::kInt64>(input_array, multiples_array, &output_array);
+      break;
+    default:
+      LOG(FATAL) << "Unsupported data type given to Tile op with output \""
+                 << op->outputs[0] << "\"";
+      break;
+  }
+
+  // Erase input arrays if no longer used after we remove the op.
+  DeleteArrayIfUsedOnce(op->inputs[0], model);
+  DeleteArrayIfUsedOnce(op->inputs[1], model);
+
+  // Erase the operator.
+  model->operators.erase(it);
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_transpose.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_transpose.cc
index 1fd2031..fe15dfa 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_transpose.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_transpose.cc
@@ -128,13 +128,7 @@
   }
   const Array& input_array = model->GetArray(op->inputs[0]);
 
-  if (input_array.minmax) {
-    output_array.GetOrCreateMinMax() = input_array.GetMinMax();
-  }
-  if (input_array.quantization_params) {
-    output_array.GetOrCreateQuantizationParams() =
-        input_array.GetQuantizationParams();
-  }
+  CopyMinMaxAndQuantizationRelatedFields(input_array, &output_array);
 
   if (op->perm.empty()) {
     // Yield until perm has been populated by ResolveTransposeAttributes.
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc
index fe3882c..475415e 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc
@@ -246,8 +246,8 @@
       }
       output_float_data[i] = outval;
     }
-  } else if (unary_op->type == OperatorType::kRelu6 &&
-             unary_op->type == OperatorType::kRelu1 &&
+  } else if (unary_op->type == OperatorType::kRelu6 ||
+             unary_op->type == OperatorType::kRelu1 ||
              unary_op->type == OperatorType::kRelu) {
     for (size_t i = 0; i < output_buffer_size; ++i) {
       const float value = (*input_float_data)[i];
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_switch.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_switch.cc
index da8e7a2..8bef440 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_switch.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_switch.cc
@@ -92,7 +92,9 @@
       if (*input_it == switch_op->outputs[nonselected_output_index]) {
         // Let us guard our assumption that only Merge nodes consume the outputs
         // of Switch nodes:
-        CHECK(other_op->type == OperatorType::kMerge);
+        CHECK(other_op->type == OperatorType::kMerge)
+            << "Found " << HelpfulOperatorTypeName(*other_op)
+            << " as non-selected output from Switch, but only Merge supported.";
         input_it = other_op->inputs.erase(input_it);
       } else {
         ++input_it;
diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.cc b/tensorflow/contrib/lite/toco/import_tensorflow.cc
index d8d331f..b7fffbc 100644
--- a/tensorflow/contrib/lite/toco/import_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/import_tensorflow.cc
@@ -1223,11 +1223,10 @@
   return tensorflow::Status::OK();
 }
 
-template <typename Op, const char* op_name>
+template <typename Op>
 tensorflow::Status ConvertArgMinMaxOperator(
     const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
     Model* model) {
-  CHECK_EQ(node.op(), op_name);
   TF_QCHECK_OK(CheckInputsCount(node, tf_import_flags, 2));
   const auto axis_data_type =
       HasAttr(node, "Tidx") ? GetDataTypeAttr(node, "Tidx") : DT_INT32;
@@ -1245,6 +1244,20 @@
   return tensorflow::Status::OK();
 }
 
+tensorflow::Status ConvertArgMaxOperator(
+    const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
+    Model* model) {
+  CHECK_EQ(node.op(), "ArgMax");
+  return ConvertArgMinMaxOperator<ArgMaxOperator>(node, tf_import_flags, model);
+}
+
+tensorflow::Status ConvertArgMinOperator(
+    const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
+    Model* model) {
+  CHECK_EQ(node.op(), "ArgMin");
+  return ConvertArgMinMaxOperator<ArgMinOperator>(node, tf_import_flags, model);
+}
+
 tensorflow::Status ConvertResizeBilinearOperator(
     const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
     Model* model) {
@@ -1899,17 +1912,14 @@
     Model* model);
 using ConverterMapType = std::unordered_map<std::string, ConverterType>;
 
-constexpr char kArgMax[] = "ArgMax";
-constexpr char kArgMin[] = "ArgMin";
-
 ConverterMapType GetTensorFlowNodeConverterMap() {
   return std::unordered_map<std::string, ConverterType>({
       {"Add", ConvertSimpleOperator<AddOperator, 2>},
       {"AddN", ConvertSimpleOperator<AddNOperator>},
       {"All", ConvertSimpleOperator<TensorFlowAllOperator>},
       {"Any", ConvertAnyOperator},
-      {"ArgMax", ConvertArgMinMaxOperator<ArgMaxOperator, kArgMax>},
-      {"ArgMin", ConvertArgMinMaxOperator<ArgMinOperator, kArgMin>},
+      {"ArgMax", ConvertArgMaxOperator},
+      {"ArgMin", ConvertArgMinOperator},
       {"Assert", ConvertSimpleOperator<TensorFlowAssertOperator>},
       {"AvgPool", ConvertAvgPoolOperator},
       {"BatchMatMul", ConvertBatchMatMulOperator},
diff --git a/tensorflow/contrib/lite/toco/model.h b/tensorflow/contrib/lite/toco/model.h
index 18c78e3..412e14c 100644
--- a/tensorflow/contrib/lite/toco/model.h
+++ b/tensorflow/contrib/lite/toco/model.h
@@ -2071,7 +2071,7 @@
   std::size_t transient_data_size = 0;
   // For code-generation only: required alignment of the transient_data buffer
   std::size_t transient_data_alignment = 0;
-  // Arithmatic operations performed in the model.
+  // Arithmetic operations performed in the model.
   int64 ops_count = 0;
 
  private:
diff --git a/tensorflow/contrib/lite/toco/python/toco_python_api.h b/tensorflow/contrib/lite/toco/python/toco_python_api.h
index 7e8ad9c..ee054bb 100644
--- a/tensorflow/contrib/lite/toco/python/toco_python_api.h
+++ b/tensorflow/contrib/lite/toco/python/toco_python_api.h
@@ -12,8 +12,8 @@
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef _THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_PYTHON_TOCO_PYTHON_API_H_
-#define _THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_PYTHON_TOCO_PYTHON_API_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_PYTHON_TOCO_PYTHON_API_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_PYTHON_TOCO_PYTHON_API_H_
 
 #include <Python.h>
 #include <string>
@@ -33,4 +33,4 @@
 
 }  // namespace toco
 
-#endif  // _THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_PYTHON_TOCO_PYTHON_API_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_PYTHON_TOCO_PYTHON_API_H_
diff --git a/tensorflow/contrib/lite/toco/tflite/BUILD b/tensorflow/contrib/lite/toco/tflite/BUILD
index 83e977d..709c536 100644
--- a/tensorflow/contrib/lite/toco/tflite/BUILD
+++ b/tensorflow/contrib/lite/toco/tflite/BUILD
@@ -27,6 +27,7 @@
         "//tensorflow/contrib/lite/toco:graph_transformations",
         "//tensorflow/contrib/lite/toco:model",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:ptr_util",
         "@com_google_absl//absl/memory",
         "@flatbuffers",
     ],
diff --git a/tensorflow/contrib/lite/toco/tflite/operator.cc b/tensorflow/contrib/lite/toco/tflite/operator.cc
index 9ff89e9..75808f2 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator.cc
@@ -21,9 +21,9 @@
 #include "tensorflow/contrib/lite/toco/tflite/custom_operator.h"
 #include "tensorflow/contrib/lite/toco/tflite/simple_operator.h"
 #include "tensorflow/contrib/lite/toco/tflite/types.h"
-
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/util/ptr_util.h"
 
 namespace toco {
 
@@ -1235,162 +1235,175 @@
 // Build a vector containing all the known operators.
 std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList() {
   std::vector<std::unique_ptr<BaseOperator>> ops;
-
+  using tensorflow::MakeUnique;
   // Builtin Operators.
-  ops.emplace_back(new Add(::tflite::BuiltinOperator_ADD, OperatorType::kAdd));
-  ops.emplace_back(new Div(::tflite::BuiltinOperator_DIV, OperatorType::kDiv));
-  ops.emplace_back(new Sub(::tflite::BuiltinOperator_SUB, OperatorType::kSub));
-  ops.emplace_back(new AveragePool(::tflite::BuiltinOperator_AVERAGE_POOL_2D,
-                                   OperatorType::kAveragePool));
-  ops.emplace_back(
-      new SpaceToBatchND(::tflite::BuiltinOperator_SPACE_TO_BATCH_ND,
-                         OperatorType::kSpaceToBatchND));
-  ops.emplace_back(
-      new BatchToSpaceND(::tflite::BuiltinOperator_BATCH_TO_SPACE_ND,
-                         OperatorType::kBatchToSpaceND));
-  ops.emplace_back(new Concatenation(::tflite::BuiltinOperator_CONCATENATION,
-                                     OperatorType::kConcatenation));
-  ops.emplace_back(
-      new Convolution(::tflite::BuiltinOperator_CONV_2D, OperatorType::kConv));
-  ops.emplace_back(
-      new DepthwiseConvolution(::tflite::BuiltinOperator_DEPTHWISE_CONV_2D,
-                               OperatorType::kDepthwiseConv));
-  ops.emplace_back(new FullyConnected(::tflite::BuiltinOperator_FULLY_CONNECTED,
-                                      OperatorType::kFullyConnected));
-  ops.emplace_back(
-      new Gather(::tflite::BuiltinOperator_GATHER, OperatorType::kGather));
-  ops.emplace_back(
-      new L2Normalization(::tflite::BuiltinOperator_L2_NORMALIZATION,
-                          OperatorType::kL2Normalization));
-  ops.emplace_back(
-      new L2Pool(::tflite::BuiltinOperator_L2_POOL_2D, OperatorType::kL2Pool));
-  ops.emplace_back(new LocalResponseNormalization(
+  ops.push_back(
+      MakeUnique<Add>(::tflite::BuiltinOperator_ADD, OperatorType::kAdd));
+  ops.push_back(
+      MakeUnique<Div>(::tflite::BuiltinOperator_DIV, OperatorType::kDiv));
+  ops.push_back(
+      MakeUnique<Sub>(::tflite::BuiltinOperator_SUB, OperatorType::kSub));
+  ops.push_back(MakeUnique<AveragePool>(
+      ::tflite::BuiltinOperator_AVERAGE_POOL_2D, OperatorType::kAveragePool));
+  ops.push_back(
+      MakeUnique<SpaceToBatchND>(::tflite::BuiltinOperator_SPACE_TO_BATCH_ND,
+                                 OperatorType::kSpaceToBatchND));
+  ops.push_back(
+      MakeUnique<BatchToSpaceND>(::tflite::BuiltinOperator_BATCH_TO_SPACE_ND,
+                                 OperatorType::kBatchToSpaceND));
+  ops.push_back(MakeUnique<Concatenation>(
+      ::tflite::BuiltinOperator_CONCATENATION, OperatorType::kConcatenation));
+  ops.push_back(MakeUnique<Convolution>(::tflite::BuiltinOperator_CONV_2D,
+                                        OperatorType::kConv));
+  ops.push_back(MakeUnique<DepthwiseConvolution>(
+      ::tflite::BuiltinOperator_DEPTHWISE_CONV_2D,
+      OperatorType::kDepthwiseConv));
+  ops.push_back(
+      MakeUnique<FullyConnected>(::tflite::BuiltinOperator_FULLY_CONNECTED,
+                                 OperatorType::kFullyConnected));
+  ops.push_back(MakeUnique<Gather>(::tflite::BuiltinOperator_GATHER,
+                                   OperatorType::kGather));
+  ops.push_back(
+      MakeUnique<L2Normalization>(::tflite::BuiltinOperator_L2_NORMALIZATION,
+                                  OperatorType::kL2Normalization));
+  ops.push_back(MakeUnique<L2Pool>(::tflite::BuiltinOperator_L2_POOL_2D,
+                                   OperatorType::kL2Pool));
+  ops.push_back(MakeUnique<LocalResponseNormalization>(
       ::tflite::BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION,
       OperatorType::kLocalResponseNormalization));
-  ops.emplace_back(new MaxPool(::tflite::BuiltinOperator_MAX_POOL_2D,
-                               OperatorType::kMaxPool));
-  ops.emplace_back(new Mul(::tflite::BuiltinOperator_MUL, OperatorType::kMul));
-  ops.emplace_back(new Pad(::tflite::BuiltinOperator_PAD, OperatorType::kPad));
-  ops.emplace_back(
-      new PadV2(::tflite::BuiltinOperator_PADV2, OperatorType::kPadV2));
-  ops.emplace_back(
-      new Reshape(::tflite::BuiltinOperator_RESHAPE, OperatorType::kReshape));
-  ops.emplace_back(
-      new Softmax(::tflite::BuiltinOperator_SOFTMAX, OperatorType::kSoftmax));
-  ops.emplace_back(new SpaceToDepth(::tflite::BuiltinOperator_SPACE_TO_DEPTH,
-                                    OperatorType::kSpaceToDepth));
-  ops.emplace_back(
-      new Svdf(::tflite::BuiltinOperator_SVDF, OperatorType::kSvdf));
-  ops.emplace_back(new Transpose(::tflite::BuiltinOperator_TRANSPOSE,
-                                 OperatorType::kTranspose));
-  ops.emplace_back(
-      new Mean(::tflite::BuiltinOperator_MEAN, OperatorType::kMean));
-  ops.emplace_back(new Sum(::tflite::BuiltinOperator_SUM, OperatorType::kSum));
-  ops.emplace_back(new ReduceProd(::tflite::BuiltinOperator_REDUCE_PROD,
-                                  OperatorType::kReduceProd));
-  ops.emplace_back(new ReduceMax(::tflite::BuiltinOperator_REDUCE_MAX,
-                                 OperatorType::kReduceMax));
-  ops.emplace_back(new ResizeBilinear(::tflite::BuiltinOperator_RESIZE_BILINEAR,
-                                      OperatorType::kResizeBilinear));
-  ops.emplace_back(
-      new Squeeze(::tflite::BuiltinOperator_SQUEEZE, OperatorType::kSqueeze));
-  ops.emplace_back(
-      new Split(::tflite::BuiltinOperator_SPLIT, OperatorType::kSplit));
-  ops.emplace_back(new StridedSlice(::tflite::BuiltinOperator_STRIDED_SLICE,
-                                    OperatorType::kStridedSlice));
-  ops.emplace_back(
-      new TopK_V2(::tflite::BuiltinOperator_TOPK_V2, OperatorType::kTopK_V2));
-  ops.emplace_back(
-      new Lstm(::tflite::BuiltinOperator_LSTM, OperatorType::kLstmCell));
-  ops.emplace_back(
-      new Cast(::tflite::BuiltinOperator_CAST, OperatorType::kCast));
-  ops.emplace_back(
-      new ArgMax(::tflite::BuiltinOperator_ARG_MAX, OperatorType::kArgMax));
-  ops.emplace_back(
-      new ArgMin(::tflite::BuiltinOperator_ARG_MIN, OperatorType::kArgMin));
-  ops.emplace_back(
-      new Tile(::tflite::BuiltinOperator_TILE, OperatorType::kTile));
-  ops.emplace_back(new ExpandDims(::tflite::BuiltinOperator_EXPAND_DIMS,
-                                  OperatorType::kExpandDims));
-  ops.emplace_back(new TransposeConv(::tflite::BuiltinOperator_TRANSPOSE_CONV,
-                                     OperatorType::kTransposeConv));
-  ops.emplace_back(new SparseToDense(::tflite::BuiltinOperator_SPARSE_TO_DENSE,
-                                     OperatorType::kSparseToDense));
-  ops.emplace_back(
-      new Shape(::tflite::BuiltinOperator_SHAPE, OperatorType::kShape));
-  ops.emplace_back(new FakeQuant(::tflite::BuiltinOperator_FAKE_QUANT,
-                                 OperatorType::kFakeQuant));
-  ops.emplace_back(
-      new Pack(::tflite::BuiltinOperator_PACK, OperatorType::kPack));
-  ops.emplace_back(
-      new OneHot(::tflite::BuiltinOperator_ONE_HOT, OperatorType::kOneHot));
+  ops.push_back(MakeUnique<MaxPool>(::tflite::BuiltinOperator_MAX_POOL_2D,
+                                    OperatorType::kMaxPool));
+  ops.push_back(
+      MakeUnique<Mul>(::tflite::BuiltinOperator_MUL, OperatorType::kMul));
+  ops.push_back(
+      MakeUnique<Pad>(::tflite::BuiltinOperator_PAD, OperatorType::kPad));
+  ops.push_back(
+      MakeUnique<PadV2>(::tflite::BuiltinOperator_PADV2, OperatorType::kPadV2));
+  ops.push_back(MakeUnique<Reshape>(::tflite::BuiltinOperator_RESHAPE,
+                                    OperatorType::kReshape));
+  ops.push_back(MakeUnique<Softmax>(::tflite::BuiltinOperator_SOFTMAX,
+                                    OperatorType::kSoftmax));
+  ops.push_back(MakeUnique<SpaceToDepth>(
+      ::tflite::BuiltinOperator_SPACE_TO_DEPTH, OperatorType::kSpaceToDepth));
+  ops.push_back(
+      MakeUnique<Svdf>(::tflite::BuiltinOperator_SVDF, OperatorType::kSvdf));
+  ops.push_back(MakeUnique<Transpose>(::tflite::BuiltinOperator_TRANSPOSE,
+                                      OperatorType::kTranspose));
+  ops.push_back(
+      MakeUnique<Mean>(::tflite::BuiltinOperator_MEAN, OperatorType::kMean));
+  ops.push_back(
+      MakeUnique<Sum>(::tflite::BuiltinOperator_SUM, OperatorType::kSum));
+  ops.push_back(MakeUnique<ReduceProd>(::tflite::BuiltinOperator_REDUCE_PROD,
+                                       OperatorType::kReduceProd));
+  ops.push_back(MakeUnique<ReduceMax>(::tflite::BuiltinOperator_REDUCE_MAX,
+                                      OperatorType::kReduceMax));
+  ops.push_back(
+      MakeUnique<ResizeBilinear>(::tflite::BuiltinOperator_RESIZE_BILINEAR,
+                                 OperatorType::kResizeBilinear));
+  ops.push_back(MakeUnique<Squeeze>(::tflite::BuiltinOperator_SQUEEZE,
+                                    OperatorType::kSqueeze));
+  ops.push_back(
+      MakeUnique<Split>(::tflite::BuiltinOperator_SPLIT, OperatorType::kSplit));
+  ops.push_back(MakeUnique<StridedSlice>(
+      ::tflite::BuiltinOperator_STRIDED_SLICE, OperatorType::kStridedSlice));
+  ops.push_back(MakeUnique<TopK_V2>(::tflite::BuiltinOperator_TOPK_V2,
+                                    OperatorType::kTopK_V2));
+  ops.push_back(MakeUnique<Lstm>(::tflite::BuiltinOperator_LSTM,
+                                 OperatorType::kLstmCell));
+  ops.push_back(
+      MakeUnique<Cast>(::tflite::BuiltinOperator_CAST, OperatorType::kCast));
+  ops.push_back(MakeUnique<ArgMax>(::tflite::BuiltinOperator_ARG_MAX,
+                                   OperatorType::kArgMax));
+  ops.push_back(MakeUnique<ArgMin>(::tflite::BuiltinOperator_ARG_MIN,
+                                   OperatorType::kArgMin));
+  ops.push_back(
+      MakeUnique<Tile>(::tflite::BuiltinOperator_TILE, OperatorType::kTile));
+  ops.push_back(MakeUnique<ExpandDims>(::tflite::BuiltinOperator_EXPAND_DIMS,
+                                       OperatorType::kExpandDims));
+  ops.push_back(MakeUnique<TransposeConv>(
+      ::tflite::BuiltinOperator_TRANSPOSE_CONV, OperatorType::kTransposeConv));
+  ops.push_back(MakeUnique<SparseToDense>(
+      ::tflite::BuiltinOperator_SPARSE_TO_DENSE, OperatorType::kSparseToDense));
+  ops.push_back(
+      MakeUnique<Shape>(::tflite::BuiltinOperator_SHAPE, OperatorType::kShape));
+  ops.push_back(MakeUnique<FakeQuant>(::tflite::BuiltinOperator_FAKE_QUANT,
+                                      OperatorType::kFakeQuant));
+  ops.push_back(
+      MakeUnique<Pack>(::tflite::BuiltinOperator_PACK, OperatorType::kPack));
+  ops.push_back(MakeUnique<OneHot>(::tflite::BuiltinOperator_ONE_HOT,
+                                   OperatorType::kOneHot));
 
   // Custom Operators.
-  ops.emplace_back(
-      new DepthToSpace("DEPTH_TO_SPACE", OperatorType::kDepthToSpace));
-  ops.emplace_back(new CTCBeamSearchDecoder(
+  ops.push_back(
+      MakeUnique<DepthToSpace>("DEPTH_TO_SPACE", OperatorType::kDepthToSpace));
+  ops.push_back(MakeUnique<CTCBeamSearchDecoder>(
       "CTC_BEAM_SEARCH_DECODER", OperatorType::kCTCBeamSearchDecoder));
-  ops.emplace_back(new TensorFlowUnsupported("TENSORFLOW_UNSUPPORTED",
-                                             OperatorType::kUnsupported));
+  ops.push_back(MakeUnique<TensorFlowUnsupported>("TENSORFLOW_UNSUPPORTED",
+                                                  OperatorType::kUnsupported));
 
   // There operators are supported by Toco, but not by TF Lite, and has no
   // attributes.
-  ops.emplace_back(
-      new SimpleOperator<AddNOperator>("ADDN", OperatorType::kAddN));
+  ops.push_back(
+      MakeUnique<SimpleOperator<AddNOperator>>("ADDN", OperatorType::kAddN));
   // Simple Operators.
-  ops.emplace_back(new SimpleOperator<DequantizeOperator>(
+  ops.push_back(MakeUnique<SimpleOperator<DequantizeOperator>>(
       "DEQUANTIZE", OperatorType::kDequantize));
-  ops.emplace_back(
-      new SimpleOperator<FloorOperator>("FLOOR", OperatorType::kFloor));
-  ops.emplace_back(
-      new SimpleOperator<ReluOperator>("RELU", OperatorType::kRelu));
-  ops.emplace_back(
-      new SimpleOperator<Relu1Operator>("RELU_N1_TO_1", OperatorType::kRelu1));
-  ops.emplace_back(
-      new SimpleOperator<Relu6Operator>("RELU6", OperatorType::kRelu6));
-  ops.emplace_back(
-      new SimpleOperator<PReluOperator>("PRELU", OperatorType::kPRelu));
-  ops.emplace_back(new SimpleOperator<LogisticOperator>(
+  ops.push_back(
+      MakeUnique<SimpleOperator<FloorOperator>>("FLOOR", OperatorType::kFloor));
+  ops.push_back(
+      MakeUnique<SimpleOperator<ReluOperator>>("RELU", OperatorType::kRelu));
+  ops.push_back(MakeUnique<SimpleOperator<Relu1Operator>>(
+      "RELU_N1_TO_1", OperatorType::kRelu1));
+  ops.push_back(
+      MakeUnique<SimpleOperator<Relu6Operator>>("RELU6", OperatorType::kRelu6));
+  ops.push_back(
+      MakeUnique<SimpleOperator<PReluOperator>>("PRELU", OperatorType::kPRelu));
+  ops.push_back(MakeUnique<SimpleOperator<LogisticOperator>>(
       "LOGISTIC", OperatorType::kLogistic));
-  ops.emplace_back(
-      new SimpleOperator<TanhOperator>("TANH", OperatorType::kTanh));
-  ops.emplace_back(new SimpleOperator<ExpOperator>("EXP", OperatorType::kExp));
-  ops.emplace_back(new SimpleOperator<LogSoftmaxOperator>(
+  ops.push_back(
+      MakeUnique<SimpleOperator<TanhOperator>>("TANH", OperatorType::kTanh));
+  ops.push_back(
+      MakeUnique<SimpleOperator<ExpOperator>>("EXP", OperatorType::kExp));
+  ops.push_back(MakeUnique<SimpleOperator<LogSoftmaxOperator>>(
       "LOG_SOFTMAX", OperatorType::kLogSoftmax));
-  ops.emplace_back(new SimpleOperator<TensorFlowMaximumOperator>(
+  ops.push_back(MakeUnique<SimpleOperator<TensorFlowMaximumOperator>>(
       "MAXIMUM", OperatorType::kMaximum));  //  Element-wise Maximum
-  ops.emplace_back(new SimpleOperator<TensorFlowMinimumOperator>(
+  ops.push_back(MakeUnique<SimpleOperator<TensorFlowMinimumOperator>>(
       "MINIMUM", OperatorType::kMinimum));  //  Element-wise Minimum
-  ops.emplace_back(new SimpleOperator<TensorFlowGreaterOperator>(
+  ops.push_back(MakeUnique<SimpleOperator<TensorFlowGreaterOperator>>(
       "GREATER", OperatorType::kGreater));
-  ops.emplace_back(new SimpleOperator<TensorFlowGreaterEqualOperator>(
+  ops.push_back(MakeUnique<SimpleOperator<TensorFlowGreaterEqualOperator>>(
       "GREATER_EQUAL", OperatorType::kGreaterEqual));
-  ops.emplace_back(
-      new SimpleOperator<TensorFlowLessOperator>("LESS", OperatorType::kLess));
-  ops.emplace_back(new SimpleOperator<TensorFlowLessEqualOperator>(
+  ops.push_back(MakeUnique<SimpleOperator<TensorFlowLessOperator>>(
+      "LESS", OperatorType::kLess));
+  ops.push_back(MakeUnique<SimpleOperator<TensorFlowLessEqualOperator>>(
       "LESS_EQUAL", OperatorType::kLessEqual));
-  ops.emplace_back(new SimpleOperator<TensorFlowEqualOperator>(
+  ops.push_back(MakeUnique<SimpleOperator<TensorFlowEqualOperator>>(
       "EQUAL", OperatorType::kEqual));
-  ops.emplace_back(new SimpleOperator<TensorFlowNotEqualOperator>(
+  ops.push_back(MakeUnique<SimpleOperator<TensorFlowNotEqualOperator>>(
       "NOT_EQUAL", OperatorType::kNotEqual));
-  ops.emplace_back(new SimpleOperator<NegOperator>("NEG", OperatorType::kNeg));
-  ops.emplace_back(
-      new SimpleOperator<SelectOperator>("SELECT", OperatorType::kSelect));
-  ops.emplace_back(
-      new SimpleOperator<SliceOperator>("SLICE", OperatorType::kSlice));
-  ops.emplace_back(new SimpleOperator<PowOperator>("POW", OperatorType::kPow));
-  ops.emplace_back(new SimpleOperator<LogicalOrOperator>(
+  ops.push_back(
+      MakeUnique<SimpleOperator<NegOperator>>("NEG", OperatorType::kNeg));
+  ops.push_back(MakeUnique<SimpleOperator<SelectOperator>>(
+      "SELECT", OperatorType::kSelect));
+  ops.push_back(
+      MakeUnique<SimpleOperator<SliceOperator>>("SLICE", OperatorType::kSlice));
+  ops.push_back(
+      MakeUnique<SimpleOperator<PowOperator>>("POW", OperatorType::kPow));
+  ops.push_back(MakeUnique<SimpleOperator<LogicalOrOperator>>(
       "LOGICAL_OR", OperatorType::kLogicalOr));
   ops.emplace_back(new SimpleOperator<LogicalAndOperator>(
       "LOGICAL_AND", OperatorType::kLogicalAnd));
   ops.emplace_back(new SimpleOperator<LogicalNotOperator>(
       "LOGICAL_NOT", OperatorType::kLogicalNot));
   // Element-wise operator
-  ops.emplace_back(new SimpleOperator<SinOperator>("SIN", OperatorType::kSin));
-  ops.emplace_back(new SimpleOperator<LogOperator>("LOG", OperatorType::kLog));
-  ops.emplace_back(
-      new SimpleOperator<TensorFlowSqrtOperator>("SQRT", OperatorType::kSqrt));
-  ops.emplace_back(new SimpleOperator<TensorFlowRsqrtOperator>(
+  ops.push_back(
+      MakeUnique<SimpleOperator<SinOperator>>("SIN", OperatorType::kSin));
+  ops.push_back(
+      MakeUnique<SimpleOperator<LogOperator>>("LOG", OperatorType::kLog));
+  ops.push_back(MakeUnique<SimpleOperator<TensorFlowSqrtOperator>>(
+      "SQRT", OperatorType::kSqrt));
+  ops.push_back(MakeUnique<SimpleOperator<TensorFlowRsqrtOperator>>(
       "RSQRT", OperatorType::kRsqrt));
 
   return ops;
diff --git a/tensorflow/contrib/lite/toco/toco_port.cc b/tensorflow/contrib/lite/toco/toco_port.cc
index 14168fa..204c0d1 100644
--- a/tensorflow/contrib/lite/toco/toco_port.cc
+++ b/tensorflow/contrib/lite/toco/toco_port.cc
@@ -138,13 +138,15 @@
 #define close _close
 #define open _open
 #define read _read
-#define O_RDONLY _O_RDONLY
-#define O_CREAT _O_CREAT
-#define O_WRONLY _O_WRONLY
-// Windows does not support the same set of file permissions as other platforms.
+// Windows does not support the same set of file permissions as other platforms,
+// and also requires an explicit flag for binary file read/write support.
 constexpr int kFileCreateMode = _S_IREAD | _S_IWRITE;
+constexpr int kFileReadFlags = _O_RDONLY | _O_BINARY;
+constexpr int kFileWriteFlags = _O_WRONLY | _O_BINARY | _O_CREAT;
 #else
 constexpr int kFileCreateMode = 0664;
+constexpr int kFileReadFlags = O_RDONLY;
+constexpr int kFileWriteFlags = O_CREAT | O_WRONLY;
 #endif  // _WIN32
 
 static bool port_initialized = false;
@@ -197,7 +199,7 @@
                                const file::Options& options) {
   output->clear();
 
-  int fd = open(path.c_str(), O_RDONLY);
+  int fd = open(path.c_str(), kFileReadFlags);
   if (fd == -1) {
     return tensorflow::errors::NotFound("can't open() for read");
   }
@@ -226,7 +228,7 @@
 
 tensorflow::Status SetContents(const string& filename, const string& contents,
                                const file::Options& options) {
-  int fd = open(filename.c_str(), O_WRONLY | O_CREAT, kFileCreateMode);
+  int fd = open(filename.c_str(), kFileWriteFlags, kFileCreateMode);
   if (fd == -1) {
     return tensorflow::errors::Internal("can't open() for write");
   }
diff --git a/tensorflow/contrib/lite/toco/toco_tooling.cc b/tensorflow/contrib/lite/toco/toco_tooling.cc
index fcd3cba..34130a0 100644
--- a/tensorflow/contrib/lite/toco/toco_tooling.cc
+++ b/tensorflow/contrib/lite/toco/toco_tooling.cc
@@ -90,8 +90,10 @@
   transformations->Add(new ResolveConstantRandomUniform);
   transformations->Add(new ResolveConstantRange);
   transformations->Add(new ResolveConstantReshape);
+  transformations->Add(new ResolveConstantSelect);
   transformations->Add(new ResolveConstantSlice);
   transformations->Add(new ResolveConstantStridedSlice);
+  transformations->Add(new ResolveConstantTile);
   transformations->Add(new ResolveConstantTranspose);
   transformations->Add(new ResolveConstantUnaryOperator);
   transformations->Add(new ResolveTensorFlowMerge);
diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc
index 80df09e..3a4542f 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.cc
+++ b/tensorflow/contrib/lite/toco/tooling_util.cc
@@ -602,14 +602,33 @@
   shape_dims.erase(shape_dims.begin(), shape_dims.begin() + size_reduction);
 }
 
-bool IsValid(const Shape& shape) {
+// In general, zero-sized dimensions are disallowed, but there are exceptions,
+// e.g., if the tensor data itself represents a scalar (rank 0) shape, its
+// shape will have dimensions [0]. CheckNonEmptyShapeDimensions is more
+// strict, and is appropriate for ops and comparisons where an empty shape
+// doesn't make sense.
+template <typename Dims>
+void CheckValidShapeDimensions(const Dims& dims) {
+  if (dims.size() == 1 && dims[0] == 0) {
+    return;
+  }
+  for (const auto& dim : dims) {
+    CHECK_GE(dim, 1);
+  }
+}
+
+void CheckValidShape(const Shape& shape) {
+  CheckValidShapeDimensions(shape.dims());
+}
+
+bool IsNonEmpty(const Shape& shape) {
   for (int i = 0; i < shape.dimensions_count(); ++i) {
     if (shape.dims(i) < 1) return false;
   }
   return true;
 }
 
-void CheckShapeDimensions(const Shape& shape) {
+void CheckNonEmptyShapeDimensions(const Shape& shape) {
   for (int i = 0; i < shape.dimensions_count(); ++i) {
     CHECK_GE(shape.dims()[i], 1) << "shape has dimension 0 at index << " << i
                                  << ". shape = " << ShapeToString(shape);
@@ -617,8 +636,8 @@
 }
 
 bool ShapesAgreeUpToBroadcasting(const Shape& shape0, const Shape& shape1) {
-  CheckShapeDimensions(shape0);
-  CheckShapeDimensions(shape1);
+  CheckNonEmptyShapeDimensions(shape0);
+  CheckNonEmptyShapeDimensions(shape1);
 
   const Shape* longer = &shape0;
   const Shape* shorter = &shape1;
@@ -645,8 +664,8 @@
 }
 
 bool ShapesAgreeUpToExtending(const Shape& shape0, const Shape& shape1) {
-  CheckShapeDimensions(shape0);
-  CheckShapeDimensions(shape1);
+  CheckNonEmptyShapeDimensions(shape0);
+  CheckNonEmptyShapeDimensions(shape1);
 
   const Shape* longer = &shape0;
   const Shape* shorter = &shape1;
@@ -683,9 +702,9 @@
 }
 
 int RequiredBufferSizeForShape(const Shape& shape) {
+  CheckValidShape(shape);
   int max_offset = 1;
   for (const auto& dim : shape.dims()) {
-    CHECK_GE(dim, 1);
     max_offset *= dim;
   }
   return max_offset;
@@ -946,13 +965,7 @@
       // shape.
       CHECK(array->has_shape());
       // Constant buffer should has a valid shape.
-      bool is_scalar =
-          array->shape().dimensions_count() == 1 && array->shape().dims(0) == 0;
-      if (!is_scalar) {
-        for (int d : array->shape().dims()) {
-          CHECK_GE(d, 1);
-        }
-      }
+      CheckValidShape(array->shape());
       // The shape flat-size should agree with the buffer length.
       CHECK_EQ(array->buffer->Length(),
                RequiredBufferSizeForShape(array->shape()));
@@ -1544,8 +1557,8 @@
     if (!input_array.has_shape()) {
       if (input_array_proto.has_shape()) {
         auto& input_array_dims = *input_array.mutable_shape()->mutable_dims();
+        CheckValidShapeDimensions(input_array_proto.shape().dims());
         for (auto dim : input_array_proto.shape().dims()) {
-          CHECK_GE(dim, 1);
           input_array_dims.push_back(dim);
         }
       }
@@ -2265,4 +2278,14 @@
   }
 }
 
+void CopyMinMaxAndQuantizationRelatedFields(const Array& src, Array* dst) {
+  if (src.minmax) {
+    dst->GetOrCreateMinMax() = src.GetMinMax();
+  }
+  if (src.quantization_params) {
+    dst->GetOrCreateQuantizationParams() = src.GetQuantizationParams();
+  }
+  dst->narrow_range = src.narrow_range;
+}
+
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/tooling_util.h b/tensorflow/contrib/lite/toco/tooling_util.h
index 5dbfa54..bdeb203 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.h
+++ b/tensorflow/contrib/lite/toco/tooling_util.h
@@ -115,10 +115,9 @@
 // TODO(b/36075966): Clean up when dims superseded by array shape.
 void UnextendShape(Shape* shape, int new_shape_size);
 
-// Checks that all dimensions of 'shape' are at least 1.
-bool IsValid(const Shape& shape);
-// Same as above, but reports error using CHECK.
-void CheckShapeDimensions(const Shape& shape);
+// Checks that all dimensions of 'shape' are at least 1. Note that scalars,
+// lacking dimensions, satisfy this condition and are considered non-empty.
+bool IsNonEmpty(const Shape& shape);
 
 // Given two shapes with potentially different dimensionality and dimension
 // arrays d0 and d1. Without loss of generality, assume that shape0 may have
@@ -349,6 +348,9 @@
 // so that the rest of toco doesn't need to know about shuffled weights.
 void UndoWeightsShuffling(Model* model);
 
+// Copies minmax, quantization_params, and narrow_range.
+void CopyMinMaxAndQuantizationRelatedFields(const Array& src, Array* dst);
+
 }  // namespace toco
 
 #endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TOOLING_UTIL_H_
diff --git a/tensorflow/contrib/lite/Makefile b/tensorflow/contrib/lite/tools/make/Makefile
similarity index 67%
rename from tensorflow/contrib/lite/Makefile
rename to tensorflow/contrib/lite/tools/make/Makefile
index 9cc8f10..e30cc1d 100644
--- a/tensorflow/contrib/lite/Makefile
+++ b/tensorflow/contrib/lite/tools/make/Makefile
@@ -6,120 +6,74 @@
 # Try to figure out the host system
 HOST_OS :=
 ifeq ($(OS),Windows_NT)
-	HOST_OS = WINDOWS
+	HOST_OS = windows
 else
 	UNAME_S := $(shell uname -s)
 	ifeq ($(UNAME_S),Linux)
-	        HOST_OS := LINUX
+		HOST_OS := linux
 	endif
 	ifeq ($(UNAME_S),Darwin)
-		HOST_OS := OSX
+		HOST_OS := osx
 	endif
 endif
 
 HOST_ARCH := $(shell if [[ $(shell uname -m) =~ i[345678]86 ]]; then echo x86_32; else echo $(shell uname -m); fi)
 
-# Self-hosting
-TARGET_ARCH := ${HOST_ARCH}
+# Override these on the make command line to target a specific architecture. For example:
+# make -f tensorflow/contrib/lite/tools/make/Makefile TARGET=rpi TARGET_ARCH=armv7l
+TARGET := $(HOST_OS)
+TARGET_ARCH := $(HOST_ARCH)
 
-# Cross compiling
-ifeq ($(CROSS),rpi)
-  TARGET_ARCH := armv7l
-  TARGET_TOOLCHAIN_PREFIX := arm-linux-gnueabihf-
-endif
+# These are the default libraries needed, but they can be added to or
+# overridden by the platform-specific settings in target makefiles.
+LIBS := \
+-lstdc++ \
+-lpthread \
+-lm \
+-lz
 
-ifeq ($(CROSS),riscv)
-  TARGET_ARCH := riscv
-  TARGET_TOOLCHAIN_PREFIX := riscv32-unknown-elf-
-endif
-ifeq ($(CROSS),stm32f7)
-  TARGET_ARCH := armf7
-  TARGET_TOOLCHAIN_PREFIX := arm-none-eabi-
-endif
-ifeq ($(CROSS),stm32f1)
-  TARGET_ARCH := armm1
-  TARGET_TOOLCHAIN_PREFIX := arm-none-eabi-
-endif
-
-# Where compiled objects are stored.
-OBJDIR := $(MAKEFILE_DIR)/gen/obj/
-BINDIR := $(MAKEFILE_DIR)/gen/bin/
-LIBDIR := $(MAKEFILE_DIR)/gen/lib/
-GENDIR := $(MAKEFILE_DIR)/gen/obj/
-
-LIBS :=
-ifeq ($(TARGET_ARCH),x86_64)
-        CXXFLAGS += -fPIC -DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK -pthread # -msse4.2
-endif
-
-ifeq ($(TARGET_ARCH),armv7l)
-        CXXFLAGS += -mfpu=neon -pthread -fPIC
-	LIBS += -ldl
-endif
-
-ifeq ($(TARGET_ARCH),riscv)
-#        CXXFLAGS += -march=gap8
-        CXXFLAGS += -DTFLITE_MCU
-	LIBS += -ldl
-	BUILD_TYPE := micro
-endif
-
-ifeq ($(TARGET_ARCH),armf7)
-        CXXFLAGS += -DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK -DTFLITE_MCU
-        CXXFLAGS += -fno-rtti -fmessage-length=0 -fno-exceptions -fno-builtin -ffunction-sections -fdata-sections
-        CXXFLAGS += -funsigned-char -MMD
-        CXXFLAGS += -mcpu=cortex-m7 -mthumb -mfpu=fpv5-sp-d16 -mfloat-abi=softfp
-        CXXFLAGS += '-std=gnu++11' '-fno-rtti' '-Wvla' '-c' '-Wall' '-Wextra' '-Wno-unused-parameter' '-Wno-missing-field-initializers' '-fmessage-length=0' '-fno-exceptions' '-fno-builtin' '-ffunction-sections' '-fdata-sections' '-funsigned-char' '-MMD' '-fno-delete-null-pointer-checks' '-fomit-frame-pointer' '-Os'
-	LIBS += -ldl
-	BUILD_TYPE := micro
-endif
-ifeq ($(TARGET_ARCH),armm1)
-        CXXFLAGS += -DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK -mcpu=cortex-m1 -mthumb -DTFLITE_MCU
-        CXXFLAGS += -fno-rtti -fmessage-length=0 -fno-exceptions -fno-builtin -ffunction-sections -fdata-sections
-        CXXFLAGS += -funsigned-char -MMD
-	LIBS += -ldl
-endif
-
-# Settings for the host compiler.
-CXX := $(CC_PREFIX) ${TARGET_TOOLCHAIN_PREFIX}g++
-CXXFLAGS += -O3 -DNDEBUG
+# There are no rules for compiling objects for the host system (since we don't
+# generate things like the protobuf compiler that require that), so all of
+# these settings are for the target compiler.
+CXXFLAGS := -O3 -DNDEBUG
 CCFLAGS := ${CXXFLAGS}
 CXXFLAGS += --std=c++11
-CC := $(CC_PREFIX) ${TARGET_TOOLCHAIN_PREFIX}gcc
-AR := $(CC_PREFIX) ${TARGET_TOOLCHAIN_PREFIX}ar
 CFLAGS :=
-LDOPTS :=
-LDOPTS += -L/usr/local/lib
+LDOPTS := -L/usr/local/lib
 ARFLAGS := -r
+TARGET_TOOLCHAIN_PREFIX :=
+CC_PREFIX :=
+
+# These target-specific makefiles should modify or replace options like
+# CXXFLAGS or LIBS to work for a specific targeted architecture. All logic
+# based on platforms or architectures should happen within these files, to
+# keep this main makefile focused on the sources and dependencies.
+include $(wildcard $(MAKEFILE_DIR)/targets/*_makefile.inc)
+
+# Where compiled objects are stored.
+GENDIR := $(MAKEFILE_DIR)/gen/$(TARGET)_$(TARGET_ARCH)/
+OBJDIR := $(GENDIR)obj/
+BINDIR := $(GENDIR)bin/
+LIBDIR := $(GENDIR)lib/
 
 INCLUDES := \
 -I. \
--I$(MAKEFILE_DIR)/../../../ \
--I$(MAKEFILE_DIR)/../../../../ \
+-I$(MAKEFILE_DIR)/../../../../../ \
+-I$(MAKEFILE_DIR)/../../../../../../ \
 -I$(MAKEFILE_DIR)/downloads/ \
 -I$(MAKEFILE_DIR)/downloads/eigen \
 -I$(MAKEFILE_DIR)/downloads/gemmlowp \
 -I$(MAKEFILE_DIR)/downloads/neon_2_sse \
 -I$(MAKEFILE_DIR)/downloads/farmhash/src \
 -I$(MAKEFILE_DIR)/downloads/flatbuffers/include \
--I$(GENDIR)
+-I$(OBJDIR)
 # This is at the end so any globally-installed frameworks like protobuf don't
 # override local versions in the source tree.
 INCLUDES += -I/usr/local/include
 
-LIBS += \
--lstdc++ \
--lpthread \
--lm \
--lz
-
-# If we're on Linux, also link in the dl library.
-ifeq ($(HOST_OS),LINUX)
-	LIBS += -ldl
-endif
-
-include $(MAKEFILE_DIR)/ios_makefile.inc
-include $(MAKEFILE_DIR)/rpi_makefile.inc
+CXX := $(CC_PREFIX)${TARGET_TOOLCHAIN_PREFIX}g++
+CC := $(CC_PREFIX)${TARGET_TOOLCHAIN_PREFIX}gcc
+AR := $(CC_PREFIX)${TARGET_TOOLCHAIN_PREFIX}ar
 
 # This library is the main target for this makefile. It will contain a minimal
 # runtime that can be linked in to other programs.
@@ -163,8 +117,8 @@
 $(wildcard tensorflow/contrib/lite/kernels/internal/*.c) \
 $(wildcard tensorflow/contrib/lite/kernels/internal/optimized/*.c) \
 $(wildcard tensorflow/contrib/lite/kernels/internal/reference/*.c) \
-$(wildcard tensorflow/contrib/lite/downloads/farmhash/src/farmhash.cc) \
-$(wildcard tensorflow/contrib/lite/downloads/fft2d/fftsg.c)
+$(wildcard tensorflow/contrib/lite/tools/make/downloads/farmhash/src/farmhash.cc) \
+$(wildcard tensorflow/contrib/lite/tools/make/downloads/fft2d/fftsg.c)
 endif
 # Remove any duplicates.
 CORE_CC_ALL_SRCS := $(sort $(CORE_CC_ALL_SRCS))
@@ -179,10 +133,6 @@
 CORE_CC_EXCLUDE_SRCS += \
 tensorflow/contrib/lite/mmap_allocation.cc \
 tensorflow/contrib/lite/nnapi_delegate.cc
-else
-CORE_CC_EXCLUDE_SRCS += \
-tensorflow/contrib/lite/mmap_allocation_disabled.cc \
-tensorflow/contrib/lite/nnapi_delegate_disabled.cc
 endif
 # Filter out all the excluded files.
 TF_LITE_CC_SRCS := $(filter-out $(CORE_CC_EXCLUDE_SRCS), $(CORE_CC_ALL_SRCS))
diff --git a/tensorflow/contrib/lite/build_ios_universal_lib.sh b/tensorflow/contrib/lite/tools/make/build_ios_universal_lib.sh
similarity index 66%
rename from tensorflow/contrib/lite/build_ios_universal_lib.sh
rename to tensorflow/contrib/lite/tools/make/build_ios_universal_lib.sh
index 31df43a..fe05694 100755
--- a/tensorflow/contrib/lite/build_ios_universal_lib.sh
+++ b/tensorflow/contrib/lite/tools/make/build_ios_universal_lib.sh
@@ -17,23 +17,23 @@
 set -e
 
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-cd "$SCRIPT_DIR/../../.."
+cd "$SCRIPT_DIR/../../../../.."
 
 # Build library for supported architectures and packs them in a fat binary.
 make_library() {
     for arch in x86_64 armv7 armv7s arm64
     do
-        make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=${arch} \
-        -j 8 \
-        $SCRIPT_DIR/gen/lib/ios_${arch}/${1}
+        make -f tensorflow/contrib/lite/tools/make/Makefile TARGET=ios TARGET_ARCH=${arch} \
+        -j 8
     done
+    mkdir -p tensorflow/contrib/lite/tools/make/gen/lib
     lipo \
-    tensorflow/contrib/lite/gen/lib/ios_x86_64/${1} \
-    tensorflow/contrib/lite/gen/lib/ios_armv7/${1} \
-    tensorflow/contrib/lite/gen/lib/ios_armv7s/${1} \
-    tensorflow/contrib/lite/gen/lib/ios_arm64/${1} \
+    tensorflow/contrib/lite/tools/make/gen/ios_x86_64/lib/${1} \
+    tensorflow/contrib/lite/tools/make/gen/ios_armv7/lib/${1} \
+    tensorflow/contrib/lite/tools/make/gen/ios_armv7s/lib/${1} \
+    tensorflow/contrib/lite/tools/make/gen/ios_arm64/lib/${1} \
     -create \
-    -output tensorflow/contrib/lite/gen/lib/${1}
+    -output tensorflow/contrib/lite/tools/make/gen/lib/${1}
 }
 
 make_library libtensorflow-lite.a
diff --git a/tensorflow/contrib/lite/build_rpi_lib.sh b/tensorflow/contrib/lite/tools/make/build_rpi_lib.sh
similarity index 90%
rename from tensorflow/contrib/lite/build_rpi_lib.sh
rename to tensorflow/contrib/lite/tools/make/build_rpi_lib.sh
index 3824b16..24ecd43 100755
--- a/tensorflow/contrib/lite/build_rpi_lib.sh
+++ b/tensorflow/contrib/lite/tools/make/build_rpi_lib.sh
@@ -17,6 +17,6 @@
 set -e
 
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-cd "$SCRIPT_DIR/../../.."
+cd "$SCRIPT_DIR/../../../../.."
 
-CC_PREFIX=arm-linux-gnueabihf- make -j 3 -f tensorflow/contrib/lite/Makefile TARGET=RPI TARGET_ARCH=armv7
+CC_PREFIX=arm-linux-gnueabihf- make -j 3 -f tensorflow/contrib/lite/tools/make/Makefile TARGET=rpi TARGET_ARCH=armv7l
diff --git a/tensorflow/contrib/lite/download_dependencies.sh b/tensorflow/contrib/lite/tools/make/download_dependencies.sh
similarity index 97%
rename from tensorflow/contrib/lite/download_dependencies.sh
rename to tensorflow/contrib/lite/tools/make/download_dependencies.sh
index 8c7df47..29afa45 100755
--- a/tensorflow/contrib/lite/download_dependencies.sh
+++ b/tensorflow/contrib/lite/tools/make/download_dependencies.sh
@@ -17,9 +17,9 @@
 set -e
 
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-cd "$SCRIPT_DIR/../../.."
+cd "$SCRIPT_DIR/../../../../.."
 
-DOWNLOADS_DIR=tensorflow/contrib/lite/downloads
+DOWNLOADS_DIR=tensorflow/contrib/lite/tools/make/downloads
 BZL_FILE_PATH=tensorflow/workspace.bzl
 
 # Ensure it is being run from repo root
diff --git a/tensorflow/contrib/lite/ios_makefile.inc b/tensorflow/contrib/lite/tools/make/targets/ios_makefile.inc
similarity index 67%
rename from tensorflow/contrib/lite/ios_makefile.inc
rename to tensorflow/contrib/lite/tools/make/targets/ios_makefile.inc
index 0793205..7f36b8e 100644
--- a/tensorflow/contrib/lite/ios_makefile.inc
+++ b/tensorflow/contrib/lite/tools/make/targets/ios_makefile.inc
@@ -1,11 +1,11 @@
 # Settings for iOS.
-ifeq ($(TARGET), IOS)
-        BUILD_FOR_IOS_SIMULATOR := false
-	ifeq ($(IOS_ARCH), x86_64)
-	     	BUILD_FOR_IOS_SIMULATOR := true
+ifeq ($(TARGET), ios)
+  BUILD_FOR_IOS_SIMULATOR := false
+	ifeq ($(TARGET_ARCH), x86_64)
+	  BUILD_FOR_IOS_SIMULATOR := true
 	endif
-	ifeq ($(IOS_ARCH), i386)
-	     	BUILD_FOR_IOS_SIMULATOR := true
+	ifeq ($(TARGET_ARCH), i386)
+	  BUILD_FOR_IOS_SIMULATOR := true
 	endif
 	ifeq ($(BUILD_FOR_IOS_SIMULATOR), true)
 		IPHONEOS_PLATFORM := $(shell xcrun --sdk iphonesimulator \
@@ -18,8 +18,8 @@
 	endif
 	IOS_SDK_VERSION := $(shell xcrun --sdk iphoneos --show-sdk-version)
 	MIN_SDK_VERSION := 9.0
-	# Override IOS_ARCH with armv7, armv7s, arm64, i386, or x86_64.
-	IOS_ARCH := x86_64
+	# Override TARGET_ARCH with armv7, armv7s, arm64, i386, or x86_64.
+	TARGET_ARCH := x86_64
 	CXXFLAGS += -miphoneos-version-min=$(MIN_SDK_VERSION) \
 		-DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK \
 		-DTFLITE_USE_APPLE_ACCELERATE_FOR_CONV \
@@ -29,21 +29,17 @@
 		-fno-exceptions \
 		-isysroot \
 		${IPHONEOS_SYSROOT} \
-		-arch $(IOS_ARCH) \
+		-arch $(TARGET_ARCH) \
 		-O3
 	CCFLAGS += -miphoneos-version-min=$(MIN_SDK_VERSION) \
 		-fembed-bitcode \
 		-mno-thumb \
 		-isysroot \
 		${IPHONEOS_SYSROOT} \
-		-arch $(IOS_ARCH) \
+		-arch $(TARGET_ARCH) \
 		-O3
 	LDFLAGS := -fembed-bitcode \
 		-miphoneos-version-min=${MIN_SDK_VERSION} \
 		-framework Accelerate \
-		-arch $(IOS_ARCH)
-	OBJDIR := $(OBJDIR)ios_$(IOS_ARCH)/
-	LIBDIR := $(LIBDIR)ios_$(IOS_ARCH)/
-	BINDIR := $(BINDIR)ios_$(IOS_ARCH)/
-	DEPDIR := $(DEPDIR)ios_$(IOS_ARCH)/
+		-arch $(TARGET_ARCH)
 endif
diff --git a/tensorflow/contrib/lite/tools/make/targets/linux_makefile.inc b/tensorflow/contrib/lite/tools/make/targets/linux_makefile.inc
new file mode 100644
index 0000000..86499da
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/make/targets/linux_makefile.inc
@@ -0,0 +1,10 @@
+# Settings for Linux.
+ifeq ($(TARGET), linux)
+  CXXFLAGS += \
+    -fPIC \
+    -DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK \
+    -pthread
+  # TODO(petewarden): In the future we may want to add architecture-specific
+  # flags like -msse4.2
+	LIBS += -ldl
+endif
diff --git a/tensorflow/contrib/lite/tools/make/targets/riscv_makefile.inc b/tensorflow/contrib/lite/tools/make/targets/riscv_makefile.inc
new file mode 100644
index 0000000..1a82afe
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/make/targets/riscv_makefile.inc
@@ -0,0 +1,10 @@
+# Settings for RiscV platforms.
+ifeq ($(TARGET), riscv)
+  TARGET_ARCH := riscv
+  TARGET_TOOLCHAIN_PREFIX := riscv32-unknown-elf-
+
+  #CXXFLAGS += -march=gap8
+  CXXFLAGS += -DTFLITE_MCU
+	LIBS += -ldl
+	BUILD_TYPE := micro
+endif
diff --git a/tensorflow/contrib/lite/tools/make/targets/rpi_makefile.inc b/tensorflow/contrib/lite/tools/make/targets/rpi_makefile.inc
new file mode 100644
index 0000000..1ad0c50
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/make/targets/rpi_makefile.inc
@@ -0,0 +1,60 @@
+# Settings for Raspberry Pi.
+ifeq ($(TARGET),rpi)
+  # Default to the architecture used on the Pi Two/Three (ArmV7), but override this
+  # with TARGET_ARCH=armv6 to build for the Pi Zero or One.
+  TARGET_ARCH := armv7l
+  TARGET_TOOLCHAIN_PREFIX := arm-linux-gnueabihf-
+
+  ifeq ($(TARGET_ARCH), armv7l)
+    CXXFLAGS += \
+			-march=armv7-a \
+      -mfpu=neon-vfpv4 \
+      -funsafe-math-optimizations \
+      -ftree-vectorize \
+      -fPIC
+
+    CCFLAGS += \
+      -march=armv7-a \
+      -mfpu=neon-vfpv4 \
+      -funsafe-math-optimizations \
+      -ftree-vectorize \
+      -fPIC
+
+    LDFLAGS := \
+      -Wl,--no-export-dynamic \
+      -Wl,--exclude-libs,ALL \
+      -Wl,--gc-sections \
+      -Wl,--as-needed
+  endif
+
+  # TODO(petewarden) In the future, we'll want to use OpenBLAS as a faster
+  # alternative to Eigen on non-NEON ARM hardware like armv6.
+  ifeq ($(TARGET_ARCH), armv6)
+    CXXFLAGS += \
+      -march=armv6 \
+      -mfpu=vfp \
+      -funsafe-math-optimizations \
+      -ftree-vectorize \
+      -fPIC
+
+    CCFLAGS += \
+      -march=armv6 \
+      -mfpu=vfp \
+      -funsafe-math-optimizations \
+      -ftree-vectorize \
+      -fPIC
+
+    LDFLAGS := \
+      -Wl,--no-export-dynamic \
+      -Wl,--exclude-libs,ALL \
+      -Wl,--gc-sections \
+      -Wl,--as-needed
+  endif
+       
+  LIBS := \
+    -lstdc++ \
+    -lpthread \
+    -lm \
+    -ldl
+
+endif
diff --git a/tensorflow/contrib/lite/tools/make/targets/stm32f1_makefile.inc b/tensorflow/contrib/lite/tools/make/targets/stm32f1_makefile.inc
new file mode 100644
index 0000000..7418e4d
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/make/targets/stm32f1_makefile.inc
@@ -0,0 +1,21 @@
+# Settings for STM32F1 platforms.
+ifeq ($(TARGET), stm32f1)
+  TARGET_ARCH := armm1
+  TARGET_TOOLCHAIN_PREFIX := arm-none-eabi-
+
+  CXXFLAGS += \
+  -DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK \
+  -mcpu=cortex-m1 \
+  -mthumb \
+  -DTFLITE_MCU \
+  -fno-rtti \
+  -fmessage-length=0 \
+  -fno-exceptions \
+  -fno-builtin \
+  -ffunction-sections \
+  -fdata-sections \
+  -funsigned-char \
+  -MMD
+	LIBS += -ldl
+	BUILD_TYPE := micro
+endif
diff --git a/tensorflow/contrib/lite/tools/make/targets/stm32f7_makefile.inc b/tensorflow/contrib/lite/tools/make/targets/stm32f7_makefile.inc
new file mode 100644
index 0000000..48af71e
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/make/targets/stm32f7_makefile.inc
@@ -0,0 +1,41 @@
+# Settings for STM32F7 platforms.
+ifeq ($(TARGET), stm32f7)
+  TARGET_ARCH := armf7
+  TARGET_TOOLCHAIN_PREFIX := arm-none-eabi-
+
+  CXXFLAGS += \
+    -DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK \
+    -DTFLITE_MCU \
+    -fno-rtti \
+    -fmessage-length=0 \
+    -fno-exceptions \
+    -fno-builtin \
+    -ffunction-sections \
+    -fdata-sections \
+    -funsigned-char \
+    -MMD \
+    -mcpu=cortex-m7 \
+    -mthumb \
+    -mfpu=fpv5-sp-d16 \
+    -mfloat-abi=softfp \
+    -std=gnu++11 \
+    -fno-rtti \
+    -Wvla \
+    -c \
+    -Wall \
+    -Wextra \
+    -Wno-unused-parameter \
+    -Wno-missing-field-initializers \
+    -fmessage-length=0 \
+    -fno-exceptions \
+    -fno-builtin \
+    -ffunction-sections \
+    -fdata-sections \
+    -funsigned-char \
+    -MMD \
+    -fno-delete-null-pointer-checks \
+    -fomit-frame-pointer \
+    -Os
+ 	LIBS += -ldl
+	BUILD_TYPE := micro
+endif
diff --git a/tensorflow/contrib/lite/util.cc b/tensorflow/contrib/lite/util.cc
index 8ccb65c..7950653 100644
--- a/tensorflow/contrib/lite/util.cc
+++ b/tensorflow/contrib/lite/util.cc
@@ -14,8 +14,15 @@
 ==============================================================================*/
 #include "tensorflow/contrib/lite/util.h"
 
+#include <cstring>
+
 namespace tflite {
 
+bool IsEagerOp(const char* custom_name) {
+  return custom_name && strncmp(custom_name, kEagerCustomCodePrefix,
+                                strlen(kEagerCustomCodePrefix)) == 0;
+}
+
 TfLiteIntArray* ConvertVectorToTfLiteIntArray(const std::vector<int>& input) {
   return ConvertArrayToTfLiteIntArray(input.size(), input.data());
 }
diff --git a/tensorflow/contrib/lite/util.h b/tensorflow/contrib/lite/util.h
index 3c48011..f5b208a 100644
--- a/tensorflow/contrib/lite/util.h
+++ b/tensorflow/contrib/lite/util.h
@@ -26,6 +26,16 @@
 
 namespace tflite {
 
+// The prefix of Eager op custom code.
+// This will be matched against the `custom_code` field in `OperatorCode`
+// Flatbuffer Table.
+// WARNING: This is an experimental API and subject to change.
+constexpr char kEagerCustomCodePrefix[] = "Eager";
+
+// Checks whether the prefix of the custom name indicates the operation is an
+// Eager operation.
+bool IsEagerOp(const char* custom_name);
+
 // Converts a `std::vector` to a `TfLiteIntArray`. The caller takes ownership
 // of the returned pointer.
 TfLiteIntArray* ConvertVectorToTfLiteIntArray(const std::vector<int>& input);
diff --git a/tensorflow/contrib/lite/util_test.cc b/tensorflow/contrib/lite/util_test.cc
index 04579c5..32bf917 100644
--- a/tensorflow/contrib/lite/util_test.cc
+++ b/tensorflow/contrib/lite/util_test.cc
@@ -41,6 +41,16 @@
   TfLiteIntArrayFree(output);
 }
 
+TEST(UtilTest, IsEagerOp) {
+  EXPECT_TRUE(IsEagerOp("Eager"));
+  EXPECT_TRUE(IsEagerOp("EagerOp"));
+  EXPECT_FALSE(IsEagerOp("eager"));
+  EXPECT_FALSE(IsEagerOp("Eage"));
+  EXPECT_FALSE(IsEagerOp("OpEager"));
+  EXPECT_FALSE(IsEagerOp(nullptr));
+  EXPECT_FALSE(IsEagerOp(""));
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lookup/BUILD b/tensorflow/contrib/lookup/BUILD
index e3928a8..83e80f2 100644
--- a/tensorflow/contrib/lookup/BUILD
+++ b/tensorflow/contrib/lookup/BUILD
@@ -34,6 +34,7 @@
         ":lookup_py",
         "//third_party/py/numpy",
         "@six_archive//:six",
+        "//tensorflow/contrib/data",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
diff --git a/tensorflow/contrib/lookup/lookup_ops.py b/tensorflow/contrib/lookup/lookup_ops.py
index 4942d94..291972c 100644
--- a/tensorflow/contrib/lookup/lookup_ops.py
+++ b/tensorflow/contrib/lookup/lookup_ops.py
@@ -20,7 +20,6 @@
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import gen_lookup_ops
 from tensorflow.python.ops import lookup_ops
 # pylint: disable=unused-import
@@ -395,17 +394,12 @@
     Raises:
       TypeError: when `keys` do not match the table data types.
     """
-    if keys.dtype.base_dtype != self._key_dtype:
-      raise TypeError("Signature mismatch. Keys must be dtype %s, got %s." %
-                      (self._key_dtype, keys.dtype))
-
     with ops.name_scope(name, "%s_lookup_table_find" % self._name,
                         (self._table_ref, keys, self._default_value)) as name:
+      keys = ops.convert_to_tensor(keys, dtype=self._key_dtype, name="keys")
       with ops.colocate_with(self._table_ref):
         values = gen_lookup_ops.lookup_table_find_v2(
             self._table_ref, keys, self._default_value, name=name)
-
-        values.set_shape(keys.get_shape().concatenate(self._value_shape))
     return values
 
   def insert(self, keys, values, name=None):
@@ -425,11 +419,10 @@
       TypeError: when `keys` or `values` doesn't match the table data
         types.
     """
-    # pylint: disable=protected-access
-    lookup_ops._check_table_dtypes(self, keys.dtype, values.dtype)
-    # pylint: enable=protected-access
     with ops.name_scope(name, "%s_lookup_table_insert" % self._name,
                         [self._table_ref, keys, values]) as name:
+      keys = ops.convert_to_tensor(keys, self._key_dtype, name="keys")
+      values = ops.convert_to_tensor(values, self._value_dtype, name="values")
       with ops.colocate_with(self._table_ref):
         # pylint: disable=protected-access
         op = gen_lookup_ops.lookup_table_insert_v2(
@@ -451,9 +444,6 @@
       with ops.colocate_with(self._table_ref):
         exported_keys, exported_values = gen_lookup_ops.lookup_table_export_v2(
             self._table_ref, self._key_dtype, self._value_dtype, name=name)
-
-    exported_values.set_shape(exported_keys.get_shape().concatenate(
-        self._value_shape))
     return exported_keys, exported_values
 
   class _Saveable(BaseSaverBuilder.SaveableObject):
@@ -537,14 +527,15 @@
       ValueError: If checkpoint is True and no name was specified.
     """
     self._default_value = ops.convert_to_tensor(
-        default_value, dtype=value_dtype)
+        default_value, dtype=value_dtype, name="default_value")
     self._value_shape = self._default_value.get_shape()
 
     # The table must be shared if checkpointing is requested for multi-worker
     # training to work correctly. Use the node name if no shared_name has been
     # explicitly specified.
     use_node_name_sharing = checkpoint and shared_name is None
-    empty_key = ops.convert_to_tensor(empty_key, dtype=key_dtype)
+    empty_key = ops.convert_to_tensor(
+        empty_key, dtype=key_dtype, name="empty_key")
     self._table_ref = gen_lookup_ops.mutable_dense_hash_table_v2(
         empty_key=empty_key,
         shared_name=shared_name,
@@ -591,20 +582,13 @@
     Raises:
       TypeError: when `keys` do not match the table data types.
     """
-    if keys.dtype.base_dtype != self._key_dtype:
-      raise TypeError("Signature mismatch. Keys must be dtype %s, got %s." %
-                      (self._key_dtype, keys.dtype))
-
     with ops.name_scope(name, "%s_lookup_table_find" % self._name,
                         [self._table_ref, keys]) as name:
+      keys = ops.convert_to_tensor(keys, dtype=self._key_dtype, name="keys")
       with ops.colocate_with(self._table_ref):
         values = gen_lookup_ops.lookup_table_find_v2(
             self._table_ref, keys, self._default_value, name=name)
 
-    if keys.get_shape().ndims is not None and keys.get_shape().ndims > 0:
-      values.set_shape(
-          tensor_shape.TensorShape([keys.get_shape().dims[0]]).concatenate(
-              self._value_shape))
     return values
 
   def insert(self, keys, values, name=None):
@@ -624,11 +608,11 @@
       TypeError: when `keys` or `values` doesn't match the table data
         types.
     """
-    # pylint: disable=protected-access
-    lookup_ops._check_table_dtypes(self, keys.dtype, values.dtype)
-    # pylint: enable=protected-access
     with ops.name_scope(name, "%s_lookup_table_insert" % self._name,
                         [self._table_ref, keys, values]) as name:
+      keys = ops.convert_to_tensor(keys, dtype=self._key_dtype, name="keys")
+      values = ops.convert_to_tensor(
+          values, dtype=self._value_dtype, name="values")
       with ops.colocate_with(self._table_ref):
         op = gen_lookup_ops.lookup_table_insert_v2(
             self._table_ref, keys, values, name=name)
@@ -650,8 +634,6 @@
         exported_keys, exported_values = gen_lookup_ops.lookup_table_export_v2(
             self._table_ref, self._key_dtype, self._value_dtype, name=name)
 
-    exported_values.set_shape(exported_keys.get_shape().concatenate(
-        self._value_shape))
     return exported_keys, exported_values
 
   class _Saveable(BaseSaverBuilder.SaveableObject):
diff --git a/tensorflow/contrib/lookup/lookup_ops_test.py b/tensorflow/contrib/lookup/lookup_ops_test.py
index 8d510ed..81257e1 100644
--- a/tensorflow/contrib/lookup/lookup_ops_test.py
+++ b/tensorflow/contrib/lookup/lookup_ops_test.py
@@ -23,6 +23,7 @@
 import six
 
 from tensorflow.contrib import lookup
+from tensorflow.contrib.data.python.ops import counter
 from tensorflow.python.client import session
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
@@ -434,8 +435,10 @@
       self.assertAllEqual([[0, 1], [2, 3], [-1, -1]], result)
 
       exported_keys, exported_values = table.export()
-      self.assertAllEqual([None], exported_keys.get_shape().as_list())
-      self.assertAllEqual([None, 2], exported_values.get_shape().as_list())
+      self.assertAllEqual([None], exported_keys.get_shape().as_list(),
+                          msg="Saw shape %s" % exported_keys.shape)
+      self.assertAllEqual([None, 2], exported_values.get_shape().as_list(),
+                          msg="Saw shape %s" % exported_values.shape)
       # exported data is in the order of the internal map, i.e. undefined
       sorted_keys = np.sort(exported_keys.eval())
       sorted_values = np.sort(exported_values.eval())
@@ -644,11 +647,11 @@
                                       default_val)
 
       # insert with keys of the wrong type
-      with self.assertRaises(TypeError):
+      with self.assertRaises(ValueError):
         table.insert(constant_op.constant([4, 5, 6]), values).run()
 
       # insert with values of the wrong type
-      with self.assertRaises(TypeError):
+      with self.assertRaises(ValueError):
         table.insert(keys, constant_op.constant(["a", "b", "c"])).run()
 
       self.assertAllEqual(0, table.size().eval())
@@ -669,7 +672,7 @@
 
       # lookup with keys of the wrong type
       input_string = constant_op.constant([1, 2, 3], dtypes.int64)
-      with self.assertRaises(TypeError):
+      with self.assertRaises(ValueError):
         table.lookup(input_string).eval()
 
       # default value of the wrong type
@@ -853,7 +856,8 @@
 
       input_string = constant_op.constant([11, 12, 15], dtypes.int64)
       output = table.lookup(input_string)
-      self.assertAllEqual([3, 4], output.get_shape())
+      self.assertAllEqual(
+          [3, 4], output.shape, msg="Saw shape: %s" % output.shape)
 
       result = output.eval()
       self.assertAllEqual([[0, 1, 2, 3], [3, 4, 5, 6], [-1, -2, -3, -4]],
@@ -2394,5 +2398,60 @@
             hasher_spec=lookup.StrongHashSpec([None, 2]))
 
 
+class MutableHashTableBenchmark(test.Benchmark):
+
+  def _create_table(self):
+    return lookup.MutableHashTable(dtypes.int64, dtypes.float32, 0.0)
+
+  def benchmark_single_repeated_scalar_insert_scalar(self):
+    table = self._create_table()
+    value = variables.Variable(1.0)
+    insert = table.insert(0, value)
+    size = table.size()
+    with session.Session() as sess:
+      sess.run(value.initializer)
+      self.run_op_benchmark(sess, insert, burn_iters=10, min_iters=10000)
+      assert sess.run(size) == 1
+
+  def benchmark_many_repeated_scalar_insert_scalar(self):
+    table = self._create_table()
+    c = counter.Counter().make_one_shot_iterator().get_next()
+    value = variables.Variable(1.0)
+    insert = table.insert(c, value)
+    size = table.size()
+    with session.Session() as sess:
+      sess.run(value.initializer)
+      self.run_op_benchmark(sess, insert, burn_iters=10, min_iters=10000)
+      assert sess.run(size) >= 10000
+
+  def benchmark_single_repeated_batch_32_insert_scalar(self):
+    table = self._create_table()
+    value = variables.Variable([1.0] * 32)
+    insert = table.insert(list(range(32)), value)
+    size = table.size()
+    with session.Session() as sess:
+      sess.run(value.initializer)
+      self.run_op_benchmark(sess, insert, burn_iters=10, min_iters=1000)
+      assert sess.run(size) == 32
+
+  def benchmark_many_repeated_batch_32_insert_scalar(self):
+    table = self._create_table()
+    c = counter.Counter().make_one_shot_iterator().get_next()
+    value = variables.Variable([1.0] * 32)
+    insert = table.insert(32 * c + list(range(32)), value)
+    size = table.size()
+    with session.Session() as sess:
+      sess.run(value.initializer)
+      self.run_op_benchmark(sess, insert, burn_iters=10, min_iters=1000)
+      assert sess.run(size) >= 1000*32
+
+
+class MutableDenseHashTableBenchmark(MutableHashTableBenchmark):
+
+  def _create_table(self):
+    return lookup.MutableDenseHashTable(
+        dtypes.int64, dtypes.float32, default_value=0.0, empty_key=-1)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/losses/__init__.py b/tensorflow/contrib/losses/__init__.py
index db58647..92b380d 100644
--- a/tensorflow/contrib/losses/__init__.py
+++ b/tensorflow/contrib/losses/__init__.py
@@ -15,7 +15,7 @@
 
 """Ops for building neural network losses.
 
-See @{$python/contrib.losses}.
+See [Contrib Losses](https://tensorflow.org/api_guides/python/contrib.losses).
 """
 
 from __future__ import absolute_import
diff --git a/tensorflow/contrib/losses/python/losses/__init__.py b/tensorflow/contrib/losses/python/losses/__init__.py
index 6e9d1d4..1675387 100644
--- a/tensorflow/contrib/losses/python/losses/__init__.py
+++ b/tensorflow/contrib/losses/python/losses/__init__.py
@@ -14,7 +14,7 @@
 # ==============================================================================
 """Ops for building neural network losses.
 
-See @{$python/contrib.losses}.
+See [Contrib Losses](https://tensorflow.org/api_guides/python/contrib.losses).
 """
 
 from __future__ import absolute_import
diff --git a/tensorflow/contrib/losses/python/metric_learning/__init__.py b/tensorflow/contrib/losses/python/metric_learning/__init__.py
index 4e551d6..3d93a4d 100644
--- a/tensorflow/contrib/losses/python/metric_learning/__init__.py
+++ b/tensorflow/contrib/losses/python/metric_learning/__init__.py
@@ -14,7 +14,7 @@
 # ==============================================================================
 """Ops for building neural network losses.
 
-See @{$python/contrib.losses}.
+See [Contrib Losses](https://tensorflow.org/api_guides/python/contrib.losses).
 """
 
 from __future__ import absolute_import
@@ -35,5 +35,3 @@
     'triplet_semihard_loss',
 ]
 remove_undocumented(__name__, _allowed_symbols)
-
-
diff --git a/tensorflow/contrib/makefile/compile_nsync.sh b/tensorflow/contrib/makefile/compile_nsync.sh
index a28fc3a..cb4c94d 100755
--- a/tensorflow/contrib/makefile/compile_nsync.sh
+++ b/tensorflow/contrib/makefile/compile_nsync.sh
@@ -256,6 +256,7 @@
                 esac
 
                 makefile='
+			AR := ${NDK_ROOT}/toolchains/'"$toolchain"'/prebuilt/'"$android_os_arch"'/bin/'"$bin_prefix"'-ar
                         CC=${CC_PREFIX} \
                            ${NDK_ROOT}/toolchains/'"$toolchain"'/prebuilt/'"$android_os_arch"'/bin/'"$bin_prefix"'-g++
                         PLATFORM_CPPFLAGS=--sysroot \
diff --git a/tensorflow/contrib/makefile/download_dependencies.sh b/tensorflow/contrib/makefile/download_dependencies.sh
index 448ae6d..dc9b17a 100755
--- a/tensorflow/contrib/makefile/download_dependencies.sh
+++ b/tensorflow/contrib/makefile/download_dependencies.sh
@@ -35,7 +35,9 @@
 # process. For now we're hardcoding to the version which is used by
 # TensorFlow 1.9.
 PROTOBUF_URL="https://mirror.bazel.build/github.com/google/protobuf/archive/396336eb961b75f03b25824fe86cf6490fb75e3a.tar.gz"
-RE2_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/re2/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)"
+# TODO (yongtang): Replace the following with 'https://mirror.bazel.build/github.com/google/re2/.*tar\.gz' once
+# the archive has been propagated in mirror.bazel.build.
+RE2_URL="$(grep -o 'https://github.com/google/re2/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)"
 FFT2D_URL="$(grep -o 'http.*fft\.tgz' "${BZL_FILE_PATH}" | grep -v bazel-mirror | head -n1)"
 DOUBLE_CONVERSION_URL="$(grep -o "https.*google/double-conversion.*\.zip" "${BZL_FILE_PATH}" | head -n1)"
 ABSL_URL="$(grep -o 'https://github.com/abseil/abseil-cpp/.*tar.gz' "${BZL_FILE_PATH}" | head -n1)"
diff --git a/tensorflow/contrib/metrics/__init__.py b/tensorflow/contrib/metrics/__init__.py
index 88798d6..5645784 100644
--- a/tensorflow/contrib/metrics/__init__.py
+++ b/tensorflow/contrib/metrics/__init__.py
@@ -14,7 +14,9 @@
 # ==============================================================================
 """Ops for evaluation metrics and summary statistics.
 
-See the @{$python/contrib.metrics} guide.
+See the
+[Contrib Metrics](https://tensorflow.org/api_guides/python/contrib.metrics)
+guide.
 
 @@auc_with_confidence_intervals
 @@streaming_accuracy
diff --git a/tensorflow/contrib/metrics/python/metrics/classification.py b/tensorflow/contrib/metrics/python/metrics/classification.py
index e553612..7053907 100644
--- a/tensorflow/contrib/metrics/python/metrics/classification.py
+++ b/tensorflow/contrib/metrics/python/metrics/classification.py
@@ -24,7 +24,7 @@
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import metrics_impl
 from tensorflow.python.ops import variable_scope
-from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.training import distribution_strategy_context
 
 # TODO(nsilberman): move into metrics/python/ops/
 
@@ -174,7 +174,7 @@
         ops.add_to_collections(metrics_collections, best_f1)
       return best_f1
 
-    best_f1 = distribute_lib.get_tower_context().merge_call(
+    best_f1 = distribution_strategy_context.get_tower_context().merge_call(
         f1_across_towers, values)
 
     update_op = compute_best_f1_score(tp=update_ops['tp'], fp=update_ops['fp'],
diff --git a/tensorflow/contrib/mixed_precision/python/loss_scale_manager.py b/tensorflow/contrib/mixed_precision/python/loss_scale_manager.py
index be7377b..eba5058 100644
--- a/tensorflow/contrib/mixed_precision/python/loss_scale_manager.py
+++ b/tensorflow/contrib/mixed_precision/python/loss_scale_manager.py
@@ -41,12 +41,12 @@
      applied on variables.
 
   This class is used together with
-  @{tf.contrib.mixed_precision.LossScaleOptimizer} for mixed precision training
+  `tf.contrib.mixed_precision.LossScaleOptimizer` for mixed precision training
   (float32 variables and float16 ops) on Nvidia GPUs in order to achieve the
   same model quality as single precision training, with the benefits of
   potential higher throughput.
 
-  See @{tf.contrib.mixed_precision.LossScaleOptimizer} for more details.
+  See `tf.contrib.mixed_precision.LossScaleOptimizer` for more details.
   """
 
   @abc.abstractmethod
diff --git a/tensorflow/contrib/mixed_precision/python/loss_scale_optimizer.py b/tensorflow/contrib/mixed_precision/python/loss_scale_optimizer.py
index 93050a3..fcce52a 100644
--- a/tensorflow/contrib/mixed_precision/python/loss_scale_optimizer.py
+++ b/tensorflow/contrib/mixed_precision/python/loss_scale_optimizer.py
@@ -103,7 +103,7 @@
 
     Args:
       opt: The actual optimizer that will be used to compute and apply the
-        gradients. Must be an implementation of the @{tf.train.Optimizer}
+        gradients. Must be an implementation of the `tf.train.Optimizer`
         interface.
       loss_scale_manager: A LossScaleManager object.
     """
@@ -117,7 +117,7 @@
                         aggregation_method=None,
                         colocate_gradients_with_ops=False,
                         grad_loss=None):
-    """Compute gradients. See base class @{tf.train.Optimizer}."""
+    """Compute gradients. See base class `tf.train.Optimizer`."""
     loss_scale = self._loss_scale_manager.get_loss_scale()
     if context.executing_eagerly():
 
@@ -141,7 +141,7 @@
     return self._down_scale(grads_and_vars, loss_scale)
 
   def apply_gradients(self, grads_and_vars, global_step=None, name=None):
-    """Apply gradients. See base class @{tf.train.Optimizer}."""
+    """Apply gradients. See base class `tf.train.Optimizer`."""
     grads = [g for (g, _) in grads_and_vars]
 
     is_finite_grad = []
diff --git a/tensorflow/contrib/model_pruning/BUILD b/tensorflow/contrib/model_pruning/BUILD
index 54bd39a..16ddc38 100644
--- a/tensorflow/contrib/model_pruning/BUILD
+++ b/tensorflow/contrib/model_pruning/BUILD
@@ -95,6 +95,22 @@
     ],
 )
 
+py_library(
+    name = "strip_pruning_vars_lib",
+    srcs = ["python/strip_pruning_vars_lib.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":pruning",
+        "//tensorflow/python:client",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:training",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
+    ],
+)
+
 py_test(
     name = "pruning_utils_test",
     size = "small",
@@ -129,6 +145,31 @@
     ],
 )
 
+py_test(
+    name = "strip_pruning_vars_test",
+    size = "small",
+    srcs = ["python/strip_pruning_vars_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":layers",
+        ":pruning",
+        ":rnn_cells",
+        ":strip_pruning_vars_lib",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_binary(
+    name = "strip_pruning_vars",
+    srcs = ["python/strip_pruning_vars.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":strip_pruning_vars_lib",
+        "//tensorflow/python:platform",
+    ],
+)
+
 py_library(
     name = "init_py",
     srcs = ["__init__.py"],
@@ -145,5 +186,6 @@
         ":learning",
         ":pruning",
         ":rnn_cells",
+        ":strip_pruning_vars_lib",
     ],
 )
diff --git a/tensorflow/contrib/model_pruning/README.md b/tensorflow/contrib/model_pruning/README.md
index dbe4e12..a5267fd 100644
--- a/tensorflow/contrib/model_pruning/README.md
+++ b/tensorflow/contrib/model_pruning/README.md
@@ -4,7 +4,15 @@
 neural network's weight tensors. The API helps inject necessary tensorflow op
 into the training graph so the model can be pruned while it is being trained.
 
-### Model creation
+## Table of contents
+1. [Model creation](#model-creation)
+2. [Hyperparameters for pruning](#hyperparameters)
+  - [Block sparsity](#block-sparsity)
+3. [Adding pruning ops to the training graph](#adding-pruning-ops)
+4. [Removing pruning ops from trained model](#remove)
+5. [Example](#example)
+
+### Model creation <a name="model-creation"></a>
 
 The first step involves adding mask and threshold variables to the layers that
 need to undergo pruning. The variable mask is the same shape as the layer's
@@ -33,7 +41,7 @@
 
 *   [rnn_cells.MaskedLSTMCell](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/model_pruning/python/layers/rnn_cells.py?l=154)
 
-### Adding pruning ops to the training graph
+### Pruning-related hyperparameters <a name="hyperparameters"></a>
 
 The pruning library allows for specification of the following hyper parameters:
 
@@ -64,7 +72,13 @@
 t$$). $$s_f$$ is the target_sparsity, $$s_i$$ is the initial_sparsity, $$t_0$$
 is the sparsity_function_begin_step. In this equation, the
 sparsity_function_exponent is set to 3.
-### Adding pruning ops to the training graph
+
+#### Block Sparsity <a name="block-sparsity"></a>
+
+For some hardware architectures, it may be beneficial to induce spatially correlated sparsity. To train models in which the weight tensors have block sparse structure, set *block_height* and *block_width* hyperparameters to the desired block configuration (2x2, 4x4, 4x1, 1x8, etc). Currently, block sparsity is only supported for weight tensors which can be squeezed to rank 2. The matrix is partitioned into non-overlapping blocks of size *[block_height, block_dim]* and either the average or max absolute value in this block is taken as a proxy for the entire block (set by *block_pooling_function* hyperparameter).
+The convolution layer tensors are always pruned using block dimensions of [1,1].
+
+### Adding pruning ops to the training graph <a name="adding-pruning-ops"></a>
 
 The final step involves adding ops to the training graph that monitor the
 distribution of the layer's weight magnitudes and determine the layer threshold,
@@ -105,7 +119,19 @@
 ```
 Ensure that `global_step` is being [incremented](https://www.tensorflow.org/api_docs/python/tf/train/Optimizer#minimize), otherwise pruning will not work!
 
-## Example: Pruning and training deep CNNs on the cifar10 dataset
+### Removing pruning ops from the trained graph <a name="remove"></a>
+Once the model is trained, it is necessary to remove the auxiliary variables (mask, threshold) and pruning ops added to the graph in the steps above. This can be accomplished using the `strip_pruning_vars` utility.
+
+This utility generates a binary GraphDef in which the variables have been converted to constants. In particular, the threshold variables are removed from the graph and the mask variable is fused with the corresponding weight tensor to produce a `masked_weight` tensor. This tensor is sparse, has the same size as the weight tensor, and the sparsity is as set by the `target_sparsity` or the `weight_sparsity_map` hyperparameters above.
+
+```shell
+$ bazel build -c opt contrib/model_pruning:strip_pruning_vars
+$ bazel-bin/contrib/model_pruning/strip_pruning_vars --checkpoint_dir=/path/to/checkpoints/ --output_node_names=graph_node1,graph_node2 --output_dir=/tmp --filename=pruning_stripped.pb
+```
+
+For now, it is assumed that the underlying hardware platform will provide mechanisms for compressing the sparse tensors and/or accelerating the sparse tensor computations.
+
+## Example: Pruning and training deep CNNs on the cifar10 dataset <a name="example"></a>
 
 Please see https://www.tensorflow.org/tutorials/deep_cnn for details on neural
 network architecture, setting up inputs etc. The additional changes needed to
@@ -121,7 +147,7 @@
 
 To train the pruned version of cifar10:
 
-```bash
+```shell
 $ examples_dir=contrib/model_pruning/examples
 $ bazel build -c opt $examples_dir/cifar10:cifar10_{train,eval}
 $ bazel-bin/$examples_dir/cifar10/cifar10_train --pruning_hparams=name=cifar10_pruning,begin_pruning_step=10000,end_pruning_step=100000,target_sparsity=0.9,sparsity_function_begin_step=10000,sparsity_function_end_step=100000
@@ -133,10 +159,14 @@
 $ bazel-bin/$examples_dir/cifar10/cifar10_eval --run_once
 ```
 
-### Block Sparsity
+Removing pruning nodes from the trained graph:
 
-For some hardware architectures, it may be beneficial to induce spatially correlated sparsity. To train models in which the weight tensors have block sparse structure, set *block_height* and *block_width* hyperparameters to the desired block configuration (2x2, 4x4, 4x1, 1x8, etc). Currently, block sparsity is only supported for weight tensors which can be squeezed to rank 2. The matrix is partitioned into non-overlapping blocks of size *[block_height, block_dim]* and the either the average or max absolute value in this block is taken as a proxy for the entire block (set by *block_pooling_function* hyperparameter).
-The convolution layer tensors are always pruned used block dimensions of [1,1].
+```shell
+$ bazel build -c opt contrib/model_pruning:strip_pruning_vars
+$ bazel-bin/contrib/model_pruning/strip_pruning_vars --checkpoint_path=/tmp/cifar10_train --output_node_names=softmax_linear/softmax_linear_2 --filename=cifar_pruned.pb
+```
+
+The generated GraphDef (cifar_pruned.pb) may be visualized using the [`import_pb_to_tensorboard`](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/python/tools/import_pb_to_tensorboard.py) utility.
 
 ## References
 
diff --git a/tensorflow/contrib/model_pruning/__init__.py b/tensorflow/contrib/model_pruning/__init__.py
index d32bedb..6eca54a 100644
--- a/tensorflow/contrib/model_pruning/__init__.py
+++ b/tensorflow/contrib/model_pruning/__init__.py
@@ -33,6 +33,9 @@
 from tensorflow.contrib.model_pruning.python.pruning import get_weight_sparsity
 from tensorflow.contrib.model_pruning.python.pruning import get_weights
 from tensorflow.contrib.model_pruning.python.pruning import Pruning
+from tensorflow.contrib.model_pruning.python.strip_pruning_vars_lib import graph_def_from_checkpoint
+from tensorflow.contrib.model_pruning.python.strip_pruning_vars_lib import strip_pruning_vars_fn
+
 # pylint: enable=unused-import
 
 from tensorflow.python.util.all_util import remove_undocumented
@@ -41,7 +44,8 @@
     'masked_convolution', 'masked_conv2d', 'masked_fully_connected',
     'MaskedBasicLSTMCell', 'MaskedLSTMCell', 'train', 'apply_mask',
     'get_masked_weights', 'get_masks', 'get_pruning_hparams', 'get_thresholds',
-    'get_weights', 'get_weight_sparsity', 'Pruning'
+    'get_weights', 'get_weight_sparsity', 'Pruning', 'strip_pruning_vars_fn',
+    'graph_def_from_checkpoint'
 ]
 
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/model_pruning/python/layers/layers.py b/tensorflow/contrib/model_pruning/python/layers/layers.py
index 466daf2..d453e35 100644
--- a/tensorflow/contrib/model_pruning/python/layers/layers.py
+++ b/tensorflow/contrib/model_pruning/python/layers/layers.py
@@ -139,7 +139,7 @@
       with "NC".
     num_outputs: Integer, the number of output filters.
     kernel_size: A sequence of N positive integers specifying the spatial
-      dimensions of of the filters.  Can be a single integer to specify the same
+      dimensions of the filters.  Can be a single integer to specify the same
       value for all spatial dimensions.
     stride: A sequence of N positive integers specifying the stride at which to
       compute output.  Can be a single integer to specify the same value for all
diff --git a/tensorflow/contrib/model_pruning/python/layers/rnn_cells.py b/tensorflow/contrib/model_pruning/python/layers/rnn_cells.py
index a5b050d..5f6c6ae 100644
--- a/tensorflow/contrib/model_pruning/python/layers/rnn_cells.py
+++ b/tensorflow/contrib/model_pruning/python/layers/rnn_cells.py
@@ -48,7 +48,7 @@
   It does not allow cell clipping, a projection layer, and does not
   use peep-hole connections: it is the basic baseline.
 
-  For advanced models, please use the full @{tf.nn.rnn_cell.LSTMCell}
+  For advanced models, please use the full `tf.nn.rnn_cell.LSTMCell`
   that follows.
   """
 
diff --git a/tensorflow/contrib/model_pruning/python/pruning.py b/tensorflow/contrib/model_pruning/python/pruning.py
index 723dab9..cd58526e 100644
--- a/tensorflow/contrib/model_pruning/python/pruning.py
+++ b/tensorflow/contrib/model_pruning/python/pruning.py
@@ -237,6 +237,9 @@
     # Pruning specification
     self._spec = spec if spec else get_pruning_hparams()
 
+    # Sanity check for pruning hparams
+    self._validate_spec()
+
     # A tensorflow variable that tracks the sparsity function.
     # If not provided as input, the graph must already contain the global_step
     # variable before calling this constructor.
@@ -262,6 +265,34 @@
     # Mapping of weight names and target sparsity
     self._weight_sparsity_map = self._get_weight_sparsity_map()
 
+  def _validate_spec(self):
+    spec = self._spec
+    if spec.begin_pruning_step < 0:
+      raise ValueError('Illegal value for begin_pruning_step')
+
+    if spec.begin_pruning_step >= spec.end_pruning_step:
+      if spec.end_pruning_step != -1:
+        raise ValueError(
+            'Pruning must begin before it can end. begin_step=%d, end_step=%d.'
+            ' Set end_pruning_step to -1 if pruning is required till training'
+            ' stops' % (spec.begin_pruning_step, spec.end_pruning_step))
+
+    if spec.sparsity_function_begin_step < 0:
+      raise ValueError('Illegal value for sparsity_function_begin_step')
+
+    if spec.sparsity_function_begin_step >= spec.sparsity_function_end_step:
+      raise ValueError(
+          'Sparsity function requires begin_step < end_step')
+
+    if not 0.0 <= spec.threshold_decay < 1.0:
+      raise ValueError('threshold_decay must be in range [0,1)')
+
+    if not 0.0 <= spec.initial_sparsity < 1.0:
+      raise ValueError('initial_sparsity must be in range [0,1)')
+
+    if not 0.0 <= spec.target_sparsity < 1.0:
+      raise ValueError('target_sparsity must be in range [0,1)')
+
   def _setup_global_step(self, global_step):
     graph_global_step = global_step
     if graph_global_step is None:
@@ -276,11 +307,6 @@
     target_sparsity = self._spec.target_sparsity
     exponent = self._spec.sparsity_function_exponent
 
-    if begin_step >= end_step:
-      raise ValueError(
-          'Pruning must begin before it can end. begin_step=%d, end_step=%d' %
-          (begin_step, end_step))
-
     with ops.name_scope(self._spec.name):
       p = math_ops.minimum(
           1.0,
diff --git a/tensorflow/contrib/model_pruning/python/pruning_test.py b/tensorflow/contrib/model_pruning/python/pruning_test.py
index 5b67656..33c4ad5 100644
--- a/tensorflow/contrib/model_pruning/python/pruning_test.py
+++ b/tensorflow/contrib/model_pruning/python/pruning_test.py
@@ -60,7 +60,6 @@
     self.assertEqual(p._weight_sparsity_map["conv1"], 0.8)
     self.assertEqual(p._weight_sparsity_map["conv2/kernel"], 0.8)
 
-
   def testInitWithExternalSparsity(self):
     with self.test_session():
       p = pruning.Pruning(spec=self.pruning_hparams, sparsity=self.sparsity)
diff --git a/tensorflow/contrib/model_pruning/python/strip_pruning_vars.py b/tensorflow/contrib/model_pruning/python/strip_pruning_vars.py
new file mode 100644
index 0000000..3385103
--- /dev/null
+++ b/tensorflow/contrib/model_pruning/python/strip_pruning_vars.py
@@ -0,0 +1,103 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+r"""Removes the auxiliary variables and ops added by the pruning library.
+
+Usage:
+
+bazel build tensorflow/contrib/model_pruning:strip_pruning_vars && \
+bazel-bin/tensorflow/contrib/model_pruning/strip_pruning_vars \
+--checkpoint_dir=/tmp/model_ckpts \
+--output_node_names=softmax \
+--output_dir=/tmp \
+--filename=pruning_stripped.pb
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import os
+import sys
+
+from tensorflow.contrib.model_pruning.python import strip_pruning_vars_lib
+from tensorflow.python.framework import graph_io
+from tensorflow.python.platform import app
+from tensorflow.python.platform import tf_logging as logging
+
+FLAGS = None
+
+
+def strip_pruning_vars(checkpoint_dir, output_node_names, output_dir, filename):
+  """Remove pruning-related auxiliary variables and ops from the graph.
+
+  Accepts training checkpoints and produces a GraphDef in which the pruning vars
+  and ops have been removed.
+
+  Args:
+    checkpoint_dir: Path to the checkpoints.
+    output_node_names: The name of the output nodes, comma separated.
+    output_dir: Directory where to write the graph.
+    filename: Output GraphDef file name.
+
+  Returns:
+    None
+
+  Raises:
+    ValueError: if output_nodes_names are not provided.
+  """
+  if not output_node_names:
+    raise ValueError(
+        'Need to specify at least 1 output node through output_node_names flag')
+  output_node_names = output_node_names.replace(' ', '').split(',')
+
+  initial_graph_def = strip_pruning_vars_lib.graph_def_from_checkpoint(
+      checkpoint_dir, output_node_names)
+
+  final_graph_def = strip_pruning_vars_lib.strip_pruning_vars_fn(
+      initial_graph_def, output_node_names)
+  graph_io.write_graph(final_graph_def, output_dir, filename, as_text=False)
+  logging.info('\nFinal graph written to %s', os.path.join(
+      output_dir, filename))
+
+
+def main(unused_args):
+  return strip_pruning_vars(FLAGS.checkpoint_dir, FLAGS.output_node_names,
+                            FLAGS.output_dir, FLAGS.filename)
+
+
+if __name__ == '__main__':
+  parser = argparse.ArgumentParser()
+  parser.register('type', 'bool', lambda v: v.lower() == 'true')
+  parser.add_argument(
+      '--checkpoint_dir', type=str, default='', help='Path to the checkpoints.')
+  parser.add_argument(
+      '--output_node_names',
+      type=str,
+      default='',
+      help='The name of the output nodes, comma separated.')
+  parser.add_argument(
+      '--output_dir',
+      type=str,
+      default='/tmp',
+      help='Directory where to write the graph.')
+  parser.add_argument(
+      '--filename',
+      type=str,
+      default='pruning_stripped.pb',
+      help='Output \'GraphDef\' file name.')
+
+  FLAGS, unparsed = parser.parse_known_args()
+  app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/contrib/model_pruning/python/strip_pruning_vars_lib.py b/tensorflow/contrib/model_pruning/python/strip_pruning_vars_lib.py
new file mode 100644
index 0000000..fc4b108
--- /dev/null
+++ b/tensorflow/contrib/model_pruning/python/strip_pruning_vars_lib.py
@@ -0,0 +1,142 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities to remove pruning-related ops and variables from a GraphDef.
+"""
+
+# pylint: disable=missing-docstring
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.core.framework import attr_value_pb2
+from tensorflow.core.framework import graph_pb2
+from tensorflow.core.framework import node_def_pb2
+from tensorflow.python.client import session
+from tensorflow.python.framework import graph_util
+from tensorflow.python.framework import importer
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import saver as saver_lib
+
+
+def _node_name(tensor_name):
+  """Remove the trailing ':0' from the variable name."""
+  if ':' not in tensor_name:
+    return tensor_name
+
+  return tensor_name.split(':')[0]
+
+
+def _tensor_name(node_name):
+  """Appends the :0 in the op name to get the canonical tensor name."""
+  if ':' in node_name:
+    return node_name
+
+  return node_name + ':0'
+
+
+def _get_masked_weights(input_graph_def):
+  """Extracts masked_weights from the graph as a dict of {var_name:ndarray}."""
+  input_graph = ops.Graph()
+  with input_graph.as_default():
+    importer.import_graph_def(input_graph_def, name='')
+
+    with session.Session(graph=input_graph) as sess:
+      masked_weights_dict = {}
+      for node in input_graph_def.node:
+        if 'masked_weight' in node.name:
+          masked_weight_val = sess.run(
+              sess.graph.get_tensor_by_name(_tensor_name(node.name)))
+          logging.info(
+              '%s has %d values, %1.2f%% zeros \n', node.name,
+              np.size(masked_weight_val),
+              100 - float(100 * np.count_nonzero(masked_weight_val)) /
+              np.size(masked_weight_val))
+          masked_weights_dict.update({node.name: masked_weight_val})
+  return masked_weights_dict
+
+
+def strip_pruning_vars_fn(input_graph_def, output_node_names):
+  """Removes mask variable from the graph.
+
+  Replaces the masked_weight tensor with element-wise multiplication of mask
+  and the corresponding weight variable.
+
+  Args:
+    input_graph_def: A GraphDef in which the variables have been converted to
+      constants. This is typically the output of
+      tf.graph_util.convert_variables_to_constant()
+    output_node_names: List of name strings for the result nodes of the graph
+
+  Returns:
+    A GraphDef in which pruning-related variables have been removed
+  """
+  masked_weights_dict = _get_masked_weights(input_graph_def)
+  pruned_graph_def = graph_pb2.GraphDef()
+
+  # Replace masked_weight with a const op containing the
+  # result of tf.multiply(mask,weight)
+  for node in input_graph_def.node:
+    output_node = node_def_pb2.NodeDef()
+    if 'masked_weight' in node.name:
+      output_node.op = 'Const'
+      output_node.name = node.name
+      dtype = node.attr['T']
+      data = masked_weights_dict[node.name]
+      output_node.attr['dtype'].CopyFrom(dtype)
+      output_node.attr['value'].CopyFrom(
+          attr_value_pb2.AttrValue(tensor=tensor_util.make_tensor_proto(data)))
+
+    else:
+      output_node.CopyFrom(node)
+    pruned_graph_def.node.extend([output_node])
+
+  # Remove stranded nodes: mask and weights
+  return graph_util.extract_sub_graph(pruned_graph_def, output_node_names)
+
+
+def graph_def_from_checkpoint(checkpoint_dir, output_node_names):
+  """Converts checkpoint data to GraphDef.
+
+  Reads the latest checkpoint data and produces a GraphDef in which the
+  variables have been converted to constants.
+
+  Args:
+    checkpoint_dir: Path to the checkpoints.
+    output_node_names: List of name strings for the result nodes of the graph.
+
+  Returns:
+    A GraphDef from the latest checkpoint
+
+  Raises:
+    ValueError: if no checkpoint is found
+  """
+  checkpoint_path = saver_lib.latest_checkpoint(checkpoint_dir)
+  if checkpoint_path is None:
+    raise ValueError('Could not find a checkpoint at: {0}.'
+                     .format(checkpoint_dir))
+
+  saver_for_restore = saver_lib.import_meta_graph(
+      checkpoint_path + '.meta', clear_devices=True)
+  with session.Session() as sess:
+    saver_for_restore.restore(sess, checkpoint_path)
+    graph_def = ops.get_default_graph().as_graph_def()
+    output_graph_def = graph_util.convert_variables_to_constants(
+        sess, graph_def, output_node_names)
+
+  return output_graph_def
diff --git a/tensorflow/contrib/model_pruning/python/strip_pruning_vars_test.py b/tensorflow/contrib/model_pruning/python/strip_pruning_vars_test.py
new file mode 100644
index 0000000..255daa0
--- /dev/null
+++ b/tensorflow/contrib/model_pruning/python/strip_pruning_vars_test.py
@@ -0,0 +1,232 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for strip_pruning_vars."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import re
+
+from tensorflow.contrib.model_pruning.python import pruning
+from tensorflow.contrib.model_pruning.python import strip_pruning_vars_lib
+from tensorflow.contrib.model_pruning.python.layers import layers
+from tensorflow.contrib.model_pruning.python.layers import rnn_cells
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import graph_util
+from tensorflow.python.framework import importer
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import rnn
+from tensorflow.python.ops import rnn_cell as tf_rnn_cells
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.training import training_util
+
+
+def _get_number_pruning_vars(graph_def):
+  number_vars = 0
+  for node in graph_def.node:
+    if re.match(r"^.*(mask$)|(threshold$)", node.name):
+      number_vars += 1
+  return number_vars
+
+
+def _get_node_names(tensor_names):
+  return [
+      strip_pruning_vars_lib._node_name(tensor_name)
+      for tensor_name in tensor_names
+  ]
+
+
+class StripPruningVarsTest(test.TestCase):
+
+  def setUp(self):
+    param_list = [
+        "pruning_frequency=1", "begin_pruning_step=1", "end_pruning_step=10",
+        "nbins=2048", "threshold_decay=0.0"
+    ]
+    self.initial_graph = ops.Graph()
+    self.initial_graph_def = None
+    self.final_graph = ops.Graph()
+    self.final_graph_def = None
+    self.pruning_spec = ",".join(param_list)
+    with self.initial_graph.as_default():
+      self.sparsity = variables.Variable(0.5, name="sparsity")
+      self.global_step = training_util.get_or_create_global_step()
+      self.increment_global_step = state_ops.assign_add(self.global_step, 1)
+      self.mask_update_op = None
+
+  def _build_convolutional_model(self, number_of_layers):
+    # Create a graph with several conv2d layers
+    kernel_size = 3
+    base_depth = 4
+    depth_step = 7
+    height, width = 7, 9
+    with variable_scope.variable_scope("conv_model"):
+      input_tensor = array_ops.ones((8, height, width, base_depth))
+      top_layer = input_tensor
+      for ix in range(number_of_layers):
+        top_layer = layers.masked_conv2d(
+            top_layer,
+            base_depth + (ix + 1) * depth_step,
+            kernel_size,
+            scope="Conv_" + str(ix))
+
+    return top_layer
+
+  def _build_fully_connected_model(self, number_of_layers):
+    base_depth = 4
+    depth_step = 7
+
+    input_tensor = array_ops.ones((8, base_depth))
+
+    top_layer = input_tensor
+
+    with variable_scope.variable_scope("fc_model"):
+      for ix in range(number_of_layers):
+        top_layer = layers.masked_fully_connected(
+            top_layer, base_depth + (ix + 1) * depth_step)
+
+    return top_layer
+
+  def _build_lstm_model(self, number_of_layers):
+    batch_size = 8
+    dim = 10
+    inputs = variables.Variable(random_ops.random_normal([batch_size, dim]))
+
+    def lstm_cell():
+      return rnn_cells.MaskedBasicLSTMCell(
+          dim, forget_bias=0.0, state_is_tuple=True, reuse=False)
+
+    cell = tf_rnn_cells.MultiRNNCell(
+        [lstm_cell() for _ in range(number_of_layers)], state_is_tuple=True)
+
+    outputs = rnn.static_rnn(
+        cell, [inputs],
+        initial_state=cell.zero_state(batch_size, dtypes.float32))
+
+    return outputs
+
+  def _prune_model(self, session):
+    pruning_hparams = pruning.get_pruning_hparams().parse(self.pruning_spec)
+    p = pruning.Pruning(pruning_hparams, sparsity=self.sparsity)
+    self.mask_update_op = p.conditional_mask_update_op()
+
+    variables.global_variables_initializer().run()
+    for _ in range(20):
+      session.run(self.mask_update_op)
+      session.run(self.increment_global_step)
+
+  def _get_outputs(self, session, input_graph, tensors_list, graph_prefix=None):
+    outputs = []
+
+    for output_tensor in tensors_list:
+      if graph_prefix:
+        output_tensor = graph_prefix + "/" + output_tensor
+      outputs.append(
+          session.run(session.graph.get_tensor_by_name(output_tensor)))
+
+    return outputs
+
+  def _get_initial_outputs(self, output_tensor_names_list):
+    with self.test_session(graph=self.initial_graph) as sess1:
+      self._prune_model(sess1)
+      reference_outputs = self._get_outputs(sess1, self.initial_graph,
+                                            output_tensor_names_list)
+
+      self.initial_graph_def = graph_util.convert_variables_to_constants(
+          sess1, sess1.graph.as_graph_def(),
+          _get_node_names(output_tensor_names_list))
+    return reference_outputs
+
+  def _get_final_outputs(self, output_tensor_names_list):
+    self.final_graph_def = strip_pruning_vars_lib.strip_pruning_vars_fn(
+        self.initial_graph_def, _get_node_names(output_tensor_names_list))
+    _ = importer.import_graph_def(self.final_graph_def, name="final")
+
+    with self.test_session(self.final_graph) as sess2:
+      final_outputs = self._get_outputs(
+          sess2,
+          self.final_graph,
+          output_tensor_names_list,
+          graph_prefix="final")
+    return final_outputs
+
+  def _check_removal_of_pruning_vars(self, number_masked_layers):
+    self.assertEqual(
+        _get_number_pruning_vars(self.initial_graph_def), number_masked_layers)
+    self.assertEqual(_get_number_pruning_vars(self.final_graph_def), 0)
+
+  def _check_output_equivalence(self, initial_outputs, final_outputs):
+    for initial_output, final_output in zip(initial_outputs, final_outputs):
+      self.assertAllEqual(initial_output, final_output)
+
+  def testConvolutionalModel(self):
+    with self.initial_graph.as_default():
+      number_masked_conv_layers = 5
+      top_layer = self._build_convolutional_model(number_masked_conv_layers)
+      output_tensor_names = [top_layer.name]
+      initial_outputs = self._get_initial_outputs(output_tensor_names)
+
+    # Remove pruning-related nodes.
+    with self.final_graph.as_default():
+      final_outputs = self._get_final_outputs(output_tensor_names)
+
+    # Check that the final graph has no pruning-related vars
+    self._check_removal_of_pruning_vars(number_masked_conv_layers)
+
+    # Check that outputs remain the same after removal of pruning-related nodes
+    self._check_output_equivalence(initial_outputs, final_outputs)
+
+  def testFullyConnectedModel(self):
+    with self.initial_graph.as_default():
+      number_masked_fc_layers = 3
+      top_layer = self._build_fully_connected_model(number_masked_fc_layers)
+      output_tensor_names = [top_layer.name]
+      initial_outputs = self._get_initial_outputs(output_tensor_names)
+
+    # Remove pruning-related nodes.
+    with self.final_graph.as_default():
+      final_outputs = self._get_final_outputs(output_tensor_names)
+
+    # Check that the final graph has no pruning-related vars
+    self._check_removal_of_pruning_vars(number_masked_fc_layers)
+
+    # Check that outputs remain the same after removal of pruning-related nodes
+    self._check_output_equivalence(initial_outputs, final_outputs)
+
+  def testLSTMModel(self):
+    with self.initial_graph.as_default():
+      number_masked_lstm_layers = 2
+      outputs = self._build_lstm_model(number_masked_lstm_layers)
+      output_tensor_names = [outputs[0][0].name]
+      initial_outputs = self._get_initial_outputs(output_tensor_names)
+
+    # Remove pruning-related nodes.
+    with self.final_graph.as_default():
+      final_outputs = self._get_final_outputs(output_tensor_names)
+
+    # Check that the final graph has no pruning-related vars
+    self._check_removal_of_pruning_vars(number_masked_lstm_layers)
+
+    # Check that outputs remain the same after removal of pruning-related nodes
+    self._check_output_equivalence(initial_outputs, final_outputs)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/nccl/kernels/nccl_manager.h b/tensorflow/contrib/nccl/kernels/nccl_manager.h
index 57a96c5..09fad35 100644
--- a/tensorflow/contrib/nccl/kernels/nccl_manager.h
+++ b/tensorflow/contrib/nccl/kernels/nccl_manager.h
@@ -20,6 +20,13 @@
 #include <unordered_map>
 #include <vector>
 
+// TODO(rmlarsen): Get rid of this workaround. "gpu_assert" is defined when
+// setting EIGEN_USE_THREADS. But when defining EIGEN_USE_THREADS here,
+// incAtomic and other CUDA specific symbols are no longer recognized.
+#ifndef gpu_assert
+#define gpu_assert(x)
+#endif
+
 #include "third_party/nccl/nccl.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
 #include "tensorflow/core/framework/tensor.h"
diff --git a/tensorflow/contrib/nn/python/ops/alpha_dropout.py b/tensorflow/contrib/nn/python/ops/alpha_dropout.py
index 2f92d05..98f4264 100644
--- a/tensorflow/contrib/nn/python/ops/alpha_dropout.py
+++ b/tensorflow/contrib/nn/python/ops/alpha_dropout.py
@@ -43,7 +43,7 @@
     noise_shape: A 1-D `Tensor` of type `int32`, representing the
       shape for randomly generated keep/drop flags.
     seed: A Python integer. Used to create random seeds. See
-      @{tf.set_random_seed} for behavior.
+      `tf.set_random_seed` for behavior.
     name: A name for this operation (optional).
 
   Returns:
diff --git a/tensorflow/contrib/nn/python/ops/sampling_ops.py b/tensorflow/contrib/nn/python/ops/sampling_ops.py
index e659256..de71b08 100644
--- a/tensorflow/contrib/nn/python/ops/sampling_ops.py
+++ b/tensorflow/contrib/nn/python/ops/sampling_ops.py
@@ -123,15 +123,15 @@
   """Computes softmax loss using rank-based adaptive resampling.
 
   This has been shown to improve rank loss after training compared to
-  @{tf.nn.sampled_softmax_loss}. For a description of the algorithm and some
+  `tf.nn.sampled_softmax_loss`. For a description of the algorithm and some
   experimental results, please see: [TAPAS: Two-pass Approximate Adaptive
   Sampling for Softmax](https://arxiv.org/abs/1707.03073).
 
   Sampling follows two phases:
   * In the first phase, `num_sampled` classes are selected using
-    @{tf.nn.learned_unigram_candidate_sampler} or supplied `sampled_values`.
+    `tf.nn.learned_unigram_candidate_sampler` or supplied `sampled_values`.
     The logits are calculated on those sampled classes. This phases is
-    similar to @{tf.nn.sampled_softmax_loss}.
+    similar to `tf.nn.sampled_softmax_loss`.
   * In the second phase, the `num_resampled` classes with highest predicted
     probability are kept. Probabilities are
     `LogSumExp(logits / resampling_temperature)`, where the sum is over
@@ -142,7 +142,7 @@
   picks more candidates close to the predicted classes. A common strategy is
   to decrease the temperature as training proceeds.
 
-  See @{tf.nn.sampled_softmax_loss} for more documentation on sampling and
+  See `tf.nn.sampled_softmax_loss` for more documentation on sampling and
   for typical default values for some of the parameters.
 
   This operation is for training only. It is generally an underestimate of
@@ -197,7 +197,7 @@
         where a sampled class equals one of the target classes.
     partition_strategy: A string specifying the partitioning strategy, relevant
         if `len(weights) > 1`. Currently `"div"` and `"mod"` are supported.
-        See @{tf.nn.embedding_lookup} for more details.
+        See `tf.nn.embedding_lookup` for more details.
     name: A name for the operation (optional).
 
   Returns:
diff --git a/tensorflow/contrib/opt/BUILD b/tensorflow/contrib/opt/BUILD
index 280d4a5..5319a8b 100644
--- a/tensorflow/contrib/opt/BUILD
+++ b/tensorflow/contrib/opt/BUILD
@@ -20,6 +20,7 @@
         "python/training/elastic_average_optimizer.py",
         "python/training/external_optimizer.py",
         "python/training/ggt.py",
+        "python/training/lars_optimizer.py",
         "python/training/lazy_adam_optimizer.py",
         "python/training/model_average_optimizer.py",
         "python/training/moving_average_optimizer.py",
@@ -348,6 +349,7 @@
 
 py_test(
     name = "shampoo_test",
+    size = "large",
     srcs = ["python/training/shampoo_test.py"],
     srcs_version = "PY2AND3",
     deps = [
@@ -361,5 +363,21 @@
         "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:variables",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "lars_optimizer_test",
+    srcs = ["python/training/lars_optimizer_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":opt_py",
+        "//tensorflow/python:client",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:variables",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
     ],
 )
diff --git a/tensorflow/contrib/opt/__init__.py b/tensorflow/contrib/opt/__init__.py
index 9471fb0..781621d 100644
--- a/tensorflow/contrib/opt/__init__.py
+++ b/tensorflow/contrib/opt/__init__.py
@@ -24,6 +24,7 @@
 from tensorflow.contrib.opt.python.training.drop_stale_gradient_optimizer import *
 from tensorflow.contrib.opt.python.training.elastic_average_optimizer import *
 from tensorflow.contrib.opt.python.training.external_optimizer import *
+from tensorflow.contrib.opt.python.training.lars_optimizer import *
 from tensorflow.contrib.opt.python.training.ggt import *
 from tensorflow.contrib.opt.python.training.lazy_adam_optimizer import *
 from tensorflow.contrib.opt.python.training.model_average_optimizer import *
@@ -46,6 +47,7 @@
     'DelayCompensatedGradientDescentOptimizer',
     'DropStaleGradientOptimizer',
     'ExternalOptimizerInterface',
+    'LARSOptimizer',
     'LazyAdamOptimizer',
     'NadamOptimizer',
     'MovingAverageOptimizer',
diff --git a/tensorflow/contrib/opt/python/training/lars_optimizer.py b/tensorflow/contrib/opt/python/training/lars_optimizer.py
new file mode 100644
index 0000000..a8dafd9
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/lars_optimizer.py
@@ -0,0 +1,164 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Layer-wise Adaptive Rate Scaling optimizer for large-batch training."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.training import optimizer
+from tensorflow.python.training import training_ops
+
+
+class LARSOptimizer(optimizer.Optimizer):
+  """Layer-wise Adaptive Rate Scaling for large batch training.
+
+  Introduced by "Large Batch Training of Convolutional Networks" by Y. You,
+  I. Gitman, and B. Ginsburg. (https://arxiv.org/abs/1708.03888)
+
+  Implements the LARS learning rate scheme presented in the paper above. This
+  optimizer is useful when scaling the batch size up to 32K without
+  significant performance degradation. It is recommended to use the optimizer
+  in conjunction with:
+      - Gradual learning rate warm-up
+      - Linear learning rate scaling
+      - Poly rule learning rate decay
+
+  Note, LARS scaling is currently only enabled for dense tensors. Sparse tensors
+  use the default momentum optimizer.
+  """
+
+  def __init__(
+      self,
+      learning_rate,
+      momentum=0.9,
+      weight_decay=0.0001,
+      # The LARS coefficient is a hyperparameter
+      eeta=0.001,
+      epsilon=0.0,
+      name="LARSOptimizer",
+      # Enable skipping variables from LARS scaling.
+      # TODO(sameerkm): Enable a direct mechanism to pass a
+      # subset of variables to the optimizer.
+      skip_list=None,
+      use_nesterov=False):
+    """Construct a new LARS Optimizer.
+
+    Args:
+      learning_rate: A `Tensor` or floating point value. The base learning rate.
+      momentum: A floating point value. Momentum hyperparameter.
+      weight_decay: A floating point value. Weight decay hyperparameter.
+      eeta: LARS coefficient as used in the paper. Default set to LARS
+        coefficient from the paper. (eeta / weight_decay) determines the highest
+        scaling factor in LARS.
+      epsilon: Optional epsilon parameter to be set in models that have very
+        small gradients. Default set to 0.0.
+      name: Optional name prefix for variables and ops created by LARSOptimizer.
+      skip_list: List of strings to enable skipping variables from LARS scaling.
+        If any of the strings in skip_list is a subset of var.name, variable
+        'var' is skipped from LARS scaling. For a typical classification model
+        with batch normalization, the skip_list is ['batch_normalization',
+        'bias']
+      use_nesterov: when set to True, Nesterov momentum will be enabled.
+
+    Raises:
+      ValueError: If a hyperparameter is set to a non-sensical value.
+    """
+    if momentum < 0.0:
+      raise ValueError("momentum should be positive: %s" % momentum)
+    if weight_decay < 0.0:
+      raise ValueError("weight_decay should be positive: %s" % weight_decay)
+    super(LARSOptimizer, self).__init__(use_locking=False, name=name)
+
+    self._learning_rate = learning_rate
+    self._momentum = momentum
+    self._weight_decay = weight_decay
+    self._eeta = eeta
+    self._epsilon = epsilon
+    self._name = name
+    self._skip_list = skip_list
+    self._use_nesterov = use_nesterov
+
+  def _create_slots(self, var_list):
+    for v in var_list:
+      self._zeros_slot(v, "momentum", self._name)
+
+  def compute_lr(self, grad, var):
+    scaled_lr = self._learning_rate
+    if self._skip_list is None or not any(v in var.name
+                                          for v in self._skip_list):
+      w_norm = linalg_ops.norm(var, ord=2)
+      g_norm = linalg_ops.norm(grad, ord=2)
+      trust_ratio = array_ops.where(
+          math_ops.greater(w_norm, 0),
+          array_ops.where(
+              math_ops.greater(g_norm, 0),
+              (self._eeta * w_norm /
+               (g_norm + self._weight_decay * w_norm + self._epsilon)), 1.0),
+          1.0)
+      scaled_lr = self._learning_rate * trust_ratio
+    return scaled_lr
+
+  def _apply_dense(self, grad, var):
+    scaled_lr = self.compute_lr(grad, var)
+    mom = self.get_slot(var, "momentum")
+    return training_ops.apply_momentum(
+        var,
+        mom,
+        scaled_lr,
+        grad,
+        self._momentum,
+        use_locking=False,
+        use_nesterov=self._use_nesterov)
+
+  def _resource_apply_dense(self, grad, var):
+    scaled_lr = self.compute_lr(grad, var)
+    mom = self.get_slot(var, "momentum")
+    return training_ops.resource_apply_momentum(
+        var.handle,
+        mom.handle,
+        scaled_lr,
+        grad,
+        self._momentum,
+        use_locking=False,
+        use_nesterov=self._use_nesterov)
+
+  # Fallback to momentum optimizer for sparse tensors
+  def _apply_sparse(self, grad, var):
+    mom = self.get_slot(var, "momentum")
+    return training_ops.sparse_apply_momentum(
+        var,
+        mom,
+        math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
+        grad.values,
+        grad.indices,
+        math_ops.cast(self._momentum_tensor, var.dtype.base_dtype),
+        use_locking=self._use_locking,
+        use_nesterov=self._use_nesterov).op
+
+  def _resource_apply_sparse(self, grad, var, indices):
+    mom = self.get_slot(var, "momentum")
+    return training_ops.resource_sparse_apply_momentum(
+        var.handle,
+        mom.handle,
+        math_ops.cast(self._learning_rate_tensor, grad.dtype),
+        grad,
+        indices,
+        math_ops.cast(self._momentum_tensor, grad.dtype),
+        use_locking=self._use_locking,
+        use_nesterov=self._use_nesterov)
diff --git a/tensorflow/contrib/opt/python/training/lars_optimizer_test.py b/tensorflow/contrib/opt/python/training/lars_optimizer_test.py
new file mode 100644
index 0000000..d94249b
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/lars_optimizer_test.py
@@ -0,0 +1,127 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Test for Layer-wise Adaptive Rate Scaling optimizer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.opt.python.training import lars_optimizer as lo
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class LARSOptimizerTest(test.TestCase):
+
+  def testLARSGradientOneStep(self):
+    for _ in range(10):
+      for dtype in [dtypes.float32, dtypes.float64]:
+        with self.test_session() as sess:
+          shape = [3, 3]
+          var_np = np.ones(shape)
+          grad_np = np.ones(shape)
+          lr_np = 0.1
+          m_np = 0.9
+          wd_np = 0.1
+          ep_np = 1e-5
+          eeta = 0.1
+          vel_np = np.zeros(shape)
+
+          var = variables.Variable(var_np, dtype=dtype)
+          grad = variables.Variable(grad_np, dtype=dtype)
+          opt = lo.LARSOptimizer(
+              learning_rate=lr_np,
+              momentum=m_np,
+              weight_decay=wd_np,
+              eeta=eeta,
+              epsilon=ep_np)
+
+          step = opt.apply_gradients([(grad, var)])
+          variables.global_variables_initializer().run()
+
+          pre_var = sess.run(var)
+          pre_vel = sess.run(opt.get_slot(var, 'momentum'))
+          self.assertAllClose(var_np, pre_var)
+          self.assertAllClose(vel_np, pre_vel)
+
+          step.run()
+          post_var = sess.run(var)
+          post_vel = sess.run(opt.get_slot(var, 'momentum'))
+
+          w_norm = np.linalg.norm(var_np.flatten(), ord=2)
+          g_norm = np.linalg.norm(grad_np.flatten(), ord=2)
+          trust_ratio = eeta * w_norm / (g_norm + wd_np * w_norm + ep_np)
+          scaled_lr = lr_np * trust_ratio
+
+          vel_np = m_np * vel_np + grad_np
+          var_np -= scaled_lr * vel_np
+
+          self.assertAllClose(var_np, post_var)
+          self.assertAllClose(vel_np, post_vel)
+
+  def testLARSGradientMultiStep(self):
+    for _ in range(10):
+      for dtype in [dtypes.float32, dtypes.float64]:
+        with self.test_session() as sess:
+          shape = [3, 3]
+          var_np = np.ones(shape)
+          grad_np = np.ones(shape)
+          lr_np = 0.1
+          m_np = 0.9
+          wd_np = 0.1
+          ep_np = 1e-5
+          eeta = 0.1
+          vel_np = np.zeros(shape)
+
+          var = variables.Variable(var_np, dtype=dtype)
+          grad = variables.Variable(grad_np, dtype=dtype)
+          opt = lo.LARSOptimizer(
+              learning_rate=lr_np,
+              momentum=m_np,
+              eeta=eeta,
+              weight_decay=wd_np,
+              epsilon=ep_np)
+
+          step = opt.apply_gradients([(grad, var)])
+          variables.global_variables_initializer().run()
+
+          pre_var = sess.run(var)
+          pre_vel = sess.run(opt.get_slot(var, 'momentum'))
+          self.assertAllClose(var_np, pre_var)
+          self.assertAllClose(vel_np, pre_vel)
+
+          for _ in range(10):
+            step.run()
+
+            post_var = sess.run(var)
+            post_vel = sess.run(opt.get_slot(var, 'momentum'))
+
+            w_norm = np.linalg.norm(var_np.flatten(), ord=2)
+            g_norm = np.linalg.norm(grad_np.flatten(), ord=2)
+            trust_ratio = eeta * w_norm / (g_norm + wd_np * w_norm + ep_np)
+            scaled_lr = lr_np * trust_ratio
+
+            vel_np = m_np * vel_np + grad_np
+            var_np -= scaled_lr * vel_np
+
+            self.assertAllClose(var_np, post_var)
+            self.assertAllClose(vel_np, post_vel)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/opt/python/training/shampoo.py b/tensorflow/contrib/opt/python/training/shampoo.py
index 7afa099..294627f 100644
--- a/tensorflow/contrib/opt/python/training/shampoo.py
+++ b/tensorflow/contrib/opt/python/training/shampoo.py
@@ -66,8 +66,9 @@
   a lambda function that depends on step.
   """
 
-  def __init__(self, global_step=0,
-               max_matrix_size=500,
+  def __init__(self,
+               global_step=0,
+               max_matrix_size=768,
                gbar_decay=0.0,
                gbar_weight=1.0,
                mat_gbar_decay=1.0,
@@ -138,7 +139,7 @@
         shape = np.array(v.get_shape())
         for i, d in enumerate(shape):
           d_tensor = ops.convert_to_tensor(d)
-          if d < self._max_matrix_size:
+          if d <= self._max_matrix_size:
             mat_g_init = array_ops.zeros_like(linalg_ops.eye(d_tensor))
             if self._svd_interval > 1:
               _ = self._get_or_make_slot(v, linalg_ops.eye(d_tensor),
@@ -149,18 +150,27 @@
           _ = self._get_or_make_slot(v, mat_g_init, "Gbar_" + str(i),
                                      self._name)
 
+  def _resource_apply_dense(self, grad, var):
+    return self._apply_dense(grad, var)
+
   def _apply_dense(self, grad, var):
     return self._apply_gradient(grad, var)
 
+  def _resource_apply_sparse(self, grad_values, var, grad_indices):
+    return self._apply_sparse_shared(grad_values, grad_indices, var)
+
   def _apply_sparse(self, grad, var):
-    if var.get_shape()[0] < self._max_matrix_size or self._gbar_decay != 0.0:
+    return self._apply_sparse_shared(grad.values, grad.indices, var)
+
+  def _apply_sparse_shared(self, grad_values, grad_indices, var):
+    if var.get_shape()[0] <= self._max_matrix_size or self._gbar_decay != 0.0:
       # The dimension is small enough, we can make the variable dense and
       # do a dense update
       dense_grad = array_ops.scatter_nd(
-          array_ops.expand_dims(grad.indices, axis=1),
-          grad.values, array_ops.shape(var, out_type=grad.indices.dtype))
+          array_ops.expand_dims(grad_indices, axis=1), grad_values,
+          array_ops.shape(var, out_type=grad_indices.dtype))
       return self._apply_gradient(dense_grad, var)
-    return self._apply_gradient(grad.values, var, grad.indices)
+    return self._apply_gradient(grad_values, var, grad_indices)
 
   def _weighted_average(self, var, weight, weight_t, rest):
     """Computes exponential weighted average: var = weight_t * var + rest.
@@ -304,7 +314,7 @@
       mat_h = math_ops.pow(mat_g + self._epsilon, alpha)
     else:
       damped_mat_g = mat_g + self._epsilon * identity
-      z = (1 - 1/alpha) / (2 * linalg_ops.norm(damped_mat_g, ord=2))
+      z = (1 - 1 / alpha) / (2 * linalg_ops.norm(damped_mat_g))
       # The best value for z is
       # (1 - 1/alpha) * (c_max^{-alpha} - c_min^{-alpha}) /
       #                 (c_max^{1-alpha} - c_min^{1-alpha})
@@ -326,12 +336,13 @@
 
   def _compute_power(self, var, mat_g, mat_g_size, alpha, mat_h_slot_name=None):
     """Just a switch between the iterative power vs svd."""
-    if self._use_iterative_root:
-      return self._compute_power_iter(var, mat_g, mat_g_size, alpha,
-                                      mat_h_slot_name)
-    else:
-      return self._compute_power_svd(var, mat_g, mat_g_size, alpha,
-                                     mat_h_slot_name)
+    with ops.name_scope("matrix_iterative_power"):
+      if self._use_iterative_root:
+        return self._compute_power_iter(var, mat_g, mat_g_size, alpha,
+                                        mat_h_slot_name)
+      else:
+        return self._compute_power_svd(var, mat_g, mat_g_size, alpha,
+                                       mat_h_slot_name)
 
   def _apply_gradient(self, grad, var, indices=None):
     """The main function to update a variable.
@@ -397,7 +408,7 @@
     for i, mat_g in enumerate(mat_g_list):
       # axes is the list of indices to reduce - everything but the current i.
       axes = list(range(i)) + list(range(i+1, v_rank))
-      if shape[i] < self._max_matrix_size:
+      if shape[i] <= self._max_matrix_size:
         # If the tensor size is sufficiently small perform full Shampoo update
         # Note if precond_update_interval > 1 and mat_gbar_decay_t != 1, this
         # is not strictly correct. However we will use it for now, and
@@ -455,8 +466,8 @@
     # Update the variable based on the Shampoo update
     learning_rate_t = GetParam(self._learning_rate, global_step)
     if indices is not None:
-      var_updated = state_ops.scatter_sub(var, indices,
-                                          learning_rate_t * preconditioned_grad)
+      var_updated = state_ops.scatter_add(
+          var, indices, -learning_rate_t * preconditioned_grad)
     else:
       var_updated = state_ops.assign_sub(var,
                                          learning_rate_t * preconditioned_grad)
diff --git a/tensorflow/contrib/opt/python/training/shampoo_test.py b/tensorflow/contrib/opt/python/training/shampoo_test.py
index 3148d02..2e0a202 100644
--- a/tensorflow/contrib/opt/python/training/shampoo_test.py
+++ b/tensorflow/contrib/opt/python/training/shampoo_test.py
@@ -19,6 +19,7 @@
 from __future__ import division
 from __future__ import print_function
 
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.contrib.opt.python.training import shampoo
@@ -40,9 +41,10 @@
   return np.dot(np.dot(mat_u, np.diag(diag_d)), mat_v)
 
 
-class ShampooTest(test.TestCase):
+class ShampooTest(test.TestCase, parameterized.TestCase):
 
-  def testBasicVector(self):
+  @parameterized.named_parameters(('Var', False), ('ResourceVar', True))
+  def testBasicVector(self, use_resource_var):
     """Similar to the full Adagrad update."""
 
     size = 20
@@ -51,8 +53,10 @@
     grad_np_2 = np.random.rand(size)
 
     with self.test_session() as sess:
-      global_step = variables.Variable(0, dtype=dtypes.int64)
-      var = variables.Variable(init_var_np, dtype=dtypes.float32)
+      global_step = variables.Variable(
+          0, dtype=dtypes.int64, use_resource=use_resource_var)
+      var = variables.Variable(
+          init_var_np, dtype=dtypes.float32, use_resource=use_resource_var)
       grad = constant_op.constant(grad_np, dtype=dtypes.float32)
       grad_2 = constant_op.constant(grad_np_2, dtype=dtypes.float32)
 
@@ -91,7 +95,8 @@
       self.assertAllCloseAccordingToType(new_val_np, new_val,
                                          atol=TOLERANCE, rtol=TOLERANCE)
 
-  def testBasicMatrix(self):
+  @parameterized.named_parameters(('Var', False), ('ResourceVar', True))
+  def testBasicMatrix(self, use_resource_var):
     """Check update when gradient is a matrix."""
     size = [10, 5]
     init_var_np = np.zeros(size)
@@ -99,8 +104,10 @@
     grad_np_2 = np.random.rand(size[0], size[1])
 
     with self.test_session() as sess:
-      global_step = variables.Variable(0, dtype=dtypes.int64)
-      var = variables.Variable(init_var_np, dtype=dtypes.float32)
+      global_step = variables.Variable(
+          0, dtype=dtypes.int64, use_resource=use_resource_var)
+      var = variables.Variable(
+          init_var_np, dtype=dtypes.float32, use_resource=use_resource_var)
       grad = constant_op.constant(grad_np, dtype=dtypes.float32)
       grad_2 = constant_op.constant(grad_np_2, dtype=dtypes.float32)
 
@@ -143,16 +150,23 @@
       self.assertAllCloseAccordingToType(new_val_np, new_val,
                                          atol=TOLERANCE, rtol=TOLERANCE)
 
-  def _testBasicTensor(self, use_iterative_root):
-    """Check update when gradient is a tensor."""
+  def _testBasicTensor(self, use_iterative_root, use_resource_var):
+    """Check update when gradient is a tensor.
+
+    Args:
+      use_iterative_root: use iterative power method or SVD to find nth roots.
+      use_resource_var: use resource var as variables.
+    """
     size = [10, 5, 7]
     init_var_np = np.zeros(size)
     grad_np = np.random.rand(size[0], size[1], size[2])
     grad_np_2 = np.random.rand(size[0], size[1], size[2])
 
     with self.test_session() as sess:
-      global_step = variables.Variable(0, dtype=dtypes.int64)
-      var = variables.Variable(init_var_np, dtype=dtypes.float32)
+      global_step = variables.Variable(
+          0, dtype=dtypes.int64, use_resource=use_resource_var)
+      var = variables.Variable(
+          init_var_np, dtype=dtypes.float32, use_resource=use_resource_var)
       grad = constant_op.constant(grad_np, dtype=dtypes.float32)
       grad_2 = constant_op.constant(grad_np_2, dtype=dtypes.float32)
 
@@ -208,11 +222,17 @@
       self.assertAllCloseAccordingToType(new_val_np, new_val,
                                          atol=TOLERANCE, rtol=TOLERANCE)
 
-  def testBasicTensor(self):
-    for use_iterative_root in [True, False]:
-      self._testBasicTensor(use_iterative_root)
+  @parameterized.named_parameters(
+      ('SVDWithVar', False, False),
+      ('SVDWithResourceVar', False, True),
+      ('IterRootWithVar', True, False),
+      ('IterRootWithResourceVar', True, True),
+  )
+  def testBasicTensor(self, use_iterative_root, use_resource_var):
+    self._testBasicTensor(use_iterative_root, use_resource_var)
 
-  def testLargeVector(self):
+  @parameterized.named_parameters(('Var', False), ('ResourceVar', True))
+  def testLargeVector(self, use_resource_var):
     """This is just the diagonal Adagrad update."""
 
     size = 2000
@@ -221,8 +241,10 @@
     grad_np_2 = np.random.rand(size)
 
     with self.test_session() as sess:
-      global_step = variables.Variable(0, dtype=dtypes.int64)
-      var = variables.Variable(init_var_np, dtype=dtypes.float32)
+      global_step = variables.Variable(
+          0, dtype=dtypes.int64, use_resource=use_resource_var)
+      var = variables.Variable(
+          init_var_np, dtype=dtypes.float32, use_resource=use_resource_var)
       grad = constant_op.constant(grad_np, dtype=dtypes.float32)
       grad_2 = constant_op.constant(grad_np_2, dtype=dtypes.float32)
 
@@ -257,10 +279,14 @@
 
       self.assertAllCloseAccordingToType(new_val_np, new_val)
 
-  def testLargeMatrix(self):
+  @parameterized.named_parameters(('Var', False), ('ResourceVar', True))
+  def testLargeMatrix(self, use_resource_var):
     """Gradient is a matrix, one of whose dimensions is large.
 
     We do diagonal updates for large dimensions.
+
+    Args:
+      use_resource_var: use resource var as variables.
     """
 
     size = [2000, 3]
@@ -269,8 +295,10 @@
     grad_np_2 = np.random.rand(size[0], size[1])
 
     with self.test_session() as sess:
-      global_step = variables.Variable(0, dtype=dtypes.int64)
-      var = variables.Variable(init_var_np, dtype=dtypes.float32)
+      global_step = variables.Variable(
+          0, dtype=dtypes.int64, use_resource=use_resource_var)
+      var = variables.Variable(
+          init_var_np, dtype=dtypes.float32, use_resource=use_resource_var)
       grad = constant_op.constant(grad_np, dtype=dtypes.float32)
       grad_2 = constant_op.constant(grad_np_2, dtype=dtypes.float32)
 
@@ -316,12 +344,15 @@
       self.assertAllCloseAccordingToType(new_val_np, new_val,
                                          atol=TOLERANCE, rtol=TOLERANCE)
 
-  def testSparseUpdateLarge(self):
+  @parameterized.named_parameters(('Var', False))
+  def testSparseUpdateLarge(self, use_resource_var):
     """Check update when gradient is of type IndexSlices.
 
     We do diagonal updates for the first dimension, unless it is very small.
-    """
 
+    Args:
+      use_resource_var: use resource var as variables.
+    """
     size = [2000, 3]
     sample_size_1 = 100
     init_var_np = np.zeros(size)
@@ -335,8 +366,10 @@
     grad_np_2 = np.random.rand(sample_size_2, size[1])
 
     with self.test_session() as sess:
-      global_step = variables.Variable(0, dtype=dtypes.int64)
-      var = variables.Variable(init_var_np, dtype=dtypes.float32)
+      global_step = variables.Variable(
+          0, dtype=dtypes.int64, use_resource=use_resource_var)
+      var = variables.Variable(
+          init_var_np, dtype=dtypes.float32, use_resource=use_resource_var)
       grad = ops.IndexedSlices(
           constant_op.constant(grad_np, dtype=dtypes.float32),
           constant_op.constant(grad_indices),
@@ -395,13 +428,14 @@
       self.assertAllCloseAccordingToType(new_val_np, new_val,
                                          atol=TOLERANCE, rtol=TOLERANCE)
 
-  def _testSparseUpdateSmall(self, use_iterative_root):
+  def _testSparseUpdateSmall(self, use_iterative_root, use_resource_var):
     """Gradient is of type IndexSlices, but the first dimension is small.
 
     We create dense gradient and do the full update with SVD etc.
 
     Args:
       use_iterative_root: use iterative power method or SVD to find nth roots.
+      use_resource_var: use resource var as variables.
     """
 
     size = [100, 3, 5]
@@ -412,8 +446,10 @@
     grad_np = np.random.rand(sample_size, size[1], size[2])
 
     with self.test_session() as sess:
-      global_step = variables.Variable(0, dtype=dtypes.int64)
-      var = variables.Variable(init_var_np, dtype=dtypes.float32)
+      global_step = variables.Variable(
+          0, dtype=dtypes.int64, use_resource=use_resource_var)
+      var = variables.Variable(
+          init_var_np, dtype=dtypes.float32, use_resource=use_resource_var)
       grad = ops.IndexedSlices(
           constant_op.constant(grad_np, dtype=dtypes.float32),
           constant_op.constant(grad_indices),
@@ -453,15 +489,21 @@
       self.assertAllCloseAccordingToType(new_val_np, new_val,
                                          atol=TOLERANCE, rtol=TOLERANCE)
 
-  def testSparseUpdateSmall(self):
-    for use_iterative_root in [True, False]:
-      self._testSparseUpdateSmall(use_iterative_root)
+  @parameterized.named_parameters(
+      ('SVDWithVar', False, False),
+      ('SVDWithResourceVar', False, True),
+      ('IterRootWithVar', True, False),
+      ('IterRootWithResourceVar', True, True),
+  )
+  def testSparseUpdateSmall(self, use_iterative_root, use_resource_var):
+    self._testSparseUpdateSmall(use_iterative_root, use_resource_var)
 
-  def _testBasicTensorWithMomentum(self, use_iterative_root):
+  def _testBasicTensorWithMomentum(self, use_iterative_root, use_resource_var):
     """Check update with momentum when gradient is a tensor.
 
     Args:
       use_iterative_root: use iterative power method or SVD to find nth roots.
+      use_resource_var: use resource var as variables.
     """
     size = [10, 5, 7]
     init_var_np = np.zeros(size)
@@ -471,8 +513,10 @@
     gbar_weight = 0.1
 
     with self.test_session() as sess:
-      global_step = variables.Variable(0, dtype=dtypes.int64)
-      var = variables.Variable(init_var_np, dtype=dtypes.float32)
+      global_step = variables.Variable(
+          0, dtype=dtypes.int64, use_resource=use_resource_var)
+      var = variables.Variable(
+          init_var_np, dtype=dtypes.float32, use_resource=use_resource_var)
       grad = constant_op.constant(grad_np, dtype=dtypes.float32)
       grad_2 = constant_op.constant(grad_np_2, dtype=dtypes.float32)
 
@@ -528,15 +572,21 @@
       self.assertAllCloseAccordingToType(new_val_np, new_val,
                                          atol=TOLERANCE, rtol=TOLERANCE)
 
-  def testBasicTensorWithMomentum(self):
-    for use_iterative_root in [True, False]:
-      self._testBasicTensorWithMomentum(use_iterative_root)
+  @parameterized.named_parameters(
+      ('SVDWithVar', False, False),
+      ('SVDWithResourceVar', False, True),
+      ('IterRootWithVar', True, False),
+      ('IterRootWithResourceVar', True, True),
+  )
+  def testBasicTensorWithMomentum(self, use_iterative_root, use_resource_var):
+    self._testBasicTensorWithMomentum(use_iterative_root, use_resource_var)
 
-  def _testDelayedSVD(self, use_iterative_root):
+  def _testDelayedSVD(self, use_iterative_root, use_resource_var):
     """Performing the SVD every nth step.
 
     Args:
       use_iterative_root: use iterative power method or SVD to find nth roots.
+      use_resource_var: use resource var as variables.
     """
     size = [10, 5, 7]
     init_var_np = np.zeros(size).astype(np.float32)
@@ -552,8 +602,10 @@
     mat_g3 = np.zeros_like(mat_g3_a)
 
     with self.test_session() as sess:
-      global_step = variables.Variable(0, dtype=dtypes.int64)
-      var = variables.Variable(init_var_np, dtype=dtypes.float32)
+      global_step = variables.Variable(
+          0, dtype=dtypes.int64, use_resource=use_resource_var)
+      var = variables.Variable(
+          init_var_np, dtype=dtypes.float32, use_resource=use_resource_var)
       grad = array_ops.placeholder(dtypes.float32, shape=size)
 
       opt = shampoo.ShampooOptimizer(global_step, svd_interval=svd_interval,
@@ -590,15 +642,21 @@
         self.assertAllCloseAccordingToType(new_val_np, new_val,
                                            atol=TOLERANCE, rtol=TOLERANCE)
 
-  def testDelayedSVD(self):
-    for use_iterative_root in [True, False]:
-      self._testDelayedSVD(use_iterative_root)
+  @parameterized.named_parameters(
+      ('SVDWithVar', False, False),
+      ('SVDWithResourceVar', False, True),
+      ('IterRootWithVar', True, False),
+      ('IterRootWithResourceVar', True, True),
+  )
+  def testDelayedSVD(self, use_iterative_root, use_resource_var):
+    self._testDelayedSVD(use_iterative_root, use_resource_var)
 
-  def _testDelayedPrecondUpdate(self, use_iterative_root):
+  def _testDelayedPrecondUpdate(self, use_iterative_root, use_resource_var):
     """Update the squared sum every nth step, drop the other steps.
 
     Args:
       use_iterative_root: use iterative power method or SVD to find nth roots.
+      use_resource_var: use resource var as variables.
     """
     size = [10, 5, 7]
     init_var_np = np.zeros(size).astype(np.float32)
@@ -615,8 +673,10 @@
     mat_g3 = np.zeros_like(mat_g3_a)
 
     with self.test_session() as sess:
-      global_step = variables.Variable(0, dtype=dtypes.int64)
-      var = variables.Variable(init_var_np, dtype=dtypes.float32)
+      global_step = variables.Variable(
+          0, dtype=dtypes.int64, use_resource=use_resource_var)
+      var = variables.Variable(
+          init_var_np, dtype=dtypes.float32, use_resource=use_resource_var)
       grad = array_ops.placeholder(dtypes.float32, shape=size)
 
       opt = shampoo.ShampooOptimizer(
@@ -660,9 +720,14 @@
         self.assertAllCloseAccordingToType(new_val_np, new_val,
                                            atol=TOLERANCE, rtol=TOLERANCE)
 
-  def testDelayedPrecondUpdate(self):
-    for use_iterative_root in [True, False]:
-      self._testDelayedPrecondUpdate(use_iterative_root)
+  @parameterized.named_parameters(
+      ('SVDWithVar', False, False),
+      ('SVDWithResourceVar', False, True),
+      ('IterRootWithVar', True, False),
+      ('IterRootWithResourceVar', True, True),
+  )
+  def testDelayedPrecondUpdate(self, use_iterative_root, use_resource_var):
+    self._testDelayedPrecondUpdate(use_iterative_root, use_resource_var)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/optimizer_v2/optimizer_v2.py b/tensorflow/contrib/optimizer_v2/optimizer_v2.py
index 8c11d8b..f6ecaba 100644
--- a/tensorflow/contrib/optimizer_v2/optimizer_v2.py
+++ b/tensorflow/contrib/optimizer_v2/optimizer_v2.py
@@ -34,6 +34,7 @@
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.training import optimizer as optimizer_v1
 from tensorflow.python.training import slot_creator
 from tensorflow.python.training.checkpointable import base as checkpointable
@@ -620,7 +621,7 @@
     # Map from graph_key to state for that graph. We use the graph_key
     # since it works in both eager and graph mode, and gives the outer
     # graph inside functions.
-    tower_context = distribute_lib.get_tower_context()
+    tower_context = distribution_strategy_context.get_tower_context()
     if tower_context is None:
       # In a cross-tower context for a DistributionStrategy, which means
       # only one Optimizer will be created, not one per tower.
@@ -769,7 +770,8 @@
               distribute_lib.get_loss_reduction() ==
               variable_scope.VariableAggregation.MEAN)
         if scale_loss_by_num_towers:
-          num_towers = distribute_lib.get_distribution_strategy().num_towers
+          num_towers = distribution_strategy_context.get_distribution_strategy(
+          ).num_towers
           if num_towers > 1:
             loss_value *= 1. / num_towers
 
@@ -788,7 +790,8 @@
           distribute_lib.get_loss_reduction() ==
           variable_scope.VariableAggregation.MEAN)
     if scale_loss_by_num_towers:
-      num_towers = distribute_lib.get_distribution_strategy().num_towers
+      num_towers = distribution_strategy_context.get_distribution_strategy(
+      ).num_towers
       if num_towers > 1:
         loss *= 1. / num_towers
 
@@ -862,7 +865,7 @@
     if not filtered:
       raise ValueError("No gradients provided for any variable: %s." %
                        ([str(v) for _, v in grads_and_vars],))
-    return distribute_lib.get_tower_context().merge_call(
+    return distribution_strategy_context.get_tower_context().merge_call(
         self._distributed_apply, filtered, global_step=global_step, name=name)
 
   def _get_or_create_state(self, var_list=None):
diff --git a/tensorflow/contrib/optimizer_v2/rmsprop.py b/tensorflow/contrib/optimizer_v2/rmsprop.py
index 164ff0e..3de5340 100644
--- a/tensorflow/contrib/optimizer_v2/rmsprop.py
+++ b/tensorflow/contrib/optimizer_v2/rmsprop.py
@@ -22,7 +22,7 @@
 - divide gradient by the root of this average
 
 mean_square = decay * mean_square{t-1} + (1-decay) * gradient ** 2
-mom = momentum * mom{t-1} + learning_rate * g_t / sqrt(mean_square + epsilon)
+mom = momentum * mom{t-1} + learning_rate * g_t / sqrt(mean_square)
 delta = - mom
 
 This implementation of RMSProp uses plain momentum, not Nesterov momentum.
@@ -33,7 +33,7 @@
 mean_grad = decay * mean_square{t-1} + (1-decay) * gradient
 mean_square = decay * mean_square{t-1} + (1-decay) * gradient ** 2
 mom = momentum * mom{t-1} + learning_rate * g_t /
-    sqrt(mean_square - mean_grad**2 + epsilon)
+    sqrt(mean_square - mean_grad**2)
 delta = - mom
 """
 
@@ -43,7 +43,6 @@
 
 from tensorflow.contrib.optimizer_v2 import optimizer_v2
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import init_ops
 
 from tensorflow.python.training import training_ops
 
@@ -87,7 +86,8 @@
       decay: A float hyperparameter. Discounting factor for the history/coming
         gradient.
       momentum: A float hyperparameter.
-      epsilon: A float hyperparameter. Small value to avoid zero denominator.
+      epsilon: A float hyperparameter. Small value to initialize the average
+        square gradient variable and avoid zero denominator.
       use_locking: If True use locks for update operation.
       centered: If True, gradients are normalized by the estimated variance of
         the gradient; if False, by the uncentered second moment. Setting this to
@@ -106,10 +106,8 @@
 
   def _create_vars(self, var_list, state):
     for v in var_list:
-      if v.get_shape().is_fully_defined():
-        init_rms = init_ops.ones_initializer(dtype=v.dtype.base_dtype)
-      else:
-        init_rms = array_ops.ones_like(v)
+      init_rms = state.get_hyper(
+          "epsilon", v.dtype.base_dtype) * array_ops.ones_like(v)
       state.create_slot_with_initializer(v, init_rms, v.get_shape(),
                                          v.dtype.base_dtype, "rms")
       if self._centered:
@@ -129,7 +127,9 @@
           state.get_hyper("learning_rate", var.dtype.base_dtype),
           state.get_hyper("decay", var.dtype.base_dtype),
           state.get_hyper("momentum", var.dtype.base_dtype),
-          state.get_hyper("epsilon", var.dtype.base_dtype),
+          # epsilon is now the rms initial value and is not added to the
+          # denominator anymore, hence calling the kernel op with epsilon=0.
+          0,
           grad,
           use_locking=self._use_locking).op
     else:
@@ -140,7 +140,7 @@
           state.get_hyper("learning_rate", var.dtype.base_dtype),
           state.get_hyper("decay", var.dtype.base_dtype),
           state.get_hyper("momentum", var.dtype.base_dtype),
-          state.get_hyper("epsilon", var.dtype.base_dtype),
+          0,
           grad,
           use_locking=self._use_locking).op
 
@@ -157,7 +157,7 @@
           state.get_hyper("learning_rate", var.dtype.base_dtype),
           state.get_hyper("decay", var.dtype.base_dtype),
           state.get_hyper("momentum", var.dtype.base_dtype),
-          state.get_hyper("epsilon", var.dtype.base_dtype),
+          0,
           grad,
           use_locking=self._use_locking)
     else:
@@ -168,7 +168,7 @@
           state.get_hyper("learning_rate", var.dtype.base_dtype),
           state.get_hyper("decay", var.dtype.base_dtype),
           state.get_hyper("momentum", var.dtype.base_dtype),
-          state.get_hyper("epsilon", var.dtype.base_dtype),
+          0,
           grad,
           use_locking=self._use_locking)
 
@@ -185,7 +185,7 @@
           state.get_hyper("learning_rate", var.dtype.base_dtype),
           state.get_hyper("decay", var.dtype.base_dtype),
           state.get_hyper("momentum", var.dtype.base_dtype),
-          state.get_hyper("epsilon", var.dtype.base_dtype),
+          0,
           grad.values,
           grad.indices,
           use_locking=self._use_locking)
@@ -197,7 +197,7 @@
           state.get_hyper("learning_rate", var.dtype.base_dtype),
           state.get_hyper("decay", var.dtype.base_dtype),
           state.get_hyper("momentum", var.dtype.base_dtype),
-          state.get_hyper("epsilon", var.dtype.base_dtype),
+          0,
           grad.values,
           grad.indices,
           use_locking=self._use_locking)
@@ -215,7 +215,7 @@
           state.get_hyper("learning_rate", var.dtype.base_dtype),
           state.get_hyper("decay", var.dtype.base_dtype),
           state.get_hyper("momentum", var.dtype.base_dtype),
-          state.get_hyper("epsilon", var.dtype.base_dtype),
+          0,
           grad,
           indices,
           use_locking=self._use_locking)
@@ -227,7 +227,7 @@
           state.get_hyper("learning_rate", var.dtype.base_dtype),
           state.get_hyper("decay", var.dtype.base_dtype),
           state.get_hyper("momentum", var.dtype.base_dtype),
-          state.get_hyper("epsilon", var.dtype.base_dtype),
+          0,
           grad,
           indices,
           use_locking=self._use_locking)
diff --git a/tensorflow/contrib/optimizer_v2/rmsprop_test.py b/tensorflow/contrib/optimizer_v2/rmsprop_test.py
index dc23ef2..628d041 100644
--- a/tensorflow/contrib/optimizer_v2/rmsprop_test.py
+++ b/tensorflow/contrib/optimizer_v2/rmsprop_test.py
@@ -39,34 +39,34 @@
 
 _TEST_PARAM_VALUES = [
     # learning_rate, decay, momentum, epsilon, centered, use_resource
-    [0.5, 0.9, 0.0, 1e-3, True, False],
-    [0.5, 0.9, 0.0, 1e-3, False, False],
-    [0.5, 0.9, 0.0, 1e-3, True, True],
-    [0.5, 0.9, 0.0, 1e-3, False, True],
-    [0.1, 0.9, 0.0, 1e-3, True, False],
-    [0.5, 0.95, 0.0, 1e-3, False, False],
-    [0.5, 0.95, 0.0, 1e-5, True, False],
-    [0.5, 0.95, 0.9, 1e-5, True, False],
+    [0.5, 0.9, 0.0, 1.0, True, False],
+    [0.5, 0.9, 0.0, 1.0, False, False],
+    [0.5, 0.9, 0.0, 1.0, True, True],
+    [0.5, 0.9, 0.0, 1.0, False, True],
+    [0.1, 0.9, 0.0, 1.0, True, False],
+    [0.5, 0.95, 0.0, 1.0, False, False],
+    [0.5, 0.8, 0.0, 1e-3, True, False],
+    [0.5, 0.8, 0.9, 1e-3, True, False],
 ]
 
 
 class RMSPropOptimizerTest(test.TestCase, parameterized.TestCase):
 
   def _rmsprop_update_numpy(self, var, g, mg, rms, mom, lr, decay, momentum,
-                            epsilon, centered):
+                            centered):
     rms_t = rms * decay + (1 - decay) * g * g
-    denom_t = rms_t + epsilon
     if centered:
       mg_t = mg * decay + (1 - decay) * g
-      denom_t -= mg_t * mg_t
+      denom_t = rms_t - mg_t * mg_t
     else:
       mg_t = mg
+      denom_t = rms_t
     mom_t = momentum * mom + lr * g / np.sqrt(denom_t, dtype=denom_t.dtype)
     var_t = var - mom_t
     return var_t, mg_t, rms_t, mom_t
 
   def _sparse_rmsprop_update_numpy(self, var, gindexs, gvalues, mg, rms, mom,
-                                   lr, decay, momentum, epsilon, centered):
+                                   lr, decay, momentum, centered):
     mg_t = copy.deepcopy(mg)
     rms_t = copy.deepcopy(rms)
     mom_t = copy.deepcopy(mom)
@@ -75,7 +75,7 @@
       gindex = gindexs[i]
       gvalue = gvalues[i]
       rms_t[gindex] = rms[gindex] * decay + (1 - decay) * gvalue * gvalue
-      denom_t = rms_t[gindex] + epsilon
+      denom_t = rms_t[gindex]
       if centered:
         mg_t[gindex] = mg_t[gindex] * decay + (1 - decay) * gvalue
         denom_t -= mg_t[gindex] * mg_t[gindex]
@@ -129,8 +129,8 @@
 
       mg0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
       mg1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
-      rms0_np = np.array([1.0, 1.0], dtype=dtype.as_numpy_dtype)
-      rms1_np = np.array([1.0, 1.0], dtype=dtype.as_numpy_dtype)
+      rms0_np = np.array([epsilon, epsilon], dtype=dtype.as_numpy_dtype)
+      rms1_np = np.array([epsilon, epsilon], dtype=dtype.as_numpy_dtype)
       mom0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
       mom1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
 
@@ -144,10 +144,10 @@
 
         var0_np, mg0_np, rms0_np, mom0_np = self._rmsprop_update_numpy(
             var0_np, grads0_np, mg0_np, rms0_np, mom0_np, learning_rate,
-            decay, momentum, epsilon, centered)
+            decay, momentum, centered)
         var1_np, mg1_np, rms1_np, mom1_np = self._rmsprop_update_numpy(
             var1_np, grads1_np, mg1_np, rms1_np, mom1_np, learning_rate,
-            decay, momentum, epsilon, centered)
+            decay, momentum, centered)
 
         # Validate updated params
         if centered:
@@ -191,7 +191,7 @@
       loss = pred * pred
       sgd_op = rmsprop.RMSPropOptimizer(
           learning_rate=1.0,
-          decay=0.0,
+          decay=0.1,
           momentum=0.0,
           epsilon=1.0,
           centered=True).minimize(loss)
@@ -202,7 +202,7 @@
       sgd_op.run()
       # Validate updated params
       self.assertAllCloseAccordingToType(
-          [[-111, -138]], var0.eval(), atol=0.01)
+          [[-7/3.0, -4/3.0]], var0.eval(), atol=0.01)
 
   @parameterized.named_parameters(
       *test_util.generate_combinations_with_testcase_name(
@@ -251,8 +251,8 @@
 
       mg0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
       mg1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
-      rms0_np = np.array([1.0, 1.0], dtype=dtype.as_numpy_dtype)
-      rms1_np = np.array([1.0, 1.0], dtype=dtype.as_numpy_dtype)
+      rms0_np = np.array([epsilon, epsilon], dtype=dtype.as_numpy_dtype)
+      rms1_np = np.array([epsilon, epsilon], dtype=dtype.as_numpy_dtype)
       mom0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
       mom1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
 
@@ -266,10 +266,10 @@
 
         var0_np, mg0_np, rms0_np, mom0_np = self._sparse_rmsprop_update_numpy(
             var0_np, grads0_np_indices, grads0_np, mg0_np, rms0_np, mom0_np,
-            learning_rate, decay, momentum, epsilon, centered)
+            learning_rate, decay, momentum, centered)
         var1_np, mg1_np, rms1_np, mom1_np = self._sparse_rmsprop_update_numpy(
             var1_np, grads1_np_indices, grads1_np, mg1_np, rms1_np, mom1_np,
-            learning_rate, decay, momentum, epsilon, centered)
+            learning_rate, decay, momentum, centered)
 
         # Validate updated params
         if centered:
@@ -317,13 +317,13 @@
       # Check the parameters.
       self.assertAllCloseAccordingToType(
           np.array([
-              1.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0)),
-              2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0))
+              1.0 - (0.1 * 2.0 / math.sqrt(0.901)),
+              2.0 - (0.1 * 2.0 / math.sqrt(0.901))
           ]), var0.eval())
       self.assertAllCloseAccordingToType(
           np.array([
-              3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0)),
-              4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0))
+              3.0 - (0.01 * 2.0 / math.sqrt(0.90001)),
+              4.0 - (0.01 * 2.0 / math.sqrt(0.90001))
           ]), var1.eval())
       # Step 2: the root mean square accumulators contain the previous update.
       update.run()
@@ -335,17 +335,17 @@
       # Check the parameters.
       self.assertAllCloseAccordingToType(
           np.array([
-              1.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0)) -
-              (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1.0)),
-              2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0)) -
-              (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1.0))
+              1.0 - (0.1 * 2.0 / math.sqrt(0.901)) -
+              (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001)),
+              2.0 - (0.1 * 2.0 / math.sqrt(0.901)) -
+              (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001))
           ]), var0.eval())
       self.assertAllCloseAccordingToType(
           np.array([
-              3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0)) -
-              (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5 + 1.0)),
-              4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0)) -
-              (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5 + 1.0))
+              3.0 - (0.01 * 2.0 / math.sqrt(0.90001)) -
+              (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5)),
+              4.0 - (0.01 * 2.0 / math.sqrt(0.90001)) -
+              (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5))
           ]), var1.eval())
 
   @parameterized.parameters(_DATA_TYPES)
@@ -357,7 +357,7 @@
       grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
 
       opt = rmsprop.RMSPropOptimizer(
-          learning_rate=2.0, decay=0.9, momentum=0.5, epsilon=1e-5)
+          learning_rate=2.0, decay=0.9, momentum=0.5, epsilon=1.0)
       update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
       variables.global_variables_initializer().run()
 
@@ -383,22 +383,22 @@
           np.array([0.90001, 0.90001]), rms1.eval())
       # Check the momentum accumulators
       self.assertAllCloseAccordingToType(
-          np.array([(0.1 * 2.0 / math.sqrt(0.901 + 1e-5)),
-                    (0.1 * 2.0 / math.sqrt(0.901 + 1e-5))]), mom0.eval())
+          np.array([(0.1 * 2.0 / math.sqrt(0.901)),
+                    (0.1 * 2.0 / math.sqrt(0.901))]), mom0.eval())
       self.assertAllCloseAccordingToType(
-          np.array([(0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)),
-                    (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5))]), mom1.eval())
+          np.array([(0.01 * 2.0 / math.sqrt(0.90001)),
+                    (0.01 * 2.0 / math.sqrt(0.90001))]), mom1.eval())
 
       # Check that the parameters.
       self.assertAllCloseAccordingToType(
           np.array([
-              1.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)),
-              2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5))
+              1.0 - (0.1 * 2.0 / math.sqrt(0.901)),
+              2.0 - (0.1 * 2.0 / math.sqrt(0.901))
           ]), var0.eval())
       self.assertAllCloseAccordingToType(
           np.array([
-              3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)),
-              4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5))
+              3.0 - (0.01 * 2.0 / math.sqrt(0.90001)),
+              4.0 - (0.01 * 2.0 / math.sqrt(0.90001))
           ]), var1.eval())
 
       # Step 2: the root mean square accumulators contain the previous update.
@@ -410,38 +410,38 @@
           np.array([0.90001 * 0.9 + 1e-5, 0.90001 * 0.9 + 1e-5]), rms1.eval())
       self.assertAllCloseAccordingToType(
           np.array([
-              0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
-              (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5)),
-              0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
-              (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5))
+              0.5 * (0.1 * 2.0 / math.sqrt(0.901)) +
+              (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001)),
+              0.5 * (0.1 * 2.0 / math.sqrt(0.901)) +
+              (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001))
           ]), mom0.eval())
       self.assertAllCloseAccordingToType(
           np.array([
-              0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
-              (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5)),
-              0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
-              (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5))
+              0.5 * (0.01 * 2.0 / math.sqrt(0.90001)) +
+              (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5)),
+              0.5 * (0.01 * 2.0 / math.sqrt(0.90001)) +
+              (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5))
           ]), mom1.eval())
 
       # Check the parameters.
       self.assertAllCloseAccordingToType(
           np.array([
-              1.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) -
-              (0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
-               (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5))),
-              2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) -
-              (0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
-               (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5)))
+              1.0 - (0.1 * 2.0 / math.sqrt(0.901)) -
+              (0.5 * (0.1 * 2.0 / math.sqrt(0.901)) +
+               (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001))),
+              2.0 - (0.1 * 2.0 / math.sqrt(0.901)) -
+              (0.5 * (0.1 * 2.0 / math.sqrt(0.901)) +
+               (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001)))
           ]), var0.eval())
 
       self.assertAllCloseAccordingToType(
           np.array([
-              3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) -
-              (0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
-               (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5))),
-              4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) -
-              (0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
-               (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5)))
+              3.0 - (0.01 * 2.0 / math.sqrt(0.90001)) -
+              (0.5 * (0.01 * 2.0 / math.sqrt(0.90001)) +
+               (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5))),
+              4.0 - (0.01 * 2.0 / math.sqrt(0.90001)) -
+              (0.5 * (0.01 * 2.0 / math.sqrt(0.90001)) +
+               (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5)))
           ]), var1.eval())
 
 
diff --git a/tensorflow/contrib/predictor/BUILD b/tensorflow/contrib/predictor/BUILD
index 36e21af..72ea777 100644
--- a/tensorflow/contrib/predictor/BUILD
+++ b/tensorflow/contrib/predictor/BUILD
@@ -60,7 +60,7 @@
         ":base_predictor",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:training",
-        "//tensorflow/python/estimator:model_fn",
+        "//tensorflow/python/estimator:estimator_py",
         "//tensorflow/python/saved_model:signature_constants",
     ],
 )
@@ -90,9 +90,7 @@
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python/estimator",
-        "//tensorflow/python/estimator:export",
-        "//tensorflow/python/estimator:export_output",
-        "//tensorflow/python/estimator:model_fn",
+        "//tensorflow/python/estimator:estimator_py",
         "//tensorflow/python/saved_model:signature_constants",
     ],
 )
diff --git a/tensorflow/contrib/quantize/BUILD b/tensorflow/contrib/quantize/BUILD
index 2336361..499fec4 100644
--- a/tensorflow/contrib/quantize/BUILD
+++ b/tensorflow/contrib/quantize/BUILD
@@ -244,7 +244,9 @@
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:init_ops",
+        "//tensorflow/python:math_ops",
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:platform_test",
+        "//tensorflow/python:training",
     ],
 )
diff --git a/tensorflow/contrib/quantize/python/quant_ops_test.py b/tensorflow/contrib/quantize/python/quant_ops_test.py
index c2a8def..a458400 100644
--- a/tensorflow/contrib/quantize/python/quant_ops_test.py
+++ b/tensorflow/contrib/quantize/python/quant_ops_test.py
@@ -75,7 +75,7 @@
       self.assertGreater(max_value, 0.0)
       self.assertLess(max_value, 1.0)
 
-  def testVariablesNotParitioned_LastValue(self):
+  def testVariablesNotPartitioned_LastValue(self):
     # Variables added should not use a default partiioner since they are
     # scalar. There would be a tensorflow error thrown if the partitioner was
     # respected by the rewrite.
@@ -90,7 +90,7 @@
             is_training=True,
             vars_collection=_MIN_MAX_VARS)
 
-  def testVariablesNotParitioned_MovingAvg(self):
+  def testVariablesNotPartitioned_MovingAvg(self):
     # Variables added should not use a default partiioner since they are
     # scalar. There would be a tensorflow error thrown if the partitioner was
     # respected by the rewrite.
diff --git a/tensorflow/contrib/quantize/python/quantize.py b/tensorflow/contrib/quantize/python/quantize.py
index cb66fd1..2ddbd73 100644
--- a/tensorflow/contrib/quantize/python/quantize.py
+++ b/tensorflow/contrib/quantize/python/quantize.py
@@ -455,6 +455,24 @@
     return self._bias_add_op
 
 
+def _FollowedByFakeQuant(tensor):
+  """Returns True if the tensor is followed by a FakeQuant."""
+  fake_quant_ops = set([
+      'FakeQuantWithMinMaxVars', 'FakeQuantWithMinMaxArgs',
+      'FakeQuantWithMinMaxVarsPerChannel'
+  ])
+  pass_through_ops = set(['Reshape', 'Identity'])
+  consumers = tensor.consumers()
+  while consumers:
+    c = consumers.pop()
+    if c.type in fake_quant_ops:
+      return True
+    elif c.type in pass_through_ops:
+      for output in c.outputs:
+        consumers.extend(output.consumers())
+  return False
+
+
 def _InsertQuantOp(context,
                    name,
                    producer,
@@ -535,11 +553,7 @@
   # Prevent ops from being quantized multiple times. Bypass ops can sometimes
   # overlap between multiple matches, so we need to ensure that we don't
   # add duplicate FakeQuant operations.
-  fake_quant_ops = set([
-      'FakeQuantWithMinMaxVars',
-      'FakeQuantWithMinMaxArgs'
-  ])
-  if fake_quant_ops.intersection(set([c.type for c in inputs.consumers()])):
+  if _FollowedByFakeQuant(inputs):
     return
 
   if moving_avg:
diff --git a/tensorflow/contrib/quantize/python/quantize_graph.py b/tensorflow/contrib/quantize/python/quantize_graph.py
index 2944f96..484493f 100644
--- a/tensorflow/contrib/quantize/python/quantize_graph.py
+++ b/tensorflow/contrib/quantize/python/quantize_graph.py
@@ -59,6 +59,10 @@
 
   if input_graph is None:
     input_graph = ops.get_default_graph()
+
+  # Add a check to see if the graph has training ops; if so, provide an error
+  # message and exit.
+  _check_for_training_ops(input_graph)
   with input_graph.as_default():
     fold_batch_norms.FoldBatchNorms(
         input_graph,
@@ -78,6 +82,9 @@
 
   Variables added by the rewrite get added to the global variables collection.
 
+  This function must be invoked prior to insertion of gradient ops in a graph
+  as quantization should be modeled in both forward and backward passes.
+
   The graph has fake quantization ops inserted to simulate the error
   introduced by quantization. Since the graph is transformed in place,
   the expected behavior of previously held references to nodes and tensors may
@@ -104,7 +111,6 @@
   # Currently the values below are hardcoded for mobilenetV1 on imagenet
   # Please use the experimental API if you need to tune these values.
   freeze_bn_delay = None
-
   _create_graph(
       input_graph=input_graph,
       is_training=True,
@@ -141,6 +147,9 @@
                                        scope=None):
   """Rewrites a training input_graph in place for simulated quantization.
 
+  This function must be invoked prior to insertion of gradient ops in a graph
+  as quantization should be modeled in both forward and backward passes.
+
   Variables added by the rewrite get added to the global variables collection.
 
   This function has additional experimental options not (yet) available to
@@ -226,3 +235,45 @@
       activation_bits=activation_bits,
       quant_delay=quant_delay,
       scope=scope)
+
+
+def _check_for_training_ops(g):
+  """Check if training ops are present in the graph.
+
+  Args:
+   g: The tf.Graph on which the check for training ops needs to be
+   performed.
+
+  Raises:
+    ValueError: If a training op is seen in the graph.
+  """
+
+  # The list here is obtained
+  # from https://www.tensorflow.org/api_docs/cc/group/training-ops
+  training_ops = frozenset([
+      'ApplyAdagrad', 'ApplyAdagradDA', 'ApplyAdam', 'ApplyAddSign',
+      'ApplyCenteredRMSProp', 'ApplyFtrl', 'ApplyFtrlV2',
+      'ApplyGradientDescent', 'ApplyMomentum', 'ApplyPowerSign',
+      'ApplyProximalAdagrad', 'ApplyProximalGradientDescent', 'ApplyRMSProp',
+      'ResourceApplyAdadelta', 'ResourceApplyAdagrad', 'ResourceApplyAdagradDA',
+      'ResourceApplyAdam', 'ResourceApplyAddSign',
+      'ResourceApplyCenteredRMSProp', 'ResourceApplyFtrl',
+      'ResourceApplyFtrlV2', 'ResourceApplyGradientDescent',
+      'ResourceApplyMomentum', 'ResourceApplyPowerSign',
+      'ResourceApplyProximalAdagrad', 'ResourceApplyProximalGradientDescent',
+      'ResourceApplyRMSProp', 'ResourceSparseApplyAdadelta',
+      'ResourceSparseApplyAdagrad', 'ResourceSparseApplyAdagradDA',
+      'ResourceSparseApplyCenteredRMSProp', 'ResourceSparseApplyFtrl',
+      'ResourceSparseApplyFtrlV2', 'ResourceSparseApplyMomentum',
+      'ResourceSparseApplyProximalAdagrad',
+      'ResourceSparseApplyProximalGradientDescent',
+      'ResourceSparseApplyRMSProp', 'SparseApplyAdadelta', 'SparseApplyAdagrad',
+      'SparseApplyAdagradDA', 'SparseApplyCenteredRMSProp', 'SparseApplyFtrl',
+      'SparseApplyFtrlV2', 'SparseApplyMomentum', 'SparseApplyProximalAdagrad',
+      'SparseApplyProximalGradientDescent', 'SparseApplyRMSProp'
+  ])
+
+  op_types = set([op.type for op in g.get_operations()])
+  train_op_list = op_types.intersection(training_ops)
+  if train_op_list:
+    raise ValueError('Training op found in graph, exiting %s' % train_op_list)
diff --git a/tensorflow/contrib/quantize/python/quantize_graph_test.py b/tensorflow/contrib/quantize/python/quantize_graph_test.py
index 54faf58..e80d218 100644
--- a/tensorflow/contrib/quantize/python/quantize_graph_test.py
+++ b/tensorflow/contrib/quantize/python/quantize_graph_test.py
@@ -20,10 +20,12 @@
 
 from tensorflow.contrib.layers.python.layers import layers
 from tensorflow.contrib.quantize.python import quantize_graph
+from tensorflow.python import training
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.platform import googletest
 
@@ -145,6 +147,19 @@
         self.assertTrue(('int64_val: %i' % quant_delay) in const_value)
     self.assertTrue(quant_delay_found)
 
+  def testTrainingOpsCheck(self):
+    self._RunTestOverTrainingRewrites(self._TestTrainingOpsCheck)
+
+  def _TestTrainingOpsCheck(self, rewrite_fn):
+    with ops.Graph().as_default():
+      output = self._ConvLayer()
+      output_scalar = math_ops.reduce_sum(output)
+      loss = math_ops.square(output_scalar - 1)
+      opt = training.gradient_descent.GradientDescentOptimizer(0.0001)
+      opt.minimize(loss)
+      with self.assertRaisesRegexp(ValueError, 'Training op found in graph'):
+        rewrite_fn()
+
   def testWeightBits(self):
     self._RunTestOverExperimentalRewrites(self._TestWeightBits)
 
diff --git a/tensorflow/contrib/quantize/python/quantize_test.py b/tensorflow/contrib/quantize/python/quantize_test.py
index 06ebcdf..212d902 100644
--- a/tensorflow/contrib/quantize/python/quantize_test.py
+++ b/tensorflow/contrib/quantize/python/quantize_test.py
@@ -471,6 +471,60 @@
       self.assertTrue(
           'part/test/test/weights_quant/FakeQuantWithMinMaxVars' in op_names)
 
+  def testSkipReshapeQuantization(self):
+    self._RunTestOverParameters(self._TestSkipReshapeQuantization)
+
+  def _TestSkipReshapeQuantization(self, is_training):
+    graph = ops.Graph()
+    with graph.as_default():
+      batch_size, height, width, depth = 5, 128, 128, 3
+      input1 = array_ops.zeros((batch_size, height, width, depth))
+      conv = conv2d(
+          input1,
+          32, [5, 5],
+          stride=2,
+          padding='SAME',
+          weights_initializer=self._WeightInit(0.09),
+          activation_fn=nn_ops.relu6,
+          scope='test/test')
+
+      reshape = array_ops.reshape(
+          conv, (int(10), int(height / 2), int(width / 2), int(16)))
+
+      # Insert a fake quant node after the reshape. We will check that one
+      # isn't inserted before it.
+      array_ops.fake_quant_with_min_max_vars(reshape, -1, 1)
+
+      quantize.Quantize(graph, is_training, weight_bits=8, activation_bits=8)
+
+      # Ensure that there isn't a FakeQuant added before the reshape.
+      self.assertFalse(
+          'FakeQuantWithMinMaxVars' in [i.op.type for i in reshape.op.inputs])
+
+    graph = ops.Graph()
+    with graph.as_default():
+      batch_size, height, width, depth = 5, 128, 128, 3
+      input1 = array_ops.zeros((batch_size, height, width, depth))
+      conv = conv2d(
+          input1,
+          32, [5, 5],
+          stride=2,
+          padding='SAME',
+          weights_initializer=self._WeightInit(0.09),
+          activation_fn=nn_ops.relu6,
+          scope='test/test')
+
+      reshape = array_ops.reshape(
+          conv, (int(10), int(height / 2), int(width / 2), int(16)))
+
+      # If no fake quant is added after the reshape, a FakeQuant should be added
+      # before the reshape.
+      quantize.Quantize(graph, is_training, weight_bits=8, activation_bits=8)
+
+      # Ensure that there is a FakeQuant added before the reshape.
+      self.assertTrue(
+          'FakeQuantWithMinMaxVars' in [i.op.type for i in reshape.op.inputs])
+
   def _WeightInit(self, stddev):
     """Returns truncated normal variable initializer.
 
diff --git a/tensorflow/contrib/recurrent/python/kernel_tests/functional_rnn_test.py b/tensorflow/contrib/recurrent/python/kernel_tests/functional_rnn_test.py
index 0f19ac7..f23194a 100644
--- a/tensorflow/contrib/recurrent/python/kernel_tests/functional_rnn_test.py
+++ b/tensorflow/contrib/recurrent/python/kernel_tests/functional_rnn_test.py
@@ -61,10 +61,17 @@
     func, args = self._CELLDEFS[celldef_name]
     return func(*args)
 
-  def _CreateInputs(self):
-    inputs = np.random.random([FunctionalRnnTest._BATCH_SIZE,
-                               FunctionalRnnTest._TOTAL_TIME,
-                               FunctionalRnnTest._INPUT_SIZE])
+  def _CreateInputs(self, time_major=False):
+    if time_major:
+      inputs = np.random.random([
+          FunctionalRnnTest._TOTAL_TIME, FunctionalRnnTest._BATCH_SIZE,
+          FunctionalRnnTest._INPUT_SIZE
+      ])
+    else:
+      inputs = np.random.random([
+          FunctionalRnnTest._BATCH_SIZE, FunctionalRnnTest._TOTAL_TIME,
+          FunctionalRnnTest._INPUT_SIZE
+      ])
     # Always leave one time slot empty, to check max_length behavior.
     sequence_length = np.random.randint(
         0, high=FunctionalRnnTest._TOTAL_TIME - 1,
@@ -72,15 +79,51 @@
         dtype=np.int)
     return (inputs, sequence_length)
 
-  def _CreateRnnGraph(self, create_rnn_computation_func, cell, tf_inputs,
-                      tf_sequence_length, initial_state=None,
-                      time_major=None, scope=None):
-    tf_result = create_rnn_computation_func(cell=cell, inputs=tf_inputs,
-                                            sequence_length=tf_sequence_length,
-                                            initial_state=initial_state,
-                                            dtype=dtypes.float32,
-                                            time_major=time_major,
-                                            scope=scope)
+  def _CreateSymmetricInputs(self):
+    # total time = batch size
+    inputs = np.zeros(
+        (FunctionalRnnTest._BATCH_SIZE, FunctionalRnnTest._BATCH_SIZE,
+         FunctionalRnnTest._INPUT_SIZE))
+    for i in range(FunctionalRnnTest._BATCH_SIZE):
+      for j in range(i, FunctionalRnnTest._BATCH_SIZE):
+        inputs[i][j] = np.random.random([FunctionalRnnTest._INPUT_SIZE])
+        inputs[j][i] = inputs[i][j]
+
+    # Always leave one time slot empty, to check max_length behavior.
+    sequence_length = np.random.randint(
+        0,
+        high=FunctionalRnnTest._BATCH_SIZE - 1,
+        size=FunctionalRnnTest._BATCH_SIZE,
+        dtype=np.int)
+    return (inputs, sequence_length)
+
+  def _CreateRnnGraph(self,
+                      create_rnn_computation_func,
+                      cell,
+                      tf_inputs,
+                      tf_sequence_length,
+                      is_bidirectional,
+                      initial_state=None,
+                      time_major=None,
+                      scope=None):
+    if is_bidirectional:
+      tf_result = create_rnn_computation_func(
+          cell_fw=cell,
+          cell_bw=cell,
+          inputs=tf_inputs,
+          sequence_length=tf_sequence_length,
+          dtype=dtypes.float32,
+          time_major=time_major,
+          scope=scope)
+    else:
+      tf_result = create_rnn_computation_func(
+          cell=cell,
+          inputs=tf_inputs,
+          sequence_length=tf_sequence_length,
+          initial_state=initial_state,
+          dtype=dtypes.float32,
+          time_major=time_major,
+          scope=scope)
     grad = gradients_impl.gradients(tf_result, variables.trainable_variables())
     return {'inference': tf_result, 'grad': grad}
 
@@ -102,15 +145,26 @@
         variable_cache[n] = v
 
   def _RunRnn(self, numpy_inputs, numpy_slen, cell_name, variable_cache,
-              is_dynamic):
+              is_dynamic, time_major=None, is_bidirectional=False):
     with ops.Graph().as_default() as graph:
       tf_inputs = array_ops.placeholder(
           dtypes.float32, shape=numpy_inputs.shape)
       tf_slen = array_ops.placeholder(dtypes.int32)
       feeds = {tf_inputs: numpy_inputs, tf_slen: numpy_slen}
       cell = self._CreateCell(cell_name)
-      fn = rnn_lib.dynamic_rnn if is_dynamic else functional_rnn.functional_rnn
-      fetches = self._CreateRnnGraph(fn, cell, tf_inputs, tf_slen)
+      if is_dynamic:
+        if is_bidirectional:
+          fn = rnn_lib.bidirectional_dynamic_rnn
+        else:
+          fn = rnn_lib.dynamic_rnn
+      else:
+        if is_bidirectional:
+          fn = functional_rnn.bidirectional_functional_rnn
+        else:
+          fn = functional_rnn.functional_rnn
+
+      fetches = self._CreateRnnGraph(
+          fn, cell, tf_inputs, tf_slen, is_bidirectional, time_major=time_major)
       with self.test_session(graph=graph) as sess:
         sess.run(variables.global_variables_initializer())
         # Note that cell.trainable_variables it not always set.
@@ -158,6 +212,78 @@
     self.assertAllClose(dyn_rnn['inference'], func_rnn['inference'])
     self.assertAllClose(dyn_rnn['grad'], func_rnn['grad'])
 
+  def testLstmWithTimeMajorInputs(self):
+    """Checks an LSTM against the reference implementation, with time_major."""
+    time_major = True
+    np_inputs, np_slen = self._CreateInputs(time_major=True)
+    var_cache = {}
+    args = [np_inputs, np_slen, 'lstm', var_cache]
+    _, func_rnn = self._RunRnn(*(args + [False]), time_major=time_major)
+    _, dyn_rnn = self._RunRnn(*(args + [True]), time_major=time_major)
+    self.assertAllClose(dyn_rnn['inference'], func_rnn['inference'])
+    self.assertAllClose(dyn_rnn['grad'], func_rnn['grad'])
+
+  def testBidirectionalLstmWithTimeMajorInputs(self):
+    """Checks a bi-directional LSTM with time-major inputs."""
+    time_major = True
+    np_inputs, np_slen = self._CreateInputs(time_major)
+    var_cache = {}
+    args = [np_inputs, np_slen, 'lstm', var_cache]
+    _, func_rnn = self._RunRnn(
+        *(args + [False]), time_major=time_major, is_bidirectional=True)
+    _, dyn_rnn = self._RunRnn(
+        *(args + [True]), time_major=time_major, is_bidirectional=True)
+    self.assertAllClose(dyn_rnn['inference'], func_rnn['inference'])
+      # TODO(b/112170761): uncomment this line after the bug is fixed.
+    # self.assertAllClose(dyn_rnn['grad'], func_rnn['grad'])
+
+  def testBidirectionalLstm(self):
+    """Checks time-major and batch-major rnn produce consistent results."""
+    time_major_inputs, np_slen = self._CreateInputs(True)
+    batch_major_inputs = np.transpose(time_major_inputs, [1, 0, 2])
+    var_cache = {}
+    args = [np_slen, 'lstm', var_cache, False]
+    _, time_major_rnn = self._RunRnn(
+        *([time_major_inputs] + args), time_major=True, is_bidirectional=True)
+    _, batch_major_rnn = self._RunRnn(
+        *([batch_major_inputs]+ args), time_major=False, is_bidirectional=True)
+    # Convert the batch-major outputs to be time-major before the comparison.
+    outputs, state = batch_major_rnn['inference']
+    outputs = [np.transpose(x, [1, 0, 2]) for x in outputs]
+    batch_major_rnn['inference'] = [outputs, state]
+    self.assertAllClose(time_major_rnn['inference'],
+                        batch_major_rnn['inference'])
+    self.assertAllClose(time_major_rnn['grad'], batch_major_rnn['grad'])
+
+  def testBidirectionalLstmWithSymmetricInputs(self):
+    """Checks a bi-directional LSTM with symmetric inputs.
+
+    time-major and batch-major rnn produce the same result with symmetric
+    inputs.
+    """
+    np_inputs, np_slen = self._CreateSymmetricInputs()
+    var_cache = {}
+    args = [np_inputs, np_slen, 'lstm', var_cache]
+    _, time_major_func_rnn = self._RunRnn(
+        *(args + [False]), time_major=True, is_bidirectional=True)
+    _, batch_major_func_rnn = self._RunRnn(
+        *(args + [False]), time_major=False, is_bidirectional=True)
+    _, time_major_dyn_rnn = self._RunRnn(
+        *(args + [True]), time_major=True, is_bidirectional=True)
+    _, batch_major_dyn_rnn = self._RunRnn(
+        *(args + [True]), time_major=False, is_bidirectional=True)
+    self.assertAllClose(time_major_func_rnn['inference'],
+                        batch_major_func_rnn['inference'])
+    self.assertAllClose(time_major_func_rnn['grad'],
+                        batch_major_func_rnn['grad'])
+    self.assertAllClose(time_major_dyn_rnn['inference'],
+                        batch_major_dyn_rnn['inference'])
+    self.assertAllClose(time_major_dyn_rnn['grad'], batch_major_dyn_rnn['grad'])
+    self.assertAllClose(time_major_func_rnn['inference'],
+                        batch_major_dyn_rnn['inference'])
+    self.assertAllClose(time_major_func_rnn['grad'],
+                        batch_major_dyn_rnn['grad'])
+
 
 if __name__ == '__main__':
   test_lib.main()
diff --git a/tensorflow/contrib/recurrent/python/ops/functional_rnn.py b/tensorflow/contrib/recurrent/python/ops/functional_rnn.py
index a085474..67a8f59 100644
--- a/tensorflow/contrib/recurrent/python/ops/functional_rnn.py
+++ b/tensorflow/contrib/recurrent/python/ops/functional_rnn.py
@@ -206,7 +206,7 @@
     lengths = array_ops.tile(array_ops.reshape(sequence_length,
                                                [-1, 1]), [1, max_time])
     last_idx = math_ops.cast(math_ops.equal(output_time, lengths - 1),
-                             dtype=dtypes.float32)
+                             dtype=state_var.dtype)
     last_idx = array_ops.transpose(last_idx)
     last_idx_for_bcast = array_ops.expand_dims(last_idx, -1)
     sliced = math_ops.multiply(last_idx_for_bcast, state_var)
@@ -284,8 +284,13 @@
       inputs=inputs,
       cell_fn=func_cell.cell_step,
       use_tpu=use_tpu)
-  return _PostProcessOutput(extended_acc_state, extended_final_state,
-                            func_cell, inputs_flat[0].shape[0], sequence_length)
+  tf_output, tf_state = _PostProcessOutput(
+      extended_acc_state, extended_final_state, func_cell,
+      inputs_flat[0].shape[0], sequence_length)
+
+  if time_major:
+    tf_output = array_ops.transpose(tf_output, [1, 0, 2])
+  return tf_output, tf_state
 
 
 def bidirectional_functional_rnn(
diff --git a/tensorflow/contrib/rnn/BUILD b/tensorflow/contrib/rnn/BUILD
index 2a846290..5874245 100644
--- a/tensorflow/contrib/rnn/BUILD
+++ b/tensorflow/contrib/rnn/BUILD
@@ -149,7 +149,7 @@
 
 cuda_py_tests(
     name = "core_rnn_test",
-    size = "large",
+    size = "medium",
     srcs = ["python/kernel_tests/core_rnn_test.py"],
     additional_deps = [
         ":rnn_py",
@@ -175,7 +175,7 @@
 
 tf_py_test(
     name = "fused_rnn_cell_test",
-    size = "small",
+    size = "medium",
     srcs = ["python/kernel_tests/fused_rnn_cell_test.py"],
     additional_deps = [
         ":rnn_py",
@@ -192,10 +192,6 @@
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
     ],
-    tags = [
-        "manual",
-        "notap",
-    ],
 )
 
 cuda_py_tests(
diff --git a/tensorflow/contrib/rnn/__init__.py b/tensorflow/contrib/rnn/__init__.py
index cb437f2..026bf08 100644
--- a/tensorflow/contrib/rnn/__init__.py
+++ b/tensorflow/contrib/rnn/__init__.py
@@ -14,7 +14,7 @@
 # ==============================================================================
 """RNN Cells and additional RNN operations.
 
-See @{$python/contrib.rnn} guide.
+See [Contrib RNN](https://tensorflow.org/api_guides/python/contrib.rnn) guide.
 
 <!--From core-->
 @@RNNCell
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py
index 1c20d88..d62ec45 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py
@@ -1288,7 +1288,10 @@
   @test_util.run_in_graph_and_eager_modes
   def testDynamicEquivalentToStaticRNN(self):
     self._testDynamicEquivalentToStaticRNN(use_sequence_length=False)
-    self._testDynamicEquivalentToStaticRNN(use_sequence_length=False)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDynamicEquivalentToStaticRNNWithSequenceLength(self):
+    self._testDynamicEquivalentToStaticRNN(use_sequence_length=True)
 
 
 class BidirectionalRNNTest(test.TestCase):
diff --git a/tensorflow/contrib/rnn/python/ops/rnn_cell.py b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
index 1816b46..f74c95f 100644
--- a/tensorflow/contrib/rnn/python/ops/rnn_cell.py
+++ b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
@@ -3276,7 +3276,7 @@
   It does not allow cell clipping, a projection layer, and does not
   use peep-hole connections: it is the basic baseline.
 
-  For advanced models, please use the full @{tf.nn.rnn_cell.LSTMCell}
+  For advanced models, please use the full `tf.nn.rnn_cell.LSTMCell`
   that follows.
 
   TODO(gonnet): Write a paper describing this and add a reference here.
diff --git a/tensorflow/contrib/saved_model/BUILD b/tensorflow/contrib/saved_model/BUILD
index fbb50be..e7eb4ac 100644
--- a/tensorflow/contrib/saved_model/BUILD
+++ b/tensorflow/contrib/saved_model/BUILD
@@ -113,7 +113,6 @@
     size = "small",
     srcs = ["python/saved_model/keras_saved_model_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],
     deps = [
         ":saved_model_py",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/contrib/seq2seq/BUILD b/tensorflow/contrib/seq2seq/BUILD
index 1a1591d..18b56cd 100644
--- a/tensorflow/contrib/seq2seq/BUILD
+++ b/tensorflow/contrib/seq2seq/BUILD
@@ -177,7 +177,7 @@
 
 cuda_py_test(
     name = "beam_search_decoder_test",
-    size = "small",
+    size = "medium",
     srcs = ["python/kernel_tests/beam_search_decoder_test.py"],
     additional_deps = [
         ":seq2seq_py",
diff --git a/tensorflow/contrib/seq2seq/__init__.py b/tensorflow/contrib/seq2seq/__init__.py
index a7279bc..674f7cd 100644
--- a/tensorflow/contrib/seq2seq/__init__.py
+++ b/tensorflow/contrib/seq2seq/__init__.py
@@ -15,7 +15,9 @@
 
 """Ops for building neural network seq2seq decoders and losses.
 
-See the @{$python/contrib.seq2seq} guide.
+See the
+[Contrib Seq2seq](https://tensorflow.org/api_guides/python/contrib.seq2seq)
+guide.
 """
 
 from __future__ import absolute_import
diff --git a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
index 1c9d179..0ba32cd 100644
--- a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
@@ -382,8 +382,8 @@
         for values past the respective sequence lengths.
       scale: Python boolean.  Whether to scale the energy term.
       probability_fn: (optional) A `callable`.  Converts the score to
-        probabilities.  The default is @{tf.nn.softmax}. Other options include
-        @{tf.contrib.seq2seq.hardmax} and @{tf.contrib.sparsemax.sparsemax}.
+        probabilities.  The default is `tf.nn.softmax`. Other options include
+        `tf.contrib.seq2seq.hardmax` and `tf.contrib.sparsemax.sparsemax`.
         Its signature should be: `probabilities = probability_fn(score)`.
       score_mask_value: (optional) The mask value for score before passing into
         `probability_fn`. The default is -inf. Only used if
@@ -529,8 +529,8 @@
         for values past the respective sequence lengths.
       normalize: Python boolean.  Whether to normalize the energy term.
       probability_fn: (optional) A `callable`.  Converts the score to
-        probabilities.  The default is @{tf.nn.softmax}. Other options include
-        @{tf.contrib.seq2seq.hardmax} and @{tf.contrib.sparsemax.sparsemax}.
+        probabilities.  The default is `tf.nn.softmax`. Other options include
+        `tf.contrib.seq2seq.hardmax` and `tf.contrib.sparsemax.sparsemax`.
         Its signature should be: `probabilities = probability_fn(score)`.
       score_mask_value: (optional): The mask value for score before passing into
         `probability_fn`. The default is -inf. Only used if
@@ -1091,7 +1091,7 @@
     `AttentionWrapper`, then you must ensure that:
 
     - The encoder output has been tiled to `beam_width` via
-      @{tf.contrib.seq2seq.tile_batch} (NOT `tf.tile`).
+      `tf.contrib.seq2seq.tile_batch` (NOT `tf.tile`).
     - The `batch_size` argument passed to the `zero_state` method of this
       wrapper is equal to `true_batch_size * beam_width`.
     - The initial state created with `zero_state` above contains a
diff --git a/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py b/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
index f17dbb0..74741a7 100644
--- a/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
+++ b/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
@@ -234,7 +234,7 @@
     `AttentionWrapper`, then you must ensure that:
 
     - The encoder output has been tiled to `beam_width` via
-      @{tf.contrib.seq2seq.tile_batch} (NOT `tf.tile`).
+      `tf.contrib.seq2seq.tile_batch` (NOT `tf.tile`).
     - The `batch_size` argument passed to the `zero_state` method of this
       wrapper is equal to `true_batch_size * beam_width`.
     - The initial state created with `zero_state` above contains a
diff --git a/tensorflow/contrib/signal/__init__.py b/tensorflow/contrib/signal/__init__.py
index 6a2080b..d088e74 100644
--- a/tensorflow/contrib/signal/__init__.py
+++ b/tensorflow/contrib/signal/__init__.py
@@ -14,7 +14,9 @@
 # ==============================================================================
 """Signal processing operations.
 
-See the @{$python/contrib.signal} guide.
+See the
+[Contrib Signal](https://tensorflow.org/api_guides/python/contrib.signal)
+guide.
 
 @@frame
 @@hamming_window
diff --git a/tensorflow/contrib/signal/python/kernel_tests/test_util.py b/tensorflow/contrib/signal/python/kernel_tests/test_util.py
index 7d62895..b4422a4 100644
--- a/tensorflow/contrib/signal/python/kernel_tests/test_util.py
+++ b/tensorflow/contrib/signal/python/kernel_tests/test_util.py
@@ -27,15 +27,15 @@
   """Tries to optimize the provided graph using grappler.
 
   Args:
-    graph: A @{tf.Graph} instance containing the graph to optimize.
+    graph: A `tf.Graph` instance containing the graph to optimize.
     fetches: An optional list of `Tensor`s to fetch (i.e. not optimize away).
       Grappler uses the 'train_op' collection to look for fetches, so if not
       provided this collection should be non-empty.
-    rewriter_config: An optional @{tf.RewriterConfig} to use when rewriting the
+    rewriter_config: An optional `tf.RewriterConfig` to use when rewriting the
       graph.
 
   Returns:
-    A @{tf.GraphDef} containing the rewritten graph.
+    A `tf.GraphDef` containing the rewritten graph.
   """
   if rewriter_config is None:
     rewriter_config = rewriter_config_pb2.RewriterConfig()
diff --git a/tensorflow/contrib/signal/python/ops/mel_ops.py b/tensorflow/contrib/signal/python/ops/mel_ops.py
index 062d84a..ecc2fed 100644
--- a/tensorflow/contrib/signal/python/ops/mel_ops.py
+++ b/tensorflow/contrib/signal/python/ops/mel_ops.py
@@ -108,7 +108,7 @@
       # `M` has shape [frames, num_mel_bins]
       M = tf.matmul(S, A)
 
-  The matrix can be used with @{tf.tensordot} to convert an arbitrary rank
+  The matrix can be used with `tf.tensordot` to convert an arbitrary rank
   `Tensor` of linear-scale spectral bins into the mel scale.
 
       # S has shape [..., num_spectrogram_bins].
diff --git a/tensorflow/contrib/signal/python/ops/reconstruction_ops.py b/tensorflow/contrib/signal/python/ops/reconstruction_ops.py
index 653c030..4db8dc2 100644
--- a/tensorflow/contrib/signal/python/ops/reconstruction_ops.py
+++ b/tensorflow/contrib/signal/python/ops/reconstruction_ops.py
@@ -90,22 +90,28 @@
       raise ValueError("frame_step must be an integer. Got %s" %
                        frame_step.dtype)
 
-    # If frame_length and frame_step are known at graph construction time, check
-    # frame_step is less than or equal to frame_length.
-    frame_step_static = tensor_util.constant_value(frame_step)
-    if (frame_step_static is not None and signal.shape.ndims is not None and
-        signal.shape[-1].value is not None and
-        frame_step_static > signal.shape[-1].value):
-      raise ValueError(
-          "frame_step (%d) must be less than or equal to frame_length (%d)" % (
-              frame_step_static, signal.shape[-1].value))
-
     signal_shape = array_ops.shape(signal)
 
     # All dimensions that are not part of the overlap-and-add. Can be empty for
     # rank 2 inputs.
     outer_dimensions = signal_shape[:-2]
 
+    # If frame_length and frame_step are known at graph construction time, check
+    # frame_step is less than or equal to frame_length.
+    frame_step_static = tensor_util.constant_value(frame_step)
+    if (frame_step_static is not None and signal.shape.ndims is not None and
+        signal.shape[-1].value is not None):
+      if frame_step_static > signal.shape[-1].value:
+        raise ValueError(
+            "frame_step (%d) must be less than or equal to "
+            "frame_length (%d)" % (
+                frame_step_static, signal.shape[-1].value))
+      # If frame_length is equal to frame_step, there's no overlap so just
+      # reshape the tensor.
+      if frame_step_static == signal.shape[-1].value:
+        return array_ops.reshape(signal, array_ops.concat(
+            [outer_dimensions, [-1]], 0))
+
     signal_rank = array_ops.rank(signal)
     frames = signal_shape[-2]
     frame_length = signal_shape[-1]
diff --git a/tensorflow/contrib/slim/python/slim/evaluation.py b/tensorflow/contrib/slim/python/slim/evaluation.py
index 5cfd5ee..0feb392 100644
--- a/tensorflow/contrib/slim/python/slim/evaluation.py
+++ b/tensorflow/contrib/slim/python/slim/evaluation.py
@@ -22,7 +22,8 @@
 **********************
 
 In the simplest use case, we use a model to create the predictions, then specify
-the metrics and finally call the `evaluation` method:
+the metrics, choose one model checkpoint, and finally call the `evaluate_once`
+method:
 
   # Create model and obtain the predictions:
   images, labels = LoadData(...)
@@ -34,20 +35,24 @@
       "mse": slim.metrics.mean_squared_error(predictions, labels),
   })
 
+  checkpoint_path = '/tmp/my_model_dir/my_checkpoint'
+  log_dir = '/tmp/my_model_eval/'
+
   initial_op = tf.group(
       tf.global_variables_initializer(),
       tf.local_variables_initializer())
 
-  with tf.Session() as sess:
-    metric_values = slim.evaluation(
-        sess,
-        num_evals=1,
-        initial_op=initial_op,
-        eval_op=names_to_updates.values(),
-        final_op=name_to_values.values())
+  metric_values = slim.evaluate_once(
+      master='',
+      checkpoint_path=checkpoint_path,
+      log_dir=log_dir,
+      num_evals=1,
+      initial_op=initial_op,
+      eval_op=names_to_updates.values(),
+      final_op=name_to_values.values())
 
-    for metric, value in zip(names_to_values.keys(), metric_values):
-      logging.info('Metric %s has value: %f', metric, value)
+  for metric, value in zip(names_to_values.keys(), metric_values):
+    logging.info('Metric %s has value: %f', metric, value)
 
 ************************************************
 * Evaluating a Checkpointed Model with Metrics *
diff --git a/tensorflow/contrib/stat_summarizer/BUILD b/tensorflow/contrib/stat_summarizer/BUILD
index 0b8fc0c..412a2c8 100644
--- a/tensorflow/contrib/stat_summarizer/BUILD
+++ b/tensorflow/contrib/stat_summarizer/BUILD
@@ -31,8 +31,5 @@
         "//tensorflow/python:math_ops",
         "//tensorflow/python:variables",
     ],
-    tags = [
-        "no_windows",
-        "notap",  # TODO(b/80546574): test is flaky
-    ],
+    tags = ["notap"],  # TODO(b/80546574): test is flaky
 )
diff --git a/tensorflow/contrib/summary/summary.py b/tensorflow/contrib/summary/summary.py
index d22b80a..42898e7 100644
--- a/tensorflow/contrib/summary/summary.py
+++ b/tensorflow/contrib/summary/summary.py
@@ -17,7 +17,7 @@
 The operations in this package are safe to use with eager execution turned on or
 off. It has a more flexible API that allows summaries to be written directly
 from ops to places other than event log files, rather than propagating protos
-from @{tf.summary.merge_all} to @{tf.summary.FileWriter}.
+from `tf.summary.merge_all` to `tf.summary.FileWriter`.
 
 To use with eager execution enabled, write your code as follows:
 
diff --git a/tensorflow/contrib/tensor_forest/BUILD b/tensorflow/contrib/tensor_forest/BUILD
index 164f3e5..22d6e49 100644
--- a/tensorflow/contrib/tensor_forest/BUILD
+++ b/tensorflow/contrib/tensor_forest/BUILD
@@ -515,6 +515,7 @@
     srcs_version = "PY2AND3",
     deps = [
         ":client_lib",
+        "//tensorflow/contrib/estimator:head",
         "//tensorflow/contrib/layers:layers_py",
         "//tensorflow/contrib/learn",
         "//tensorflow/python:array_ops",
diff --git a/tensorflow/contrib/tensor_forest/client/random_forest.py b/tensorflow/contrib/tensor_forest/client/random_forest.py
index 35e8c92..db970de 100644
--- a/tensorflow/contrib/tensor_forest/client/random_forest.py
+++ b/tensorflow/contrib/tensor_forest/client/random_forest.py
@@ -18,14 +18,16 @@
 from __future__ import print_function
 
 from tensorflow.contrib import layers
+from tensorflow.contrib.estimator.python.estimator import head as core_head_lib
 from tensorflow.contrib.learn.python.learn.estimators import constants
 from tensorflow.contrib.learn.python.learn.estimators import estimator
 from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
 from tensorflow.contrib.learn.python.learn.estimators import model_fn as model_fn_lib
-
 from tensorflow.contrib.tensor_forest.client import eval_metrics
 from tensorflow.contrib.tensor_forest.python import tensor_forest
-
+from tensorflow.python.estimator import estimator as core_estimator
+from tensorflow.python.estimator.export.export_output import PredictOutput
+from tensorflow.python.feature_column import feature_column as fc_core
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
@@ -34,12 +36,12 @@
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops.losses import losses
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.summary import summary
 from tensorflow.python.training import session_run_hook
 from tensorflow.python.training import training_util
 
-
 KEYS_NAME = 'keys'
 LOSS_NAME = 'rf_training_loss'
 TREE_PATHS_PREDICTION_KEY = 'tree_paths'
@@ -48,6 +50,11 @@
 EPSILON = 0.000001
 
 
+class ModelBuilderOutputType(object):
+  MODEL_FN_OPS = 0
+  ESTIMATOR_SPEC = 1
+
+
 class TensorForestRunOpAtEndHook(session_run_hook.SessionRunHook):
 
   def __init__(self, op_dict):
@@ -106,20 +113,40 @@
       run_context.request_stop()
 
 
-def get_default_head(params, weights_name, name=None):
-  if params.regression:
-    return head_lib.regression_head(
-        weight_column_name=weights_name,
-        label_dimension=params.num_outputs,
-        enable_centered_bias=False,
-        head_name=name)
+def _get_default_head(params, weights_name, output_type, name=None):
+  """Creates a default head based on a type of a problem."""
+  if output_type == ModelBuilderOutputType.MODEL_FN_OPS:
+    if params.regression:
+      return head_lib.regression_head(
+          weight_column_name=weights_name,
+          label_dimension=params.num_outputs,
+          enable_centered_bias=False,
+          head_name=name)
+    else:
+      return head_lib.multi_class_head(
+          params.num_classes,
+          weight_column_name=weights_name,
+          enable_centered_bias=False,
+          head_name=name)
   else:
-    return head_lib.multi_class_head(
-        params.num_classes,
-        weight_column_name=weights_name,
-        enable_centered_bias=False,
-        head_name=name)
-
+    if params.regression:
+      return core_head_lib.regression_head(
+          weight_column=weights_name,
+          label_dimension=params.num_outputs,
+          name=name,
+          loss_reduction=losses.Reduction.SUM_OVER_NONZERO_WEIGHTS)
+    else:
+      if params.num_classes == 2:
+        return core_head_lib.binary_classification_head(
+            weight_column=weights_name,
+            name=name,
+            loss_reduction=losses.Reduction.SUM_OVER_NONZERO_WEIGHTS)
+      else:
+        return core_head_lib.multi_class_head(
+            n_classes=params.num_classes,
+            weight_column=weights_name,
+            name=name,
+            loss_reduction=losses.Reduction.SUM_OVER_NONZERO_WEIGHTS)
 
 def get_model_fn(params,
                  graph_builder_class,
@@ -135,19 +162,27 @@
                  report_feature_importances=False,
                  local_eval=False,
                  head_scope=None,
-                 include_all_in_serving=False):
+                 include_all_in_serving=False,
+                 output_type=ModelBuilderOutputType.MODEL_FN_OPS):
   """Return a model function given a way to construct a graph builder."""
   if model_head is None:
-    model_head = get_default_head(params, weights_name)
+    model_head = _get_default_head(params, weights_name, output_type)
 
   def _model_fn(features, labels, mode):
     """Function that returns predictions, training loss, and training op."""
+
     if (isinstance(features, ops.Tensor) or
         isinstance(features, sparse_tensor.SparseTensor)):
       features = {'features': features}
     if feature_columns:
       features = features.copy()
-      features.update(layers.transform_features(features, feature_columns))
+
+      if output_type == ModelBuilderOutputType.MODEL_FN_OPS:
+        features.update(layers.transform_features(features, feature_columns))
+      else:
+        for fc in feature_columns:
+          tensor = fc_core._transform_features(features, [fc])[fc]  # pylint: disable=protected-access
+          features[fc.name] = tensor
 
     weights = None
     if weights_name and weights_name in features:
@@ -201,52 +236,95 @@
     def _train_fn(unused_loss):
       return training_graph
 
-    model_ops = model_head.create_model_fn_ops(
-        features=features,
-        labels=labels,
-        mode=mode,
-        train_op_fn=_train_fn,
-        logits=logits,
-        scope=head_scope)
 
     # Ops are run in lexigraphical order of their keys. Run the resource
     # clean-up op last.
     all_handles = graph_builder.get_all_resource_handles()
     ops_at_end = {
-        '9: clean up resources': control_flow_ops.group(
-            *[resource_variable_ops.destroy_resource_op(handle)
-              for handle in all_handles])}
+        '9: clean up resources':
+            control_flow_ops.group(*[
+                resource_variable_ops.destroy_resource_op(handle)
+                for handle in all_handles
+            ])
+    }
 
     if report_feature_importances:
       ops_at_end['1: feature_importances'] = (
           graph_builder.feature_importances())
 
-    training_hooks.append(TensorForestRunOpAtEndHook(ops_at_end))
+    training_hooks = [TensorForestRunOpAtEndHook(ops_at_end)]
 
-    if early_stopping_rounds:
-      training_hooks.append(
-          TensorForestLossHook(
-              early_stopping_rounds,
-              early_stopping_loss_threshold=early_stopping_loss_threshold,
-              loss_op=model_ops.loss))
+    if output_type == ModelBuilderOutputType.MODEL_FN_OPS:
+      model_ops = model_head.create_model_fn_ops(
+          features=features,
+          labels=labels,
+          mode=mode,
+          train_op_fn=_train_fn,
+          logits=logits,
+          scope=head_scope)
 
-    model_ops.training_hooks.extend(training_hooks)
+      if early_stopping_rounds:
+        training_hooks.append(
+            TensorForestLossHook(
+                early_stopping_rounds,
+                early_stopping_loss_threshold=early_stopping_loss_threshold,
+                loss_op=model_ops.loss))
 
-    if keys is not None:
-      model_ops.predictions[keys_name] = keys
+      model_ops.training_hooks.extend(training_hooks)
 
-    if params.inference_tree_paths:
-      model_ops.predictions[TREE_PATHS_PREDICTION_KEY] = tree_paths
+      if keys is not None:
+        model_ops.predictions[keys_name] = keys
 
-    model_ops.predictions[VARIANCE_PREDICTION_KEY] = regression_variance
-    if include_all_in_serving:
-      # In order to serve the variance we need to add the prediction dict
-      # to output_alternatives dict.
-      if not model_ops.output_alternatives:
-        model_ops.output_alternatives = {}
-      model_ops.output_alternatives[ALL_SERVING_KEY] = (
-          constants.ProblemType.UNSPECIFIED, model_ops.predictions)
-    return model_ops
+      if params.inference_tree_paths:
+        model_ops.predictions[TREE_PATHS_PREDICTION_KEY] = tree_paths
+
+      model_ops.predictions[VARIANCE_PREDICTION_KEY] = regression_variance
+
+      if include_all_in_serving:
+        # In order to serve the variance we need to add the prediction dict
+        # to output_alternatives dict.
+        if not model_ops.output_alternatives:
+          model_ops.output_alternatives = {}
+        model_ops.output_alternatives[ALL_SERVING_KEY] = (
+            constants.ProblemType.UNSPECIFIED, model_ops.predictions)
+
+      return model_ops
+
+    else:
+      # Estimator spec
+      estimator_spec = model_head.create_estimator_spec(
+          features=features,
+          mode=mode,
+          labels=labels,
+          train_op_fn=_train_fn,
+          logits=logits)
+
+      if early_stopping_rounds:
+        training_hooks.append(
+            TensorForestLossHook(
+                early_stopping_rounds,
+                early_stopping_loss_threshold=early_stopping_loss_threshold,
+                loss_op=estimator_spec.loss))
+
+      estimator_spec = estimator_spec._replace(
+          training_hooks=training_hooks + list(estimator_spec.training_hooks))
+      if keys is not None:
+        estimator_spec.predictions[keys_name] = keys
+      if params.inference_tree_paths:
+        estimator_spec.predictions[TREE_PATHS_PREDICTION_KEY] = tree_paths
+      estimator_spec.predictions[VARIANCE_PREDICTION_KEY] = regression_variance
+
+      if include_all_in_serving:
+        outputs = estimator_spec.export_outputs
+        if not outputs:
+          outputs = {}
+        outputs = {ALL_SERVING_KEY: PredictOutput(estimator_spec.predictions)}
+        print(estimator_spec.export_outputs)
+        # In order to serve the variance we need to add the prediction dict
+        # to output_alternatives dict.
+        estimator_spec = estimator_spec._replace(export_outputs=outputs)
+
+      return estimator_spec
 
   return _model_fn
 
@@ -493,8 +571,11 @@
               params,
               graph_builder_class,
               device_assigner,
-              model_head=get_default_head(
-                  params, weight_column, name='head{0}'.format(i)),
+              model_head=_get_default_head(
+                  params,
+                  weight_column,
+                  name='head{0}'.format(i),
+                  output_type=ModelBuilderOutputType.MODEL_FN_OPS),
               weights_name=weight_column,
               keys_name=keys_column,
               early_stopping_rounds=early_stopping_rounds,
@@ -509,3 +590,142 @@
         model_dir=model_dir,
         config=config,
         feature_engineering_fn=feature_engineering_fn)
+
+
+class CoreTensorForestEstimator(core_estimator.Estimator):
+  """A CORE estimator that can train and evaluate a random forest.
+
+  Example:
+
+  ```python
+  params = tf.contrib.tensor_forest.python.tensor_forest.ForestHParams(
+      num_classes=2, num_features=40, num_trees=10, max_nodes=1000)
+
+  # Estimator using the default graph builder.
+  estimator = CoreTensorForestEstimator(params, model_dir=model_dir)
+
+  # Or estimator using TrainingLossForest as the graph builder.
+  estimator = CoreTensorForestEstimator(
+      params, graph_builder_class=tensor_forest.TrainingLossForest,
+      model_dir=model_dir)
+
+  # Input builders
+  def input_fn_train():  # returns x, y
+    ...
+  def input_fn_eval():  # returns x, y
+    ...
+  estimator.train(input_fn=input_fn_train)
+  estimator.evaluate(input_fn=input_fn_eval)
+
+  # Predict returns an iterable of dicts.
+  results = list(estimator.predict(x=x))
+  prob0 = results[0][eval_metrics.INFERENCE_PROB_NAME]
+  prediction0 = results[0][eval_metrics.INFERENCE_PRED_NAME]
+  ```
+  """
+
+  def __init__(self,
+               params,
+               device_assigner=None,
+               model_dir=None,
+               feature_columns=None,
+               graph_builder_class=tensor_forest.RandomForestGraphs,
+               config=None,
+               weight_column=None,
+               keys_column=None,
+               feature_engineering_fn=None,
+               early_stopping_rounds=100,
+               early_stopping_loss_threshold=0.001,
+               num_trainers=1,
+               trainer_id=0,
+               report_feature_importances=False,
+               local_eval=False,
+               version=None,
+               head=None,
+               include_all_in_serving=False):
+    """Initializes a CoreTensorForestEstimator instance.
+
+    Args:
+      params: ForestHParams object that holds random forest hyperparameters.
+        These parameters will be passed into `model_fn`.
+      device_assigner: An `object` instance that controls how trees get
+        assigned to devices. If `None`, will use
+        `tensor_forest.RandomForestDeviceAssigner`.
+      model_dir: Directory to save model parameters, graph, etc. To continue
+        training a previously saved model, load checkpoints saved to this
+        directory into an estimator.
+      feature_columns: An iterable containing all the feature columns used by
+        the model. All items in the set should be instances of classes derived
+        from `_FeatureColumn`.
+      graph_builder_class: An `object` instance that defines how TF graphs for
+        random forest training and inference are built. By default will use
+        `tensor_forest.RandomForestGraphs`. Can be overridden by version
+        kwarg.
+      config: `RunConfig` object to configure the runtime settings.
+      weight_column: A string defining feature column name representing
+        weights. Will be multiplied by the loss of the example. Used to
+        downweight or boost examples during training.
+      keys_column: A string naming one of the features to strip out and
+        pass through into the inference/eval results dict.  Useful for
+        associating specific examples with their prediction.
+      feature_engineering_fn: Feature engineering function. Takes features and
+        labels which are the output of `input_fn` and returns features and
+        labels which will be fed into the model.
+      early_stopping_rounds: Allows training to terminate early if the forest is
+        no longer growing. 100 by default.  Set to a Falsy value to disable
+        the default training hook.
+      early_stopping_loss_threshold: Percentage (as fraction) that loss must
+        improve by within early_stopping_rounds steps, otherwise training will
+        terminate.
+      num_trainers: Number of training jobs, which will partition trees
+        among them.
+      trainer_id: Which trainer this instance is.
+      report_feature_importances: If True, print out feature importances
+        during evaluation.
+      local_eval: If True, don't use a device assigner for eval. This is to
+        support some common setups where eval is done on a single machine, even
+        though training might be distributed.
+      version: Unused.
+      head: A heads_lib.Head object that calculates losses and such. If None,
+        one will be automatically created based on params.
+      include_all_in_serving: if True, allow preparation of the complete
+        prediction dict including the variance to be exported for serving with
+        the Servo lib; and it also requires calling export_savedmodel with
+        default_output_alternative_key=ALL_SERVING_KEY, i.e.
+        estimator.export_savedmodel(export_dir_base=your_export_dir,
+          serving_input_fn=your_export_input_fn,
+          default_output_alternative_key=ALL_SERVING_KEY)
+        if False, resort to default behavior, i.e. export scores and
+          probabilities but no variances. In this case
+          default_output_alternative_key should be None while calling
+          export_savedmodel().
+        Note, that due to backward compatibility we cannot always set
+        include_all_in_serving to True because in this case calling
+        export_saved_model() without
+        default_output_alternative_key=ALL_SERVING_KEY (legacy behavior) the
+        saved_model_export_utils.get_output_alternatives() would raise
+        ValueError.
+
+    Returns:
+      A `CoreTensorForestEstimator` instance.
+    """
+
+    super(CoreTensorForestEstimator, self).__init__(
+        model_fn=get_model_fn(
+            params.fill(),
+            graph_builder_class,
+            device_assigner,
+            feature_columns=feature_columns,
+            model_head=head,
+            weights_name=weight_column,
+            keys_name=keys_column,
+            early_stopping_rounds=early_stopping_rounds,
+            early_stopping_loss_threshold=early_stopping_loss_threshold,
+            num_trainers=num_trainers,
+            trainer_id=trainer_id,
+            report_feature_importances=report_feature_importances,
+            local_eval=local_eval,
+            include_all_in_serving=include_all_in_serving,
+            output_type=ModelBuilderOutputType.ESTIMATOR_SPEC),
+        model_dir=model_dir,
+        config=config)
diff --git a/tensorflow/contrib/tensor_forest/client/random_forest_test.py b/tensorflow/contrib/tensor_forest/client/random_forest_test.py
index ac42364..aa0016b 100644
--- a/tensorflow/contrib/tensor_forest/client/random_forest_test.py
+++ b/tensorflow/contrib/tensor_forest/client/random_forest_test.py
@@ -23,7 +23,39 @@
 from tensorflow.contrib.learn.python.learn.datasets import base
 from tensorflow.contrib.tensor_forest.client import random_forest
 from tensorflow.contrib.tensor_forest.python import tensor_forest
+from tensorflow.python.estimator.canned import head as head_lib
+from tensorflow.python.estimator.inputs import numpy_io
+from tensorflow.python.feature_column import feature_column_lib as core_feature_column
+from tensorflow.python.framework import ops
+from tensorflow.python.ops.losses import losses
 from tensorflow.python.platform import test
+from tensorflow.python.training import checkpoint_utils
+
+
+def _get_classification_input_fns():
+  iris = base.load_iris()
+  data = iris.data.astype(np.float32)
+  labels = iris.target.astype(np.int32)
+
+  train_input_fn = numpy_io.numpy_input_fn(
+      x=data, y=labels, batch_size=150, num_epochs=None, shuffle=False)
+
+  predict_input_fn = numpy_io.numpy_input_fn(
+      x=data[:1,], y=None, batch_size=1, num_epochs=1, shuffle=False)
+  return train_input_fn, predict_input_fn
+
+
+def _get_regression_input_fns():
+  boston = base.load_boston()
+  data = boston.data.astype(np.float32)
+  labels = boston.target.astype(np.int32)
+
+  train_input_fn = numpy_io.numpy_input_fn(
+      x=data, y=labels, batch_size=506, num_epochs=None, shuffle=False)
+
+  predict_input_fn = numpy_io.numpy_input_fn(
+      x=data[:1,], y=None, batch_size=1, num_epochs=1, shuffle=False)
+  return train_input_fn, predict_input_fn
 
 
 class TensorForestTrainerTests(test.TestCase):
@@ -39,18 +71,22 @@
         inference_tree_paths=True)
     classifier = random_forest.TensorForestEstimator(hparams.fill())
 
-    iris = base.load_iris()
-    data = iris.data.astype(np.float32)
-    labels = iris.target.astype(np.int32)
+    input_fn, predict_input_fn = _get_classification_input_fns()
+    classifier.fit(input_fn=input_fn, steps=100)
+    res = classifier.evaluate(input_fn=input_fn, steps=10)
 
-    classifier.fit(x=data, y=labels, steps=100, batch_size=50)
-    classifier.evaluate(x=data, y=labels, steps=10)
+    self.assertEqual(1.0, res['accuracy'])
+    self.assertAllClose(0.55144483, res['loss'])
+
+    predictions = list(classifier.predict(input_fn=predict_input_fn))
+    self.assertAllClose([[0.576117, 0.211942, 0.211942]],
+                        [pred['probabilities'] for pred in predictions])
 
   def testRegression(self):
-    """Tests multi-class classification using matrix data as input."""
+    """Tests regression using matrix data as input."""
 
     hparams = tensor_forest.ForestHParams(
-        num_trees=3,
+        num_trees=5,
         max_nodes=1000,
         num_classes=1,
         num_features=13,
@@ -59,12 +95,263 @@
 
     regressor = random_forest.TensorForestEstimator(hparams.fill())
 
-    boston = base.load_boston()
-    data = boston.data.astype(np.float32)
-    labels = boston.target.astype(np.int32)
+    input_fn, predict_input_fn = _get_regression_input_fns()
 
-    regressor.fit(x=data, y=labels, steps=100, batch_size=50)
-    regressor.evaluate(x=data, y=labels, steps=10)
+    regressor.fit(input_fn=input_fn, steps=100)
+    res = regressor.evaluate(input_fn=input_fn, steps=10)
+    self.assertGreaterEqual(0.1, res['loss'])
+
+    predictions = list(regressor.predict(input_fn=predict_input_fn))
+    self.assertAllClose([24.], [pred['scores'] for pred in predictions], atol=1)
+
+  def testAdditionalOutputs(self):
+    """Tests multi-class classification using matrix data as input."""
+    hparams = tensor_forest.ForestHParams(
+        num_trees=1,
+        max_nodes=100,
+        num_classes=3,
+        num_features=4,
+        split_after_samples=20,
+        inference_tree_paths=True)
+    classifier = random_forest.TensorForestEstimator(
+        hparams.fill(), keys_column='keys', include_all_in_serving=True)
+
+    iris = base.load_iris()
+    data = iris.data.astype(np.float32)
+    labels = iris.target.astype(np.int32)
+
+    input_fn = numpy_io.numpy_input_fn(
+        x={
+            'x': data,
+            'keys': np.arange(len(iris.data)).reshape(150, 1)
+        },
+        y=labels,
+        batch_size=10,
+        num_epochs=1,
+        shuffle=False)
+
+    classifier.fit(input_fn=input_fn, steps=100)
+    predictions = list(classifier.predict(input_fn=input_fn))
+    # Check that there is a key column, tree paths and var.
+    for pred in predictions:
+      self.assertTrue('keys' in pred)
+      self.assertTrue('tree_paths' in pred)
+      self.assertTrue('prediction_variance' in pred)
+
+  def _assert_checkpoint(self, model_dir, global_step):
+    reader = checkpoint_utils.load_checkpoint(model_dir)
+    self.assertLessEqual(
+        reader.get_tensor(ops.GraphKeys.GLOBAL_STEP), global_step)
+
+  def testEarlyStopping(self):
+    """Tests multi-class classification using matrix data as input."""
+    hparams = tensor_forest.ForestHParams(
+        num_trees=100,
+        max_nodes=10000,
+        num_classes=3,
+        num_features=4,
+        split_after_samples=20,
+        inference_tree_paths=True)
+    classifier = random_forest.TensorForestEstimator(
+        hparams.fill(),
+        # Set a crazy threshold - 30% loss change.
+        early_stopping_loss_threshold=0.3,
+        early_stopping_rounds=2)
+
+    input_fn, _ = _get_classification_input_fns()
+    classifier.fit(input_fn=input_fn, steps=100)
+
+    # We stopped early.
+    self._assert_checkpoint(classifier.model_dir, global_step=5)
+
+
+class CoreTensorForestTests(test.TestCase):
+
+  def testTrainEvaluateInferDoesNotThrowErrorForClassifier(self):
+    head_fn = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
+        n_classes=3, loss_reduction=losses.Reduction.SUM_OVER_NONZERO_WEIGHTS)
+
+    hparams = tensor_forest.ForestHParams(
+        num_trees=3,
+        max_nodes=1000,
+        num_classes=3,
+        num_features=4,
+        split_after_samples=20,
+        inference_tree_paths=True)
+
+    est = random_forest.CoreTensorForestEstimator(hparams.fill(), head=head_fn)
+
+    input_fn, predict_input_fn = _get_classification_input_fns()
+
+    est.train(input_fn=input_fn, steps=100)
+    res = est.evaluate(input_fn=input_fn, steps=1)
+
+    self.assertEqual(1.0, res['accuracy'])
+    self.assertAllClose(0.55144483, res['loss'])
+
+    predictions = list(est.predict(input_fn=predict_input_fn))
+    self.assertAllClose([[0.576117, 0.211942, 0.211942]],
+                        [pred['probabilities'] for pred in predictions])
+
+  def testRegression(self):
+    """Tests regression using matrix data as input."""
+    head_fn = head_lib._regression_head(
+        label_dimension=1,
+        loss_reduction=losses.Reduction.SUM_OVER_NONZERO_WEIGHTS)
+
+    hparams = tensor_forest.ForestHParams(
+        num_trees=5,
+        max_nodes=1000,
+        num_classes=1,
+        num_features=13,
+        regression=True,
+        split_after_samples=20)
+
+    regressor = random_forest.CoreTensorForestEstimator(
+        hparams.fill(), head=head_fn)
+
+    input_fn, predict_input_fn = _get_regression_input_fns()
+
+    regressor.train(input_fn=input_fn, steps=100)
+    res = regressor.evaluate(input_fn=input_fn, steps=10)
+    self.assertGreaterEqual(0.1, res['loss'])
+
+    predictions = list(regressor.predict(input_fn=predict_input_fn))
+    self.assertAllClose(
+        [[24.]], [pred['predictions'] for pred in predictions], atol=1)
+
+  def testWithFeatureColumns(self):
+    head_fn = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
+        n_classes=3, loss_reduction=losses.Reduction.SUM_OVER_NONZERO_WEIGHTS)
+
+    hparams = tensor_forest.ForestHParams(
+        num_trees=3,
+        max_nodes=1000,
+        num_classes=3,
+        num_features=4,
+        split_after_samples=20,
+        inference_tree_paths=True)
+
+    est = random_forest.CoreTensorForestEstimator(
+        hparams.fill(),
+        head=head_fn,
+        feature_columns=[core_feature_column.numeric_column('x')])
+
+    iris = base.load_iris()
+    data = {'x': iris.data.astype(np.float32)}
+    labels = iris.target.astype(np.int32)
+
+    input_fn = numpy_io.numpy_input_fn(
+        x=data, y=labels, batch_size=150, num_epochs=None, shuffle=False)
+
+    est.train(input_fn=input_fn, steps=100)
+    res = est.evaluate(input_fn=input_fn, steps=1)
+
+    self.assertEqual(1.0, res['accuracy'])
+    self.assertAllClose(0.55144483, res['loss'])
+
+  def testAutofillsClassificationHead(self):
+    hparams = tensor_forest.ForestHParams(
+        num_trees=3,
+        max_nodes=1000,
+        num_classes=3,
+        num_features=4,
+        split_after_samples=20,
+        inference_tree_paths=True)
+
+    est = random_forest.CoreTensorForestEstimator(hparams.fill())
+
+    input_fn, _ = _get_classification_input_fns()
+
+    est.train(input_fn=input_fn, steps=100)
+    res = est.evaluate(input_fn=input_fn, steps=1)
+
+    self.assertEqual(1.0, res['accuracy'])
+    self.assertAllClose(0.55144483, res['loss'])
+
+  def testAutofillsRegressionHead(self):
+    hparams = tensor_forest.ForestHParams(
+        num_trees=5,
+        max_nodes=1000,
+        num_classes=1,
+        num_features=13,
+        regression=True,
+        split_after_samples=20)
+
+    regressor = random_forest.CoreTensorForestEstimator(hparams.fill())
+
+    input_fn, predict_input_fn = _get_regression_input_fns()
+
+    regressor.train(input_fn=input_fn, steps=100)
+    res = regressor.evaluate(input_fn=input_fn, steps=10)
+    self.assertGreaterEqual(0.1, res['loss'])
+
+    predictions = list(regressor.predict(input_fn=predict_input_fn))
+    self.assertAllClose(
+        [[24.]], [pred['predictions'] for pred in predictions], atol=1)
+
+  def testAdditionalOutputs(self):
+    """Tests multi-class classification using matrix data as input."""
+    hparams = tensor_forest.ForestHParams(
+        num_trees=1,
+        max_nodes=100,
+        num_classes=3,
+        num_features=4,
+        split_after_samples=20,
+        inference_tree_paths=True)
+    classifier = random_forest.CoreTensorForestEstimator(
+        hparams.fill(), keys_column='keys', include_all_in_serving=True)
+
+    iris = base.load_iris()
+    data = iris.data.astype(np.float32)
+    labels = iris.target.astype(np.int32)
+
+    input_fn = numpy_io.numpy_input_fn(
+        x={
+            'x': data,
+            'keys': np.arange(len(iris.data)).reshape(150, 1)
+        },
+        y=labels,
+        batch_size=10,
+        num_epochs=1,
+        shuffle=False)
+
+    classifier.train(input_fn=input_fn, steps=100)
+    predictions = list(classifier.predict(input_fn=input_fn))
+    # Check that there is a key column, tree paths and var.
+    for pred in predictions:
+      self.assertTrue('keys' in pred)
+      self.assertTrue('tree_paths' in pred)
+      self.assertTrue('prediction_variance' in pred)
+
+  def _assert_checkpoint(self, model_dir, global_step):
+    reader = checkpoint_utils.load_checkpoint(model_dir)
+    self.assertLessEqual(
+        reader.get_tensor(ops.GraphKeys.GLOBAL_STEP), global_step)
+
+  def testEarlyStopping(self):
+    head_fn = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
+        n_classes=3, loss_reduction=losses.Reduction.SUM_OVER_NONZERO_WEIGHTS)
+
+    hparams = tensor_forest.ForestHParams(
+        num_trees=3,
+        max_nodes=1000,
+        num_classes=3,
+        num_features=4,
+        split_after_samples=20,
+        inference_tree_paths=True)
+
+    est = random_forest.CoreTensorForestEstimator(
+        hparams.fill(),
+        head=head_fn,
+        # Set a crazy threshold - 30% loss change.
+        early_stopping_loss_threshold=0.3,
+        early_stopping_rounds=2)
+
+    input_fn, _ = _get_classification_input_fns()
+    est.train(input_fn=input_fn, steps=100)
+    # We stopped early.
+    self._assert_checkpoint(est.model_dir, global_step=8)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.cc b/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.cc
index 6cb2c88..7716536 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.cc
@@ -54,17 +54,24 @@
   CHECK(safe_strto32(test.feature_id().id().value(), &feature_num_))
       << "Invalid feature ID: [" << test.feature_id().id().value() << "]";
   threshold_ = test.threshold().float_value();
-  include_equals_ =
-      test.type() == decision_trees::InequalityTest::LESS_OR_EQUAL;
+  _test_type = test.type();
 }
 
 int32 InequalityDecisionNodeEvaluator::Decide(
     const std::unique_ptr<TensorDataSet>& dataset, int example) const {
   const float val = dataset->GetExampleValue(example, feature_num_);
-  if (val < threshold_ || (include_equals_ && val == threshold_)) {
-    return left_child_id_;
-  } else {
-    return right_child_id_;
+  switch (_test_type) {
+    case decision_trees::InequalityTest::LESS_OR_EQUAL:
+      return val <= threshold_ ? left_child_id_ : right_child_id_;
+    case decision_trees::InequalityTest::LESS_THAN:
+      return val < threshold_ ? left_child_id_ : right_child_id_;
+    case decision_trees::InequalityTest::GREATER_OR_EQUAL:
+      return val >= threshold_ ? left_child_id_ : right_child_id_;
+    case decision_trees::InequalityTest::GREATER_THAN:
+      return val > threshold_ ? left_child_id_ : right_child_id_;
+    default:
+      LOG(ERROR) << "Unknown split test type: " << _test_type;
+      return -1;
   }
 }
 
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h b/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h
index 3db351c..6497787 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h
@@ -55,9 +55,7 @@
  protected:
   int32 feature_num_;
   float threshold_;
-
-  // If decision is '<=' as opposed to '<'.
-  bool include_equals_;
+  ::tensorflow::decision_trees::InequalityTest_Type _test_type;
 };
 
 // Evaluator for splits with multiple weighted features.
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator_test.cc b/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator_test.cc
index af5cf72..3db1335 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator_test.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator_test.cc
@@ -60,6 +60,40 @@
   ASSERT_EQ(eval->Decide(dataset, 4), 1);
 }
 
+TEST(InequalityDecisionNodeEvaluatorTest, TestGreaterOrEqual) {
+  InequalityTest test;
+  test.mutable_feature_id()->mutable_id()->set_value("0");
+  test.mutable_threshold()->set_float_value(3.0);
+  test.set_type(InequalityTest::GREATER_OR_EQUAL);
+  std::unique_ptr<InequalityDecisionNodeEvaluator> eval(
+      new InequalityDecisionNodeEvaluator(test, 0, 1));
+
+  std::unique_ptr<tensorflow::tensorforest::TensorDataSet> dataset(
+      new tensorflow::tensorforest::TestableDataSet(
+          {0.0, 1.0, 2.0, 3.0, 4.0, 5.0}, 1));
+
+  ASSERT_EQ(eval->Decide(dataset, 2), 1);
+  ASSERT_EQ(eval->Decide(dataset, 3), 0);
+  ASSERT_EQ(eval->Decide(dataset, 4), 0);
+}
+
+TEST(InequalityDecisionNodeEvaluatorTest, TestStrictlyGreater) {
+  InequalityTest test;
+  test.mutable_feature_id()->mutable_id()->set_value("0");
+  test.mutable_threshold()->set_float_value(3.0);
+  test.set_type(InequalityTest::GREATER_THAN);
+  std::unique_ptr<InequalityDecisionNodeEvaluator> eval(
+      new InequalityDecisionNodeEvaluator(test, 0, 1));
+
+  std::unique_ptr<tensorflow::tensorforest::TensorDataSet> dataset(
+      new tensorflow::tensorforest::TestableDataSet(
+          {0.0, 1.0, 2.0, 3.0, 4.0, 5.0}, 1));
+
+  ASSERT_EQ(eval->Decide(dataset, 2), 1);
+  ASSERT_EQ(eval->Decide(dataset, 3), 1);
+  ASSERT_EQ(eval->Decide(dataset, 4), 0);
+}
+
 TEST(MatchingDecisionNodeEvaluatorTest, Basic) {
   MatchingValuesTest test;
   test.mutable_feature_id()->mutable_id()->set_value("0");
diff --git a/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.cu.cc b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.cu.cc
index 2de7973..11335d7 100644
--- a/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.cu.cc
+++ b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.cu.cc
@@ -13,14 +13,15 @@
 limitations under the License.
 ==============================================================================*/
 
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+
 #include "tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.h"
 
 #include <vector>
 
+#define EIGEN_USE_GPU
 #include "tensorflow/core/framework/op_kernel.h"
-
-#if GOOGLE_CUDA
-#if GOOGLE_TENSORRT
 #include "cuda/include/cuda_runtime_api.h"
 #include "tensorflow/core/platform/stream_executor.h"
 
@@ -80,5 +81,5 @@
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA
 #endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/contrib/timeseries/python/timeseries/BUILD b/tensorflow/contrib/timeseries/python/timeseries/BUILD
index 0e96c1f..c230919 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/BUILD
+++ b/tensorflow/contrib/timeseries/python/timeseries/BUILD
@@ -94,7 +94,6 @@
         "//tensorflow/python:training",
         "//tensorflow/python:util",
         "//tensorflow/python/estimator:estimator_py",
-        "//tensorflow/python/estimator:export",
         "//tensorflow/python/feature_column",
     ],
 )
@@ -149,9 +148,6 @@
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/estimator:estimator_py",
-        "//tensorflow/python/estimator:export",
-        "//tensorflow/python/estimator:head",
-        "//tensorflow/python/estimator:metric_keys",
     ],
 )
 
diff --git a/tensorflow/contrib/timeseries/python/timeseries/ar_model_test.py b/tensorflow/contrib/timeseries/python/timeseries/ar_model_test.py
index 63f5d35..5eb4dee 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/ar_model_test.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/ar_model_test.py
@@ -195,7 +195,7 @@
     self.train_helper(input_window_size=10,
                       loss=ar_model.ARModel.NORMAL_LIKELIHOOD_LOSS,
                       train_steps=300,
-                      max_loss=1.5,
+                      max_loss=2.5,
                       anomaly_distribution=None)
 
   def test_autoregression_normal_multiple_periods(self):
diff --git a/tensorflow/contrib/timeseries/python/timeseries/head.py b/tensorflow/contrib/timeseries/python/timeseries/head.py
index d2484d0..32194e4 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/head.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/head.py
@@ -321,6 +321,14 @@
             feature_keys.TrainEvalFeatures.VALUES,
         ]))
 
+  def _evaluate_ops(self, features):
+    """Add ops for evaluation (aka filtering) to the graph."""
+    spec = super(OneShotPredictionHead, self)._evaluate_ops(features)
+    # No state is fed to OneShotPredictionHead, so we don't return it; it being
+    # a tuple can cause issues for downstream infrastructure.
+    del spec.eval_metric_ops[feature_keys.State.STATE_TUPLE]
+    return spec
+
   def _serving_ops(self, features):
     """Add ops for serving to the graph."""
     with variable_scope.variable_scope("model", use_resource=True):
diff --git a/tensorflow/contrib/timeseries/python/timeseries/head_test.py b/tensorflow/contrib/timeseries/python/timeseries/head_test.py
index 857e7c5..bda3b53 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/head_test.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/head_test.py
@@ -397,6 +397,8 @@
         input_pipeline.NumpyReader(train_features), shuffle_seed=2,
         num_threads=1, batch_size=16, window_size=16)
     estimator.train(input_fn=train_input_fn, steps=5)
+    result = estimator.evaluate(input_fn=train_input_fn, steps=1)
+    self.assertNotIn(feature_keys.State.STATE_TUPLE, result)
     input_receiver_fn = estimator.build_raw_serving_input_receiver_fn()
     export_location = estimator.export_savedmodel(_new_temp_dir(),
                                                   input_receiver_fn)
diff --git a/tensorflow/contrib/tpu/BUILD b/tensorflow/contrib/tpu/BUILD
index f5d8529..56e451e 100644
--- a/tensorflow/contrib/tpu/BUILD
+++ b/tensorflow/contrib/tpu/BUILD
@@ -41,7 +41,6 @@
         "python/tpu/tpu_config.py",
         "python/tpu/tpu_context.py",
         "python/tpu/tpu_estimator.py",
-        "python/tpu/tpu_system_metadata.py",
         "python/tpu/util.py",
     ],
     srcs_version = "PY2AND3",
@@ -63,10 +62,7 @@
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
-        "//tensorflow/python/estimator",
-        "//tensorflow/python/estimator:model_fn",
-        "//tensorflow/python/estimator:run_config",
-        "//tensorflow/python/estimator:util",
+        "//tensorflow/python/estimator:estimator_py",
         "@six_archive//:six",
     ],
 )
@@ -196,7 +192,7 @@
         "//tensorflow/python:tensor_spec",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/estimator:model_fn",
+        "//tensorflow/python/estimator:estimator_py",
         "//tensorflow/python/keras:backend",
         "//tensorflow/python/keras:engine",
         "//tensorflow/python/keras:layers",
@@ -217,6 +213,7 @@
         "python/tpu/tpu_function.py",
         "python/tpu/tpu_optimizer.py",
         "python/tpu/tpu_sharding.py",
+        "python/tpu/tpu_system_metadata.py",
         "python/tpu/training_loop.py",
     ],
     srcs_version = "PY2AND3",
@@ -268,7 +265,6 @@
         ":datasets",
     ],
     grpc_enabled = True,
-    tags = ["no_windows"],
 )
 
 tf_py_test(
diff --git a/tensorflow/contrib/tpu/__init__.py b/tensorflow/contrib/tpu/__init__.py
index d0a37eb..537d94b 100644
--- a/tensorflow/contrib/tpu/__init__.py
+++ b/tensorflow/contrib/tpu/__init__.py
@@ -18,6 +18,10 @@
 @@cross_replica_sum
 @@infeed_dequeue
 @@infeed_dequeue_tuple
+@@infeed_enqueue
+@@infeed_enqueue_tuple
+@@outfeed_dequeue
+@@outfeed_dequeue_tuple
 @@outfeed_enqueue
 @@outfeed_enqueue_tuple
 
diff --git a/tensorflow/contrib/tpu/profiler/pip_package/setup.py b/tensorflow/contrib/tpu/profiler/pip_package/setup.py
index 19f088f..d4ccb0f 100644
--- a/tensorflow/contrib/tpu/profiler/pip_package/setup.py
+++ b/tensorflow/contrib/tpu/profiler/pip_package/setup.py
@@ -20,7 +20,7 @@
 
 from setuptools import setup
 
-_VERSION = '1.9.0'
+_VERSION = '1.10.0'
 
 CONSOLE_SCRIPTS = [
     'capture_tpu_profile=cloud_tpu_profiler.main:run_main',
diff --git a/tensorflow/contrib/tpu/profiler/version.h b/tensorflow/contrib/tpu/profiler/version.h
index 1bf4996..aee0941 100644
--- a/tensorflow/contrib/tpu/profiler/version.h
+++ b/tensorflow/contrib/tpu/profiler/version.h
@@ -16,6 +16,6 @@
 #ifndef TENSORFLOW_CONTRIB_TPU_PROFILER_VERSION_H_
 #define TENSORFLOW_CONTRIB_TPU_PROFILER_VERSION_H_
 
-#define TPU_PROFILER_VERSION "1.9.0"
+#define TPU_PROFILER_VERSION "1.10.0"
 
 #endif  // TENSORFLOW_CONTRIB_TPU_PROFILER_VERSION_H_
diff --git a/tensorflow/contrib/tpu/python/tpu/keras_support.py b/tensorflow/contrib/tpu/python/tpu/keras_support.py
index ff893a7..a5e8277 100644
--- a/tensorflow/contrib/tpu/python/tpu/keras_support.py
+++ b/tensorflow/contrib/tpu/python/tpu/keras_support.py
@@ -54,7 +54,7 @@
 
 import numpy as np
 
-from tensorflow.contrib.cluster_resolver.python.training import tpu_cluster_resolver
+from tensorflow.contrib.cluster_resolver.python.training import tpu_cluster_resolver as tpu_cluster_resolver_lib
 from tensorflow.contrib.framework.python.framework import experimental
 from tensorflow.contrib.tpu.proto import compilation_result_pb2 as tpu_compilation_result
 from tensorflow.contrib.tpu.python.ops import tpu_ops
@@ -80,12 +80,54 @@
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import tf_inspect
+
+
+_SESSIONS = {}
+
+
+def tpu_session(cluster_resolver):
+  """Construct or return a `tf.Session` connected to the given cluster."""
+  global _SESSIONS
+  master = cluster_resolver.master()
+  if master not in _SESSIONS:
+    cluster_spec = cluster_resolver.cluster_spec()
+    config = config_pb2.ConfigProto(isolate_session_state=True)
+    if cluster_spec:
+      config.cluster_def.CopyFrom(cluster_spec.as_cluster_def())
+
+    graph = ops.Graph()
+    session = tf_session.Session(graph=graph, target=master, config=config)
+
+    with graph.as_default():
+      session.run(tpu.initialize_system())
+
+    _SESSIONS[master] = session
+  return _SESSIONS[master]
+
+
+def reset_tpu_sessions():
+  _SESSIONS.clear()
 
 
 # Work-around dependency cycle between DistributionStrategy and TPU lib.
-def TPUDistributionStrategy(*args, **kw):  # pylint: disable=invalid-name
+def TPUDistributionStrategy(tpu_cluster_resolver=None):  # pylint: disable=invalid-name
+  """Construct a TPUDistributionStrategy."""
   from tensorflow.contrib.distribute.python import tpu_strategy  # pylint: disable=g-import-not-at-top
-  return tpu_strategy.TPUStrategy(*args, **kw)
+  # TODO -- remove this when TPUStrategy API is consistent (b/112705069)
+  if tpu_cluster_resolver is None:
+    tpu_cluster_resolver = tpu_cluster_resolver_lib.TPUClusterResolver('')
+
+  args, _, _, _ = tf_inspect.getargspec(tpu_strategy.TPUStrategy.__init__)
+  if len(args) == 3:
+    logging.info('Detected new TPUStrategy API.')
+    return tpu_strategy.TPUStrategy(tpu_cluster_resolver, steps_per_run=1)
+  else:
+    logging.info('Detected old TPUStrategy API.')
+    strategy = tpu_strategy.TPUStrategy(num_cores_per_host=8)
+    strategy._tpu_cluster_resolver = tpu_cluster_resolver
+
+  return strategy
 
 
 class TPUEmbedding(embeddings.Embedding):
@@ -666,9 +708,10 @@
 
       # Clone our CPU model, running within the TPU device context.
       with TPURewriteContext(tpu_input_map):
-        # TODO(power): Replicate variables.
-        with ops.device('/device:TPU:0'):
-          self._cloned_model = models.clone_model(self.model)
+        with variable_scope.variable_scope('tpu_model_%s' % id(self.model)):
+          # TODO(power): Replicate variables.
+          with ops.device('/device:TPU:0'):
+            self._cloned_model = models.clone_model(self.model)
 
       # Create a copy of the optimizer for this graph.
       if isinstance(self.model.optimizer, keras_optimizers.TFOptimizer):
@@ -845,7 +888,7 @@
 class KerasTPUModel(models.Model):
   """TPU compatible Keras model wrapper."""
 
-  def __init__(self, cpu_model, tpu_name_or_address, strategy):
+  def __init__(self, cpu_model, strategy):
     super(models.Model, self).__init__(  # pylint: disable=bad-super-call
         inputs=cpu_model.inputs,
         outputs=cpu_model.outputs,
@@ -862,27 +905,14 @@
     self.train_function = None
     self._strategy = strategy
 
-    self._tpu_name_or_address = tpu_name_or_address
+    cluster_resolver = self._strategy._tpu_cluster_resolver
+    self._tpu_name_or_address = cluster_resolver.get_master()
     self._cpu_model = cpu_model
     self._tpu_model = None
     self._tpu_weights_initialized = False
-    self._graph = ops.Graph()
 
-    self._cluster_resolver = tpu_cluster_resolver.TPUClusterResolver(
-        tpu_name_or_address)
-    master = self._cluster_resolver.master()
-    cluster_spec = self._cluster_resolver.cluster_spec()
-    self._session = tf_session.Session(
-        graph=self._graph,
-        target=master,
-        config=config_pb2.ConfigProto(isolate_session_state=True))
-
-    # TODO(saeta): Confirm the lines below work in ClusterSpec propagation env.
-    if cluster_spec:
-      self._session.cluster_def.CopyFrom(cluster_spec.as_cluster_def())
-
-    with self._graph.as_default():
-      self._session.run(tpu.initialize_system())
+    self._session = tpu_session(cluster_resolver)
+    self._graph = self._session.graph
 
     # If the input CPU model has already been compiled, compile our TPU model
     # immediately.
@@ -1137,7 +1167,7 @@
 
 
 @experimental
-def tpu_model(model, tpu_name_or_address=None, strategy=None):
+def tpu_model(model, strategy=None):
   """Copy `model` along with weights to the TPU.  Returns a TPU model.
 
   Usage:
@@ -1148,7 +1178,7 @@
 
   # If `num_cores_per_host` is greater than one, batch parallelism will be used
   # to run on multiple TPU cores.
-  strategy = keras_support.TPUDistributionStrategy(num_cores_per_host=8)
+  strategy = keras_support.TPUDistributionStrategy(tpu_cluster_resolver)
   model = keras_support.tpu_model(model, strategy)
   model.compile(
       optimizer=tf.train.GradientDescentOptimizer(learning_rate=1.0),
@@ -1158,10 +1188,6 @@
 
   Args:
     model: A `KerasTPUModel`.
-    tpu_name_or_address: A string that is either the name of the Cloud TPU,
-      the grpc address of the Cloud TPU, or (Googlers only) the BNS name of the
-      Cloud TPU. If tpu_name_or_address is None, the TPUClusterResolver will
-      examine the environment to determine a potential Cloud TPU to use.
     strategy: `TPUDistributionStrategy`.  The strategy to use for replicating
               model across multiple TPU cores.
 
@@ -1176,9 +1202,8 @@
   # TODO(xiejw): Validate TPU model. TPUModel only?
   # TODO(xiejw): Validate replicas. Full or 1. Shall we allow subset?
   # TODO(xiejw): Adds reduction option.
+
   if strategy is None:
-    strategy = TPUDistributionStrategy(num_cores_per_host=1)
-  return KerasTPUModel(
-      cpu_model=model,
-      tpu_name_or_address=tpu_name_or_address,
-      strategy=strategy)
+    strategy = TPUDistributionStrategy()
+
+  return KerasTPUModel(cpu_model=model, strategy=strategy)
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu.py b/tensorflow/contrib/tpu/python/tpu/tpu.py
index 7994c2c..7fa06d6 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu.py
@@ -1015,6 +1015,19 @@
 ])
 
 
+def under_tpu_inference_context():
+  """Check if it is currently under `tpu.rewrite_for_inference()`."""
+  graph = ops.get_default_graph()
+
+  context = graph._get_control_flow_context()  # pylint: disable=protected-access
+  while context:
+    if isinstance(context, _TPUInferenceContext):
+      return True
+    context = context.outer_context
+
+  return False
+
+
 class _TPUInferenceContext(control_flow_ops.XLAControlFlowContext):
   """A `ControlFlowContext` for nodes inside a TPU inference computation.
 
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_context.py b/tensorflow/contrib/tpu/python/tpu/tpu_context.py
index 2c05436..806ae1c 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_context.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_context.py
@@ -232,11 +232,16 @@
     if tpu_system_metadata is not None:
       return tpu_system_metadata
 
+    cluster_def = None
+    if (self._config.session_config and
+        self._config.session_config.cluster_def.job):
+      cluster_def = self._config.session_config.cluster_def
+
     # pylint: disable=protected-access
     tpu_system_metadata = (
         tpu_system_metadata_lib._query_tpu_system_metadata(
             master,
-            run_config=self._config,
+            cluster_def=cluster_def,
             query_topology=self.model_parallelism_enabled))
 
     self._lazy_tpu_system_metadata_dict[master] = tpu_system_metadata
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index c104b24..f221155 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -45,6 +45,7 @@
 from tensorflow.core.framework.summary_pb2 import Summary
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import nest as data_nest
 from tensorflow.python.estimator import estimator as estimator_lib
 from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.estimator import util as estimator_util
@@ -204,6 +205,12 @@
       use_locking=True)
 
 
+def _extract_key_names(tensor_or_dict):
+  if isinstance(tensor_or_dict, dict):
+    return sorted(tensor_or_dict.keys())
+  return []
+
+
 class _SIGNAL(object):
   """Signal used to control the thread of infeed/outfeed.
 
@@ -711,8 +718,7 @@
       features, labels = inputs.features_and_labels()
       signals = inputs.signals()
 
-      inputs_structure_recorder.validate_and_record_structure(
-          features, labels, signals)
+      inputs_structure_recorder.validate_and_record_structure(features, labels)
       unsharded_tensor_list = (
           inputs_structure_recorder.flatten_features_and_labels(
               features, labels, signals))
@@ -859,7 +865,7 @@
             signals = inputs.signals()
 
             inputs_structure_recorder.validate_and_record_structure(
-                features, labels, signals)
+                features, labels)
             flattened_inputs = (
                 inputs_structure_recorder.flatten_features_and_labels(
                     features, labels, signals))
@@ -901,17 +907,19 @@
   inputs returned by the `input_fn` can have one of the following forms:
   1. features
   2. (features, labels)
+  3. ((arbitrarily nested structure of features), labels)
 
   Internally, form 1 is reformed to `(features, None)` as features and labels
   are passed separately to underlying methods. For TPU training, TPUEstimator
   may expect multiple `features` and `labels` tuples one for each core.
 
   TPUEstimator allows various different structures for inputs (namely `features`
-  and `labels`).  `features` can be `Tensor` or dict of string name to `Tensor`,
-  and `labels` could be `None`, `Tensor`, or dict of string name to `Tensor`.
-  TPU infeed/outfeed library expects flattened tensor list. So, `features` and
-  `labels` need to be flattened, before infeed enqueue, and the structure of
-  them needs to be recorded, in order to restore them after infeed dequeue.
+  and `labels`).  `features` can be `Tensor`, dict of string name to `Tensor`,
+  or nested tuples and `labels` could be `None`, `Tensor`, or dict of string
+  name to `Tensor`. TPU infeed/outfeed library expects flattened tensor list.
+  So, `features` and `labels` need to be flattened, before infeed enqueue, and
+  the structure of them needs to be recorded, in order to restore them after
+  infeed dequeue.
   """
 
   class InputsStructureRecorder(object):
@@ -919,10 +927,7 @@
 
     def __init__(self, input_partition_dims=None):
       # Holds the structure of inputs
-      self._feature_names = []
-      self._label_names = []
-      self._has_labels = False
-      self._signals_helper = None
+      self._feature_structure = {}
       self._flattened_input_dims = None
 
       if input_partition_dims:
@@ -949,7 +954,7 @@
       return self._flattened_input_dims
 
     def has_labels(self):
-      return self._has_labels
+      return 'labels' in self._feature_structure
 
     def _flatten_input_dims(self, feature_dims, feature_dims_names, label_dims,
                             label_dims_names, label_names, has_labels):
@@ -977,35 +982,16 @@
 
       return flattened_input_dims
 
-    def validate_and_record_structure(self, features, labels, signals=None):
+    def validate_and_record_structure(self, features, labels):
       """Validates and records the structure of `features` and `labels`."""
-
-      def _extract_key_names(tensor_or_dict):
-        if tensor_or_dict is None:
-          return []
-        return sorted(tensor_or_dict.keys()) if isinstance(
-            tensor_or_dict, dict) else []
-
       # Extract structure.
       has_labels = labels is not None
       feature_names = _extract_key_names(features)
       label_names = _extract_key_names(labels)
 
-      if signals is not None and self._signals_helper is None:
-        # Record signals helper.
-        self._signals_helper = _SignalsHelper(signals)
-
-      if self._initialized:
-        # Verify the structure is same. The following should never happen.
-        assert feature_names == self._feature_names, 'feature keys mismatched'
-        assert label_names == self._label_names, 'label keys mismatched'
-        assert has_labels == self._has_labels, 'label presence mismatched'
-      else:
+      if not self._initialized:
         # Record structure.
         self._initialized = True
-        self._feature_names = feature_names
-        self._label_names = label_names
-        self._has_labels = has_labels
         if self._feature_dims is not None:
           feature_dims_names = _extract_key_names(self._feature_dims)
           if feature_dims_names != feature_names:
@@ -1027,24 +1013,12 @@
 
     def flatten_features_and_labels(self, features, labels, signals=None):
       """Flattens the `features` and `labels` to a single tensor list."""
-      flattened_inputs = []
-      if self._feature_names:
-        # We need a fixed ordering for enqueueing and dequeueing.
-        flattened_inputs.extend(
-            [features[name] for name in self._feature_names])
-      else:
-        flattened_inputs.append(features)
-
+      self._feature_structure['features'] = features
       if labels is not None:
-        if self._label_names:
-          # We need a fixed ordering for enqueueing and dequeueing.
-          flattened_inputs.extend([labels[name] for name in self._label_names])
-        else:
-          flattened_inputs.append(labels)
-
+        self._feature_structure['labels'] = labels
       if signals is not None:
-        flattened_inputs.extend(_SignalsHelper.as_tensor_list(signals))
-      return flattened_inputs
+        self._feature_structure['signals'] = signals
+      return data_nest.flatten(self._feature_structure)
 
     def unflatten_features_and_labels(self, flattened_inputs):
       """Restores the flattened inputs to original features and labels form.
@@ -1061,49 +1035,13 @@
         ValueError: If the number of expected tensors from `flattened_inputs`
           mismatches the recorded structure.
       """
-      expected_num_features = (
-          len(self._feature_names) if self._feature_names else 1)
-      if self._has_labels:
-        expected_num_labels = (
-            len(self._label_names) if self._label_names else 1)
-      else:
-        expected_num_labels = 0
 
-      expected_num_signals = (
-          self._signals_helper.num_signals if self._signals_helper else 0)
-
-      expected_num_tensors = (
-          expected_num_features + expected_num_labels + expected_num_signals)
-
-      if expected_num_tensors != len(flattened_inputs):
-        raise ValueError(
-            'The number of flattened tensors mismatches expected num. '
-            'Expected {}, got {}'.format(expected_num_tensors,
-                                         len(flattened_inputs)))
-      if self._feature_names:
-        unflattened_features = dict(
-            zip(self._feature_names, flattened_inputs[:expected_num_features]))
-      else:
-        # Single tensor case
-        unflattened_features = flattened_inputs[0]
-
-      if expected_num_labels == 0:
-        unflattened_label = None
-      elif self._label_names:
-        label_list = flattened_inputs[
-            expected_num_features:expected_num_features + expected_num_labels]
-        unflattened_label = dict(zip(self._label_names, label_list))
-      else:
-        # Single tensor case.
-        unflattened_label = flattened_inputs[expected_num_features]
-
-      signals = None
-      if expected_num_signals != 0:
-        tensor_list_for_signals = flattened_inputs[
-            expected_num_features + expected_num_labels:]
-        signals = self._signals_helper.unflatten(tensor_list_for_signals)
-
-      return _Inputs(unflattened_features, unflattened_label, signals=signals)
+      unflattened_inputs = data_nest.pack_sequence_as(self._feature_structure,
+                                                      flattened_inputs)
+      return _Inputs(
+          unflattened_inputs['features'],
+          unflattened_inputs.get('labels'),
+          signals=unflattened_inputs.get('signals'))
 
   def __init__(self, input_fn, batch_axis, ctx):
     """Constructor.
@@ -1505,12 +1443,14 @@
               'The {} to the model returned by input_fn must have static shape.'
               ' Tensor: {}'.format(obj_name, obj))
       else:
-        for (key, tensor) in obj.items():
-          if not tensor.get_shape().is_fully_defined():
-            raise ValueError(
-                'The {} to the model returned by input_fn must have static '
-                'shape. Key: \'{}\', Tensor: {}'.format(
-                    obj_name, key, tensor))
+        for (key, value) in obj.items():
+          flattened_tensors = data_nest.flatten(value)
+          for tensor in flattened_tensors:
+            if not tensor.get_shape().is_fully_defined():
+              raise ValueError(
+                  'The {} to the model returned by input_fn must have static '
+                  'shape. Key: \'{}\', Tensor: {}'.format(
+                      obj_name, key, tensor))
 
     validate(features, 'features')
     if labels is not None:
@@ -3338,26 +3278,6 @@
     return padding_mask
 
 
-class _SignalsHelper(object):
-  """A general helper class to handle common signals manipulation."""
-
-  def __init__(self, signals):
-    self._signal_keys = []
-    for key in sorted(iter(signals.keys())):
-      self._signal_keys.append(key)
-
-  @property
-  def num_signals(self):
-    return len(self._signal_keys)
-
-  def unflatten(self, tensor_list):
-    return dict(zip(self._signal_keys, tensor_list))
-
-  @staticmethod
-  def as_tensor_list(signals):
-    return [signals[key] for key in sorted(iter(signals.keys()))]
-
-
 def _verify_cross_hosts_transfer_size(tensor_dict, message):
   total_size = 0
   tensor_structure = {}
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py b/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py
index 894f21d..ec682e5 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py
@@ -45,7 +45,7 @@
 ])
 
 
-def _query_tpu_system_metadata(master_address, run_config,
+def _query_tpu_system_metadata(master_address, cluster_def=None,
                                query_topology=False):
   """Automatically detects the TPU system metadata in the system."""
   tpu_core_count = 0
@@ -61,7 +61,8 @@
         with session_lib.Session(
             master_address,
             config=get_session_config_with_timeout(
-                _PINGING_MASTER_TIMEOUT_IN_MS, run_config)) as sess:
+                _PINGING_MASTER_TIMEOUT_IN_MS,
+                cluster_def)) as sess:
           devices = sess.list_devices()
           for device in devices:
             match = _TPU_DEVICE_REG.match(device.name)
@@ -105,7 +106,7 @@
           'TPU worker has some problems. Available devices: {}'.format(
               master_address, devices))
 
-    topology = _obtain_topology(master_address, run_config)
+    topology = _obtain_topology(master_address, cluster_def)
 
   metadata = _TPUSystemMetadata(
       num_cores=tpu_core_count,
@@ -127,14 +128,15 @@
   return metadata
 
 
-def _obtain_topology(master_address, run_config):
+def _obtain_topology(master_address, cluster_def):
+  """Obtains TPU fabric topology."""
   try:
     logging.info('Initializing TPU system (master: %s) to fetch topology '
                  'for model parallelism. This might take a while.',
                  master_address)
     with ops.Graph().as_default():
       session_config = get_session_config_with_timeout(
-          _INITIAL_TPU_SYSTEM_TIMEOUT_IN_MS, run_config)
+          _INITIAL_TPU_SYSTEM_TIMEOUT_IN_MS, cluster_def)
       with session_lib.Session(
           master_address, config=session_config) as sess:
         topology = sess.run(tpu.initialize_system())
@@ -146,11 +148,8 @@
             master_address))
 
 
-def get_session_config_with_timeout(timeout_in_secs, run_config):
-  cluster_def = None
-  if run_config.session_config and run_config.session_config.cluster_def.job:
-    cluster_def = run_config.session_config.cluster_def
-
+def get_session_config_with_timeout(timeout_in_secs, cluster_def):
+  """Returns a session given a timeout and a cluster configuration."""
   config = config_pb2.ConfigProto(
       operation_timeout_in_ms=timeout_in_secs, cluster_def=cluster_def)
   return config
diff --git a/tensorflow/contrib/training/BUILD b/tensorflow/contrib/training/BUILD
index 76927e6..ddf8365 100644
--- a/tensorflow/contrib/training/BUILD
+++ b/tensorflow/contrib/training/BUILD
@@ -61,7 +61,7 @@
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
         "//tensorflow/python/data",
-        "//tensorflow/python/estimator:inputs_queues",
+        "//tensorflow/python/estimator:estimator_py",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -133,7 +133,7 @@
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:session",
         "//tensorflow/python:training",
-        "//tensorflow/python/estimator:inputs_queues",
+        "//tensorflow/python/estimator:estimator_py",
         "//third_party/py/numpy",
     ],
 )
diff --git a/tensorflow/contrib/training/__init__.py b/tensorflow/contrib/training/__init__.py
index edd71fb..3547e71 100644
--- a/tensorflow/contrib/training/__init__.py
+++ b/tensorflow/contrib/training/__init__.py
@@ -14,7 +14,9 @@
 # ==============================================================================
 """Training and input utilities.
 
-See @{$python/contrib.training} guide.
+See
+[Contrib Training](https://tensorflow.org/api_guides/python/contrib.training)
+guide.
 
 @@batch_sequences_with_states
 @@NextQueuedSequenceBatch
diff --git a/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py b/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py
index 39d75a0..53e4f23 100644
--- a/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py
+++ b/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py
@@ -988,14 +988,14 @@
     assert isinstance(sequences, dict)
     assert isinstance(context, dict)
     assert isinstance(states, dict)
-    self._name_to_index = dict(
-        (name, ix)
+    self._name_to_index = {
+        name: ix
         for (ix, name) in enumerate([
             "__length", "__total_length", "__next_key", "__sequence",
             "__sequence_count"
         ] + ["__sequence__%s" % k for k in sequences.keys()] + [
             "__context__%s" % k for k in context.keys()
-        ] + ["__state__%s" % k for k in states.keys()]))
+        ] + ["__state__%s" % k for k in states.keys()])}
     self._index_to_name = [
         name
         for (name, _) in sorted(
diff --git a/tensorflow/contrib/training/python/training/tensor_queue_dataset.py b/tensorflow/contrib/training/python/training/tensor_queue_dataset.py
index a244493..f46d032 100644
--- a/tensorflow/contrib/training/python/training/tensor_queue_dataset.py
+++ b/tensorflow/contrib/training/python/training/tensor_queue_dataset.py
@@ -156,7 +156,7 @@
 
   Returns:
     A `Dataset` transformation function, which can be passed to
-    @{tf.data.Dataset.apply}.
+    `tf.data.Dataset.apply`.
   """
 
   def _apply_fn(dataset):
diff --git a/tensorflow/contrib/training/python/training/training.py b/tensorflow/contrib/training/python/training/training.py
index f72e0a3..c272a2a 100644
--- a/tensorflow/contrib/training/python/training/training.py
+++ b/tensorflow/contrib/training/python/training/training.py
@@ -484,7 +484,8 @@
           save_checkpoint_secs=600,
           save_summaries_steps=100,
           config=None,
-          max_wait_secs=7200):
+          max_wait_secs=7200,
+          run_metadata=None):
   """Runs the training loop.
 
   Args:
@@ -511,6 +512,7 @@
       become available. This should be kept relatively short to help detect
       incorrect code, but sometimes may need to be increased if the chief takes
       a while to start up.
+    run_metadata: A [`RunMetadata`] protocol buffer.
 
   Returns:
     the value of the loss function after training.
@@ -541,5 +543,5 @@
       max_wait_secs=max_wait_secs) as session:
     loss = None
     while not session.should_stop():
-      loss = session.run(train_op)
+      loss = session.run(train_op, run_metadata=run_metadata)
   return loss
diff --git a/tensorflow/contrib/util/__init__.py b/tensorflow/contrib/util/__init__.py
index 08741cf..338acef 100644
--- a/tensorflow/contrib/util/__init__.py
+++ b/tensorflow/contrib/util/__init__.py
@@ -15,7 +15,7 @@
 
 """Utilities for dealing with Tensors.
 
-See @{$python/contrib.util} guide.
+See [Contrib Util](https://tensorflow.org/api_guides/python/contrib.util) guide.
 
 @@constant_value
 @@make_tensor_proto
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 1423c7f..64430a1 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -121,6 +121,7 @@
     "tf_additional_minimal_lib_srcs",
     "tf_additional_mpi_lib_defines",
     "tf_additional_proto_hdrs",
+    "tf_additional_proto_compiler_hdrs",
     "tf_additional_proto_srcs",
     "tf_additional_test_deps",
     "tf_additional_test_srcs",
@@ -128,6 +129,7 @@
     "tf_jspb_proto_library",
     "tf_kernel_tests_linkstatic",
     "tf_lib_proto_parsing_deps",
+    "tf_lib_proto_compiler_deps",
     "tf_nano_proto_library",
     "tf_platform_hdrs",
     "tf_platform_srcs",
@@ -149,6 +151,7 @@
 load(
     "//third_party/mkl:build_defs.bzl",
     "if_mkl",
+    "mkl_deps",
 )
 
 exports_files(["ops/ops.pbtxt"])
@@ -612,6 +615,17 @@
     ],
 )
 
+cc_library(
+    name = "lib_proto_compiler",
+    hdrs = [
+        "platform/protobuf_compiler.h",
+    ] + tf_additional_proto_compiler_hdrs(),
+    copts = tf_copts(),
+    deps = tf_lib_proto_compiler_deps() + [
+        ":lib_proto_parsing",
+    ],
+)
+
 # This build rule (along with :lib_internal, :framework, and
 # :framework_internal) purposefully omits the definitions of many declared
 # symbols, which are included in //tensorflow:libtensorflow_framework.so. Using
@@ -735,7 +749,10 @@
         "util/reporter.h",
     ],
     copts = tf_copts(),
-    linkopts = ["-lm"],
+    linkopts = select({
+        "//tensorflow:windows": [],
+        "//conditions:default": ["-lm"],
+    }),
     visibility = ["//visibility:public"],
     deps = [
         ":lib",
@@ -860,7 +877,6 @@
         "util/work_sharder.h",
     ] + select({
         "//tensorflow:windows": [],
-        "//tensorflow:windows_msvc": [],
         "//conditions:default": [
             "util/memmapped_file_system.h",
             "util/memmapped_file_system_writer.h",
@@ -2036,7 +2052,7 @@
     linkopts = select({
         "//tensorflow:freebsd": [],
         "//tensorflow:windows": [],
-        "//tensorflow:windows_msvc": [],
+        "//tensorflow:android": [],
         "//conditions:default": [
             "-ldl",
             "-lpthread",
@@ -2125,7 +2141,6 @@
     linkopts = select({
         "//tensorflow:freebsd": [],
         "//tensorflow:windows": [],
-        "//tensorflow:windows_msvc": [],
         "//conditions:default": ["-ldl"],
     }),
     deps = [
@@ -2150,7 +2165,6 @@
     linkopts = select({
         "//tensorflow:freebsd": [],
         "//tensorflow:windows": [],
-        "//tensorflow:windows_msvc": [],
         "//conditions:default": ["-ldl"],
     }),
     deps = [
@@ -2182,7 +2196,6 @@
     linkopts = select({
         "//tensorflow:freebsd": [],
         "//tensorflow:windows": [],
-        "//tensorflow:windows_msvc": [],
         "//conditions:default": ["-ldl"],
     }),
     deps = [
@@ -2486,7 +2499,6 @@
         ],
     ) + select({
         "//tensorflow:windows": [],
-        "//tensorflow:windows_msvc": [],
         "//conditions:default": [
             "util/memmapped_file_system.cc",
             "util/memmapped_file_system_writer.cc",
@@ -2495,13 +2507,13 @@
     hdrs = FRAMEWORK_INTERNAL_PUBLIC_HEADERS,
     copts = tf_copts(),
     linkopts = select({
-        "//tensorflow:freebsd": [],
+        "//tensorflow:freebsd": ["-lm"],
         "//tensorflow:windows": [],
-        "//tensorflow:windows_msvc": [],
-        "//conditions:default": ["-ldl"],
-    }) + [
-        "-lm",
-    ],
+        "//conditions:default": [
+            "-ldl",
+            "-lm",
+        ],
+    }),
     deps = [
         ":lib",
         ":lib_internal",
@@ -2516,12 +2528,7 @@
     ] + if_static(
         extra_deps = ["@protobuf_archive//:protobuf"],
         otherwise = ["@protobuf_archive//:protobuf_headers"],
-    ) + if_mkl(
-        [
-            "//third_party/mkl:intel_binary_blob",
-            "@mkl_dnn",
-        ],
-    ),
+    ) + mkl_deps(),
     alwayslink = 1,
 )
 
@@ -2802,12 +2809,7 @@
         ":protos_all_cc",
         "//third_party/eigen3",
         "//tensorflow/core/grappler:grappler_item",
-    ] + if_mkl(
-        [
-            "//third_party/mkl:intel_binary_blob",
-            "@mkl_dnn",
-        ],
-    ),
+    ] + mkl_deps(),
     alwayslink = 1,
 )
 
@@ -2847,12 +2849,7 @@
         "//tensorflow/core/grappler/optimizers:meta_optimizer",
         "//third_party/eigen3",
         "//tensorflow/core/kernels:required",
-    ] + if_mkl(
-        [
-            "//third_party/mkl:intel_binary_blob",
-            "@mkl_dnn",
-        ],
-    ) + tf_additional_core_deps() + if_static([":core_cpu_impl"]),
+    ] + mkl_deps() + tf_additional_core_deps() + if_static([":core_cpu_impl"]),
     alwayslink = 1,
 )
 
@@ -3145,7 +3142,10 @@
     testonly = 1,
     srcs = ["platform/test_main.cc"],
     copts = tf_copts(),
-    linkopts = ["-lm"],
+    linkopts = select({
+        "//tensorflow:windows": [],
+        "//conditions:default": ["-lm"],
+    }),
     visibility = ["//tensorflow:internal"],
     deps = [
         ":lib",
@@ -3856,11 +3856,7 @@
         ":test",
         ":test_main",
         "//third_party/eigen3",
-    ] + if_mkl(
-        [
-            "//third_party/mkl:intel_binary_blob",
-        ],
-    ),
+    ] + mkl_deps(),
 )
 
 tf_cc_test_gpu(
@@ -4581,6 +4577,8 @@
         # PNG data
         "lib/png/testdata/lena_gray.png",
         "lib/png/testdata/lena_rgba.png",
+        "lib/png/testdata/lena_palette.png",
+        "lib/png/testdata/lena_palette_trns.png",
         # JPEG data
         "lib/jpeg/testdata/jpeg_merge_test1.jpg",
         "lib/jpeg/testdata/jpeg_merge_test1_cmyk.jpg",
diff --git a/tensorflow/core/api_def/api_test.cc b/tensorflow/core/api_def/api_test.cc
index ae03a61..51812ca 100644
--- a/tensorflow/core/api_def/api_test.cc
+++ b/tensorflow/core/api_def/api_test.cc
@@ -59,8 +59,8 @@
     file_contents = PBTxtFromMultiline(file_contents);
 
     ApiDefs api_defs;
-    CHECK(tensorflow::protobuf::TextFormat::ParseFromString(file_contents,
-                                                            &api_defs))
+    QCHECK(tensorflow::protobuf::TextFormat::ParseFromString(file_contents,
+                                                             &api_defs))
         << "Failed to load " << file_path;
     CHECK_EQ(api_defs.op_size(), 1);
     (*name_to_api_def)[api_defs.op(0).graph_op_name()] = api_defs.op(0);
diff --git a/tensorflow/core/api_def/base_api/api_def_Fill.pbtxt b/tensorflow/core/api_def/base_api/api_def_Fill.pbtxt
index 58262a3..37d1a9d 100644
--- a/tensorflow/core/api_def/base_api/api_def_Fill.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_Fill.pbtxt
@@ -27,5 +27,15 @@
 fill([2, 3], 9) ==> [[9, 9, 9]
                      [9, 9, 9]]
 ```
+
+`tf.fill` differs from `tf.constant` in a few ways:
+
+*   `tf.fill` only supports scalar contents, whereas `tf.constant` supports
+    Tensor values.
+*   `tf.fill` creates an Op in the computation graph that constructs the actual
+    Tensor value at runtime. This is in contrast to `tf.constant` which embeds
+    the entire Tensor into the graph with a `Const` node.
+*   Because `tf.fill` evaluates at graph runtime, it supports dynamic shapes
+    based on other runtime Tensors, unlike `tf.constant`.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_GatherNd.pbtxt b/tensorflow/core/api_def/base_api/api_def_GatherNd.pbtxt
index 342a1f6..9f3f9b2 100644
--- a/tensorflow/core/api_def/base_api/api_def_GatherNd.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_GatherNd.pbtxt
@@ -27,7 +27,7 @@
 
     output[\\(i_0, ..., i_{K-2}\\)] = params[indices[\\(i_0, ..., i_{K-2}\\)]]
 
-Whereas in @{tf.gather} `indices` defines slices into the first
+Whereas in `tf.gather` `indices` defines slices into the first
 dimension of `params`, in `tf.gather_nd`, `indices` defines slices into the
 first `N` dimensions of `params`, where `N = indices.shape[-1]`.
 
@@ -123,5 +123,7 @@
               [['a1', 'b1'], ['c1', 'd1']]]
     output = [['b0', 'b1'], ['d0', 'c1']]
 ```
+
+See also `tf.gather` and `tf.batch_gather`.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_GatherV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_GatherV2.pbtxt
index 162ef2b..c6104da 100644
--- a/tensorflow/core/api_def/base_api/api_def_GatherV2.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_GatherV2.pbtxt
@@ -54,5 +54,7 @@
 Note that on CPU, if an out of bound index is found, an error is returned.
 On GPU, if an out of bound index is found, a 0 is stored in the
 corresponding output value.
+
+See also `tf.batch_gather` and `tf.gather_nd`.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_HostConst.pbtxt b/tensorflow/core/api_def/base_api/api_def_HostConst.pbtxt
new file mode 100644
index 0000000..9d04a01
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_HostConst.pbtxt
@@ -0,0 +1,11 @@
+op {
+  graph_op_name: "HostConst"
+  attr {
+    name: "value"
+    description: <<END
+Attr `value` is the tensor to return.
+END
+  }
+  visibility: SKIP
+  summary: "Returns a constant tensor on the host. Only for writing C++ tests."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Igamma.pbtxt b/tensorflow/core/api_def/base_api/api_def_Igamma.pbtxt
index e7bc5dd..40d7d37 100644
--- a/tensorflow/core/api_def/base_api/api_def_Igamma.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_Igamma.pbtxt
@@ -1,6 +1,6 @@
 op {
   graph_op_name: "Igamma"
-  summary: "Compute the lower regularized incomplete Gamma function `Q(a, x)`."
+  summary: "Compute the lower regularized incomplete Gamma function `P(a, x)`."
   description: <<END
 The lower regularized incomplete Gamma function is defined as:
 
diff --git a/tensorflow/core/api_def/base_api/api_def_MapDefun.pbtxt b/tensorflow/core/api_def/base_api/api_def_MapDefun.pbtxt
new file mode 100644
index 0000000..4433693
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MapDefun.pbtxt
@@ -0,0 +1,34 @@
+op {
+  graph_op_name: "MapDefun"
+  visibility: HIDDEN
+  in_arg {
+    name: "arguments"
+    description: <<END
+    A list of tensors whose types are Targuments, corresponding to the inputs the
+    function should be mapped over.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+    A list of output tensors whose types are output_types and whose dimensions 0
+    are the same as the dimensions 0 of the tensors in arguments, and whose
+    remaining dimensions correspond to those in output_shapes.
+END
+  }
+  attr {
+    name: "Targuments"
+    description: "A list of types."
+  }
+  attr {
+    name: "output_types"
+    description: "A list of types."
+  }
+  attr {
+    name: "output_shapes"
+    description: "A list of shapes."
+  }
+  summary: <<END
+  Maps a function on the list of tensors unpacked from inputs on dimension 0.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceScatterNdAdd.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceScatterNdAdd.pbtxt
index 2b58969..d9c4d5a 100644
--- a/tensorflow/core/api_def/base_api/api_def_ResourceScatterNdAdd.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceScatterNdAdd.pbtxt
@@ -63,7 +63,7 @@
 
     [1, 12, 3, 14, 14, 6, 7, 20]
 
-See @{tf.scatter_nd} for more details about how to make updates to
+See `tf.scatter_nd` for more details about how to make updates to
 slices.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceScatterNdUpdate.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceScatterNdUpdate.pbtxt
index 17b79ee..d724cfc 100644
--- a/tensorflow/core/api_def/base_api/api_def_ResourceScatterNdUpdate.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceScatterNdUpdate.pbtxt
@@ -63,7 +63,7 @@
 
     [1, 11, 3, 10, 9, 6, 7, 12]
 
-See @{tf.scatter_nd} for more details about how to make updates to
+See `tf.scatter_nd` for more details about how to make updates to
 slices.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt
index ad1c527..0b5917d 100644
--- a/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt
@@ -30,7 +30,7 @@
 Creates a new tensor by applying sparse `updates` to individual values or
 slices within a tensor (initially zero for numeric, empty for string) of
 the given `shape` according to indices.  This operator is the inverse of the
-@{tf.gather_nd} operator which extracts values or slices from a given tensor.
+`tf.gather_nd` operator which extracts values or slices from a given tensor.
 
 If `indices` contains duplicates, then their updates are accumulated (summed).
 
diff --git a/tensorflow/core/api_def/base_api/api_def_ScatterNdAdd.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScatterNdAdd.pbtxt
index a9a7646..5929425 100644
--- a/tensorflow/core/api_def/base_api/api_def_ScatterNdAdd.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ScatterNdAdd.pbtxt
@@ -66,7 +66,7 @@
 
     [1, 13, 3, 14, 14, 6, 7, 20]
 
-See @{tf.scatter_nd} for more details about how to make updates to
+See `tf.scatter_nd` for more details about how to make updates to
 slices.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_ScatterNdNonAliasingAdd.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScatterNdNonAliasingAdd.pbtxt
index 35116e5..fa15538 100644
--- a/tensorflow/core/api_def/base_api/api_def_ScatterNdNonAliasingAdd.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ScatterNdNonAliasingAdd.pbtxt
@@ -61,6 +61,6 @@
 
     [1, 13, 3, 14, 14, 6, 7, 20]
 
-See @{tf.scatter_nd} for more details about how to make updates to slices.
+See `tf.scatter_nd` for more details about how to make updates to slices.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_ScatterNdSub.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScatterNdSub.pbtxt
index 99e5c49..67346f0 100644
--- a/tensorflow/core/api_def/base_api/api_def_ScatterNdSub.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ScatterNdSub.pbtxt
@@ -66,7 +66,7 @@
 
     [1, -9, 3, -6, -4, 6, 7, -4]
 
-See @{tf.scatter_nd} for more details about how to make updates to
+See `tf.scatter_nd` for more details about how to make updates to
 slices.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_ScatterNdUpdate.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScatterNdUpdate.pbtxt
index cb57c17..1a75e67 100644
--- a/tensorflow/core/api_def/base_api/api_def_ScatterNdUpdate.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ScatterNdUpdate.pbtxt
@@ -68,7 +68,7 @@
 
     [1, 11, 3, 10, 9, 6, 7, 12]
 
-See @{tf.scatter_nd} for more details about how to make updates to
+See `tf.scatter_nd` for more details about how to make updates to
 slices.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_SegmentMax.pbtxt b/tensorflow/core/api_def/base_api/api_def_SegmentMax.pbtxt
index 5e2912f..35f55fe 100644
--- a/tensorflow/core/api_def/base_api/api_def_SegmentMax.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SegmentMax.pbtxt
@@ -16,8 +16,9 @@
   }
   summary: "Computes the maximum along segments of a tensor."
   description: <<END
-Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
-segments.
+Read
+[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+for an explanation of segments.
 
 Computes a tensor such that
 \\(output_i = \max_j(data_j)\\) where `max` is over `j` such
diff --git a/tensorflow/core/api_def/base_api/api_def_SegmentMean.pbtxt b/tensorflow/core/api_def/base_api/api_def_SegmentMean.pbtxt
index a7d85b3..70a07d9 100644
--- a/tensorflow/core/api_def/base_api/api_def_SegmentMean.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SegmentMean.pbtxt
@@ -16,8 +16,9 @@
   }
   summary: "Computes the mean along segments of a tensor."
   description: <<END
-Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
-segments.
+Read
+[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+for an explanation of segments.
 
 Computes a tensor such that
 \\(output_i = \frac{\sum_j data_j}{N}\\) where `mean` is
diff --git a/tensorflow/core/api_def/base_api/api_def_SegmentMin.pbtxt b/tensorflow/core/api_def/base_api/api_def_SegmentMin.pbtxt
index 74fc598..b2e3eec 100644
--- a/tensorflow/core/api_def/base_api/api_def_SegmentMin.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SegmentMin.pbtxt
@@ -16,8 +16,9 @@
   }
   summary: "Computes the minimum along segments of a tensor."
   description: <<END
-Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
-segments.
+Read
+[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+for an explanation of segments.
 
 Computes a tensor such that
 \\(output_i = \min_j(data_j)\\) where `min` is over `j` such
diff --git a/tensorflow/core/api_def/base_api/api_def_SegmentProd.pbtxt b/tensorflow/core/api_def/base_api/api_def_SegmentProd.pbtxt
index 4c4363e..7bac02e 100644
--- a/tensorflow/core/api_def/base_api/api_def_SegmentProd.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SegmentProd.pbtxt
@@ -16,8 +16,9 @@
   }
   summary: "Computes the product along segments of a tensor."
   description: <<END
-Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
-segments.
+Read
+[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+for an explanation of segments.
 
 Computes a tensor such that
 \\(output_i = \prod_j data_j\\) where the product is over `j` such
diff --git a/tensorflow/core/api_def/base_api/api_def_SegmentSum.pbtxt b/tensorflow/core/api_def/base_api/api_def_SegmentSum.pbtxt
index 583ab39..a73306a 100644
--- a/tensorflow/core/api_def/base_api/api_def_SegmentSum.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SegmentSum.pbtxt
@@ -16,8 +16,9 @@
   }
   summary: "Computes the sum along segments of a tensor."
   description: <<END
-Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
-segments.
+Read
+[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+for an explanation of segments.
 
 Computes a tensor such that
 \\(output_i = \sum_j data_j\\) where sum is over `j` such
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentMean.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentMean.pbtxt
index 866e04e..138a636 100644
--- a/tensorflow/core/api_def/base_api/api_def_SparseSegmentMean.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentMean.pbtxt
@@ -21,8 +21,9 @@
   }
   summary: "Computes the mean along sparse segments of a tensor."
   description: <<END
-Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
-segments.
+Read
+[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+for an explanation of segments.
 
 Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
 dimension, selecting a subset of dimension 0, specified by `indices`.
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentMeanWithNumSegments.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentMeanWithNumSegments.pbtxt
index af4bc75..b8073d8 100644
--- a/tensorflow/core/api_def/base_api/api_def_SparseSegmentMeanWithNumSegments.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentMeanWithNumSegments.pbtxt
@@ -30,7 +30,8 @@
 Like `SparseSegmentMean`, but allows missing ids in `segment_ids`. If an id is
 misisng, the `output` tensor at that position will be zeroed.
 
-Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
-segments.
+Read
+[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+for an explanation of segments.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtN.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtN.pbtxt
index 194bcea..945bbdc 100644
--- a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtN.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtN.pbtxt
@@ -23,7 +23,8 @@
   description: <<END
 N is the size of the segment being reduced.
 
-Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
-segments.
+Read
+[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+for an explanation of segments.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtNWithNumSegments.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtNWithNumSegments.pbtxt
index 8b50292..ff328c8 100644
--- a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtNWithNumSegments.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtNWithNumSegments.pbtxt
@@ -32,7 +32,8 @@
 Like `SparseSegmentSqrtN`, but allows missing ids in `segment_ids`. If an id is
 misisng, the `output` tensor at that position will be zeroed.
 
-Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
-segments.
+Read
+[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+for an explanation of segments.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSum.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSum.pbtxt
index dfd50bf..a68e146 100644
--- a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSum.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSum.pbtxt
@@ -21,8 +21,9 @@
   }
   summary: "Computes the sum along sparse segments of a tensor."
   description: <<END
-Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
-segments.
+Read
+[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+for an explanation of segments.
 
 Like `SegmentSum`, but `segment_ids` can have rank less than `data`'s first
 dimension, selecting a subset of dimension 0, specified by `indices`.
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSumWithNumSegments.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSumWithNumSegments.pbtxt
index 3bc1657..aa5c1fc 100644
--- a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSumWithNumSegments.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSumWithNumSegments.pbtxt
@@ -30,8 +30,9 @@
 Like `SparseSegmentSum`, but allows missing ids in `segment_ids`. If an id is
 misisng, the `output` tensor at that position will be zeroed.
 
-Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
-segments.
+Read
+[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+for an explanation of segments.
 
 For example:
 
diff --git a/tensorflow/core/api_def/base_api/api_def_StatelessIf.pbtxt b/tensorflow/core/api_def/base_api/api_def_StatelessIf.pbtxt
new file mode 100644
index 0000000..c0a6ba1
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StatelessIf.pbtxt
@@ -0,0 +1,42 @@
+op {
+  graph_op_name: "StatelessIf"
+  in_arg {
+    name: "cond"
+    description: <<END
+      A Tensor. If the tensor is a scalar of non-boolean type, the
+      scalar is converted to a boolean according to the
+      following rule: if the scalar is a numerical value, non-zero means
+      `True` and zero means False; if the scalar is a string, non-empty
+      means `True` and empty means `False`. If the tensor is not a scalar,
+      being empty means False and being non-empty means True.
+
+      This should only be used when the if then/else body functions do not
+      have stateful ops.
+END
+  }
+  in_arg {
+    name: "input"
+    description: "A list of input tensors."
+  }
+  out_arg {
+    name: "output"
+    description: "A list of return values."
+  }
+  attr { name: "Tin"  description: "A list of input types." }
+  attr { name: "Tout"  description: "A list of output types." }
+  attr {
+    name: "then_branch"
+    description: <<END
+      A function that takes 'inputs' and returns a list of tensors, whose
+      types are the same as what else_branch returns.
+END
+  }
+  attr {
+    name: "else_branch"
+    description: <<END
+    A function that takes 'inputs' and returns a list of tensors, whose
+    types are the same as what then_branch returns.
+END
+  }
+  summary: "output = cond ? then_branch(input) : else_branch(input)"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_StatelessWhile.pbtxt b/tensorflow/core/api_def/base_api/api_def_StatelessWhile.pbtxt
new file mode 100644
index 0000000..87c0e09
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StatelessWhile.pbtxt
@@ -0,0 +1,36 @@
+op {
+  graph_op_name: "StatelessWhile"
+  in_arg {
+    name: "input"
+    description: "A list of input tensors whose types are T."
+  }
+  out_arg {
+    name: "output"
+    description: "A list of output tensors whose types are T."
+  }
+  attr { name: "T"  description: "dtype in use." }
+  attr {
+    name: "cond"
+    description: <<END
+      A function takes 'input' and returns a tensor.  If the tensor is
+      a scalar of non-boolean, the scalar is converted to a boolean
+      according to the following rule: if the scalar is a numerical
+      value, non-zero means True and zero means False; if the scalar is
+      a string, non-empty means True and empty means False. If the
+      tensor is not a scalar, non-emptiness means True and False
+      otherwise.
+
+      This should only be used when the while condition and body functions
+      do not have stateful ops.
+END
+  }
+  attr {
+    name: "body"
+    description: <<END
+      A function that takes a list of tensors and returns another
+      list of tensors. Both lists have the same types as specified
+      by T.
+END
+  }
+  summary: "output = input; While (Cond(output)) { output = Body(output) }"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_StringLength.pbtxt b/tensorflow/core/api_def/base_api/api_def_StringLength.pbtxt
new file mode 100644
index 0000000..cc21ddc
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StringLength.pbtxt
@@ -0,0 +1,20 @@
+op {
+  graph_op_name: "StringLength"
+  in_arg {
+    name: "input"
+    description: <<END
+The string for which to compute the length.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Integer tensor that has the same shape as `input`. The output contains the
+element-wise string lengths of `input`.
+END
+  }
+  summary: "String lengths of `input`."
+  description: <<END
+Computes the length of each string given in the input tensor.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_UnsafeDiv.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnsafeDiv.pbtxt
new file mode 100644
index 0000000..82c913d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_UnsafeDiv.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "UnsafeDiv"
+  summary: "Returns 0 if the denominator is zero."
+  description: ""
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt
index 4ca6780..907c6d2 100644
--- a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt
@@ -16,8 +16,9 @@
   }
   summary: "Computes the maximum along segments of a tensor."
   description: <<END
-Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
-segments.
+Read
+[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+for an explanation of segments.
 
 This operator is similar to the unsorted segment sum operator found
 [(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
diff --git a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMin.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMin.pbtxt
index 55ea69b..37dd973 100644
--- a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMin.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMin.pbtxt
@@ -16,8 +16,9 @@
   }
   summary: "Computes the minimum along segments of a tensor."
   description: <<END
-Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-segments.
+Read
+[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#segmentation)
+for an explanation of segments.
 
 This operator is similar to the unsorted segment sum operator found
 [(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
diff --git a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentProd.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentProd.pbtxt
index 577ff53..efbc023 100644
--- a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentProd.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentProd.pbtxt
@@ -16,8 +16,9 @@
   }
   summary: "Computes the product along segments of a tensor."
   description: <<END
-Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-segments.
+Read
+[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#segmentation)
+for an explanation of segments.
 
 This operator is similar to the unsorted segment sum operator found
 [(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
diff --git a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentSum.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentSum.pbtxt
index 9aeabd0..a887495 100644
--- a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentSum.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentSum.pbtxt
@@ -16,8 +16,9 @@
   }
   summary: "Computes the sum along segments of a tensor."
   description: <<END
-Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
-segments.
+Read
+[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+for an explanation of segments.
 
 Computes a tensor such that
 \\(output[i] = sum_{j...} data[j...]\\) where the sum is over tuples `j...` such
diff --git a/tensorflow/core/api_def/python_api/api_def_ScatterSub.pbtxt b/tensorflow/core/api_def/python_api/api_def_ScatterSub.pbtxt
new file mode 100644
index 0000000..f1a4ccc
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ScatterSub.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ScatterSub"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_StatelessIf.pbtxt b/tensorflow/core/api_def/python_api/api_def_StatelessIf.pbtxt
new file mode 100644
index 0000000..0298c48
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_StatelessIf.pbtxt
@@ -0,0 +1 @@
+op { graph_op_name: "StatelessIf" visibility: HIDDEN }
diff --git a/tensorflow/core/api_def/python_api/api_def_StatelessWhile.pbtxt b/tensorflow/core/api_def/python_api/api_def_StatelessWhile.pbtxt
new file mode 100644
index 0000000..c138a71
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_StatelessWhile.pbtxt
@@ -0,0 +1 @@
+op { graph_op_name: "StatelessWhile" visibility: HIDDEN }
diff --git a/tensorflow/core/api_def/python_api/api_def_StringLength.pbtxt b/tensorflow/core/api_def/python_api/api_def_StringLength.pbtxt
new file mode 100644
index 0000000..01c02e1
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_StringLength.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "StringLength"
+  endpoint {
+    name: "strings.length"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_UnsafeDiv.pbtxt b/tensorflow/core/api_def/python_api/api_def_UnsafeDiv.pbtxt
new file mode 100644
index 0000000..56caabc
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_UnsafeDiv.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "UnsafeDiv"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/common_runtime/collective_rma_local.h b/tensorflow/core/common_runtime/collective_rma_local.h
index dbb2e67..4440843 100644
--- a/tensorflow/core/common_runtime/collective_rma_local.h
+++ b/tensorflow/core/common_runtime/collective_rma_local.h
@@ -34,7 +34,7 @@
 
   virtual ~CollectiveRemoteAccessLocal() {}
 
-  void StartAbort(const Status& s);
+  void StartAbort(const Status& s) override;
 
   void RecvFromPeer(const string& peer_device, const string& peer_task,
                     bool peer_is_local, const string& key, Device* to_device,
diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc
index 0695278..bf1d78e 100644
--- a/tensorflow/core/common_runtime/direct_session.cc
+++ b/tensorflow/core/common_runtime/direct_session.cc
@@ -602,7 +602,7 @@
 
   if (tracer) {
     TF_RETURN_IF_ERROR(tracer->Stop());
-    TF_RETURN_IF_ERROR(tracer->Collect(args.stats_collector));
+    TF_RETURN_IF_ERROR(tracer->Collect(run_state.collector.get()));
   }
 
   {
@@ -618,8 +618,8 @@
         &session_state_));
   }
 
-  if (args.stats_collector) {
-    args.stats_collector->Finalize();
+  if (run_state.collector) {
+    run_state.collector->Finalize();
   }
 
   // Build and return the cost model as instructed.
@@ -634,7 +634,7 @@
     }
 
     mutex_lock l(executor_lock_);
-    args.stats_collector->BuildCostModel(&cost_model_manager_, device_to_graph);
+    run_state.collector->BuildCostModel(&cost_model_manager_, device_to_graph);
 
     // annotate stats onto cost graph.
     CostGraphDef* cost_graph = run_metadata->mutable_cost_graph();
diff --git a/tensorflow/core/common_runtime/eager/attr_builder.cc b/tensorflow/core/common_runtime/eager/attr_builder.cc
index 92307d7..cf1cd41 100644
--- a/tensorflow/core/common_runtime/eager/attr_builder.cc
+++ b/tensorflow/core/common_runtime/eager/attr_builder.cc
@@ -103,7 +103,6 @@
     return *this;                                                            \
   }
 
-DEFINE_SET_ATTR(StringPiece, string_attrs_);
 DEFINE_SET_ATTR(float, float_attrs_);
 DEFINE_SET_ATTR(int, int_attrs_);
 DEFINE_SET_ATTR(bool, bool_attrs_);
@@ -119,9 +118,6 @@
 
 void AttrBuilder::FillAttrValueMap(AttrValueMap* m,
                                    bool include_those_in_node_def) const {
-  for (const auto& p : string_attrs_) {
-    SetInAttrValueMap(m, p.first, p.second);
-  }
   for (const auto& p : int_attrs_) {
     SetInAttrValueMap(m, p.first, p.second);
   }
@@ -211,10 +207,6 @@
     // not been called.
     if (node_def_finalized_) return f;
   }
-  for (const auto& p : string_attrs_) {
-    CombineUnordered(
-        CacheKeyHelper(p.first, tensorflow::Fingerprint128(p.second)), &f);
-  }
   for (const auto& p : int_attrs_) {
     CombineUnordered(CacheKeyHelper(p.first, static_cast<uint64>(p.second)),
                      &f);
diff --git a/tensorflow/core/common_runtime/eager/attr_builder.h b/tensorflow/core/common_runtime/eager/attr_builder.h
index 929b1b8..fc50bed 100644
--- a/tensorflow/core/common_runtime/eager/attr_builder.h
+++ b/tensorflow/core/common_runtime/eager/attr_builder.h
@@ -131,7 +131,6 @@
     }
   }
 
-  AttrVec<StringPiece> string_attrs_;
   AttrVec<int> int_attrs_;
   AttrVec<float> float_attrs_;
   AttrVec<bool> bool_attrs_;
@@ -143,8 +142,6 @@
 };  // namespace tensorflow
 
 template <>
-AttrBuilder& AttrBuilder::Set(StringPiece attr_name, StringPiece&& value);
-template <>
 AttrBuilder& AttrBuilder::Set(StringPiece attr_name, int&& value);
 template <>
 AttrBuilder& AttrBuilder::Set(StringPiece attr_name, float&& value);
diff --git a/tensorflow/core/common_runtime/eager/context.cc b/tensorflow/core/common_runtime/eager/context.cc
index 6ab2d1e..5bdd547 100644
--- a/tensorflow/core/common_runtime/eager/context.cc
+++ b/tensorflow/core/common_runtime/eager/context.cc
@@ -16,6 +16,7 @@
 #include "tensorflow/core/common_runtime/eager/context.h"
 
 #include "tensorflow/core/common_runtime/process_util.h"
+#include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/lib/core/blocking_counter.h"
 #include "tensorflow/core/util/env_var.h"
 
@@ -46,6 +47,7 @@
           local_device_manager_.get(), opts.env, TF_GRAPH_DEF_VERSION,
           &func_lib_def_, {}, thread_pool_.get())),
       log_device_placement_(opts.config.log_device_placement()),
+      num_active_steps_(0),
       async_default_(async),
       env_(opts.env),
       use_send_tensor_rpc_(false) {
@@ -161,6 +163,13 @@
     server_.release();
   }
 
+  {
+    mutex_lock l(keep_alive_thread_shutdown_mu_);
+    shutting_down_ = true;
+    keep_alive_thread_cv_.notify_all();
+  }
+  keep_alive_thread_.reset();
+
   CloseRemoteContexts();
 #endif
 
@@ -194,6 +203,35 @@
   return Status::OK();
 }
 
+void EagerContext::StartStep() {
+  mutex_lock ml(metadata_mu_);
+  num_active_steps_++;
+  if (step_container_ == nullptr) {
+    step_container_.reset(
+        new ScopedStepContainer(0, [this](const string& name) {
+          for (Device* device : devices_) {
+            device->resource_manager()->Cleanup(name).IgnoreError();
+          }
+        }));
+  }
+}
+
+void EagerContext::EndStep() {
+  mutex_lock ml(metadata_mu_);
+  num_active_steps_--;
+  if (num_active_steps_ == 0) {
+    step_container_.reset();
+  }
+}
+
+ScopedStepContainer* EagerContext::StepContainer() {
+  if (num_active_steps_.load() == 0) {
+    return nullptr;
+  }
+  mutex_lock ml(metadata_mu_);
+  return step_container_.get();
+}
+
 Status EagerContext::MaybeRegisterFunctionRemotely(const FunctionDef& fdef) {
   if (remote_device_manager_ == nullptr) return Status::OK();
 #ifndef __ANDROID__
@@ -303,7 +341,9 @@
     std::unique_ptr<eager::EagerClientCache> remote_eager_workers,
     std::unique_ptr<DeviceMgr> remote_device_manager,
     const gtl::FlatMap<string, uint64>& remote_contexts, Rendezvous* r,
-    DeviceMgr* local_device_mgr) {
+    DeviceMgr* local_device_mgr, int keep_alive_secs) {
+  mutex_lock l(remote_state_mu_);
+
   if (!remote_contexts_.empty()) {
     CloseRemoteContexts();
   }
@@ -345,6 +385,54 @@
   InitDeviceMapAndAsync();
 
   ClearCaches();
+
+  keep_alive_secs_ = keep_alive_secs;
+
+  sleep_for_secs_ = std::max(1, keep_alive_secs_ / 2);
+
+  // Only schedule a single closure.
+  if (keep_alive_thread_ == nullptr) {
+    keep_alive_thread_.reset(
+        env_->StartThread({}, "EagerKeepAliveThread", [this]() {
+          while (true) {
+            {
+              {
+                mutex_lock l(keep_alive_thread_shutdown_mu_);
+                keep_alive_thread_cv_.wait_for(
+                    l, std::chrono::seconds(sleep_for_secs_));
+
+                if (shutting_down_) {
+                  return;
+                }
+              }
+              {
+                mutex_lock l(remote_state_mu_);
+                if (keep_alive_secs_ > 0) {
+                  {
+                    for (const auto& worker_and_context_id : remote_contexts_) {
+                      auto* client = remote_eager_workers_->GetClient(
+                          worker_and_context_id.first);
+
+                      eager::KeepAliveRequest* request =
+                          new eager::KeepAliveRequest;
+                      eager::KeepAliveResponse* response =
+                          new eager::KeepAliveResponse;
+
+                      request->set_context_id(worker_and_context_id.second);
+                      client->KeepAliveAsync(
+                          request, response,
+                          [request, response](const Status& s) {
+                            delete request;
+                            delete response;
+                          });
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }));
+  }
 }
 #endif
 
diff --git a/tensorflow/core/common_runtime/eager/context.h b/tensorflow/core/common_runtime/eager/context.h
index a0b612e..9835b19 100644
--- a/tensorflow/core/common_runtime/eager/context.h
+++ b/tensorflow/core/common_runtime/eager/context.h
@@ -134,8 +134,6 @@
 
   Rendezvous* GetRendezvous() { return rendezvous_; }
 
-  mutex* FunctionsMu() { return &functions_mu_; }
-
   const tensorflow::DeviceMgr* local_device_mgr() const {
     return (local_device_manager_ != nullptr) ? local_device_manager_.get()
                                               : local_unowned_device_manager_;
@@ -153,6 +151,10 @@
   void SetShouldStoreMetadata(bool value);
   RunMetadata* RunMetadataProto() { return &run_metadata_; }
 
+  void StartStep();
+  void EndStep();
+  ScopedStepContainer* StepContainer();
+
   FunctionLibraryDefinition* FuncLibDef() { return &func_lib_def_; }
 
 #ifndef __ANDROID__
@@ -176,7 +178,7 @@
       std::unique_ptr<eager::EagerClientCache> remote_eager_workers,
       std::unique_ptr<DeviceMgr> remote_device_manager,
       const gtl::FlatMap<string, uint64>& remote_contexts, Rendezvous* r,
-      DeviceMgr* local_device_mgr);
+      DeviceMgr* local_device_mgr, int keep_alive_secs);
 
   bool HasActiveRemoteContext(uint64 context_id) {
     return active_remote_contexts_.find(context_id) !=
@@ -186,7 +188,7 @@
 
   // If true, then tensors should be shipped across processes via the
   // EagerService.SendTensor RPC. If false, _Send/_Recv ops should be used
-  // instead (which in-turn use WorkerService.RecvTensor RPCs.
+  // instead (which in-turn use WorkerService.RecvTensor RPCs).
   bool UseSendTensorRPC() { return use_send_tensor_rpc_; }
 
  private:
@@ -204,6 +206,7 @@
   // Only one of the below is set.
   std::unique_ptr<DeviceMgr> local_device_manager_;
   DeviceMgr* local_unowned_device_manager_;
+  std::unique_ptr<DeviceMgr> remote_device_manager_;
 
   // Devices owned by device_manager
   std::vector<Device*> devices_;
@@ -236,6 +239,10 @@
   // EagerExecutor for async execution.
   EagerExecutor executor_;
 
+  // Information related to step containers.
+  std::atomic<int> num_active_steps_;
+  std::unique_ptr<ScopedStepContainer> step_container_ GUARDED_BY(metadata_mu_);
+
   // True if the default value for execution mode is async. Note that this value
   // can be overridden per thread based on `thread_local_async` overrides.
   const bool async_default_;
@@ -247,7 +254,6 @@
 
 #ifndef __ANDROID__
   void CloseRemoteContexts();
-  std::unique_ptr<DeviceMgr> remote_device_manager_;
 
   // The server_ is not const since we release it when the context is destroyed.
   // Therefore the server_ object is not marked as const (even though it should
@@ -255,10 +261,20 @@
   std::unique_ptr<ServerInterface> server_;
   std::unique_ptr<eager::EagerClientCache> remote_eager_workers_;
 
+  mutex remote_state_mu_;
+
   gtl::FlatMap<string, uint64> remote_contexts_;
   gtl::FlatSet<uint64> active_remote_contexts_;
   gtl::FlatMap<Device*, std::pair<eager::EagerClient*, uint64>>
       device_to_client_cache_;
+
+  int keep_alive_secs_ GUARDED_BY(remote_state_mu_);
+  std::atomic<int> sleep_for_secs_;
+
+  std::unique_ptr<Thread> keep_alive_thread_;
+  mutex keep_alive_thread_shutdown_mu_;
+  condition_variable keep_alive_thread_cv_;
+  bool shutting_down_ GUARDED_BY(keep_alive_thread_shutdown_mu_) = false;
 #endif
 
   bool use_send_tensor_rpc_;
diff --git a/tensorflow/core/common_runtime/eager/execute.cc b/tensorflow/core/common_runtime/eager/execute.cc
index 3837405..46065f3 100644
--- a/tensorflow/core/common_runtime/eager/execute.cc
+++ b/tensorflow/core/common_runtime/eager/execute.cc
@@ -148,6 +148,9 @@
       node_stats->set_op_end_rel_micros((now_nanos - pre_time_nanos) /
                                         EnvTime::kMicrosToNanos);
       node_stats->set_op_end_rel_nanos(now_nanos - pre_time_nanos);
+      node_stats->set_all_end_rel_micros((now_nanos - pre_time_nanos) /
+                                         EnvTime::kMicrosToNanos);
+      node_stats->set_all_end_rel_nanos(now_nanos - pre_time_nanos);
     }
     if (!status.ok()) {
       if (result_handle != nullptr) result_handle->Unref();
@@ -297,12 +300,6 @@
                 << device->name();
     }
     kernel = new KernelAndDevice(ctx->GetRendezvous());
-    // Knowledge of the implementation of Init (and in-turn
-    // FunctionLibraryRuntime::CreateKernel) tells us that ctx->func_lib_def
-    // will be accessed, so grab on to the lock.
-    // See WARNING comment in Execute (before kernel->Run) - would be nice to
-    // rework to avoid this subtlety.
-    tf_shared_lock l(*ctx->FunctionsMu());
     auto* flr = ctx->func_lib(device);
 
     if (flr == nullptr) {
@@ -643,22 +640,23 @@
     TF_RETURN_IF_ERROR(op_inputs[i]->Tensor(&input_tensor));
     inputs[i] = *input_tensor;
   }
-  // WARNING: kernel->Run utilizes the FunctionLibraryRuntime
-  // (ctx->func_lib(device)), which in turn holds a pointer to func_lib_def.
-  // But knowledge of the implementation
-  // of FunctionLibraryRuntime tells us that func_lib_def is not accessed by
-  // FunctionLibraryRuntime::Run(), so there is no thread-safety concern here.
-  // This is quite subtle. Re-work things to make this better?  (Would it make
-  // sense for FunctionLibraryRuntime to ensure thread-safe access to
-  // FunctionLibraryDefinition?).  TODO(apassos) figure out how to record stats
-  // for ops which are a part of functions.
+  //  TODO(apassos) figure out how to record stats for ops which are a part of
+  //  functions.
   // TODO(agarwal): change Run to take vector of handles ?
-  TF_RETURN_IF_ERROR(kernel->Run(&inputs, &outputs, maybe_stats));
+  ScopedStepContainer* container = ctx->StepContainer();
+  if (container == nullptr) {
+    TF_RETURN_IF_ERROR(kernel->Run(&inputs, &outputs, maybe_stats));
+  } else {
+    TF_RETURN_IF_ERROR(kernel->Run(container, &inputs, &outputs, maybe_stats));
+  }
   if (maybe_stats != nullptr) {
     int64 nanos = Env::Default()->NowNanos();
     maybe_stats->set_op_end_rel_micros(nanos / EnvTime::kMicrosToNanos -
                                        maybe_stats->all_start_micros());
     maybe_stats->set_op_end_rel_nanos(nanos - maybe_stats->all_start_nanos());
+    maybe_stats->set_all_end_rel_micros(nanos / EnvTime::kMicrosToNanos -
+                                        maybe_stats->all_start_micros());
+    maybe_stats->set_all_end_rel_nanos(nanos - maybe_stats->all_start_nanos());
     mutex_lock ml(*ctx->MetadataMu());
     if (ctx->ShouldStoreMetadata()) {
       auto* step_stats = ctx->RunMetadataProto()->mutable_step_stats();
diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device.cc b/tensorflow/core/common_runtime/eager/kernel_and_device.cc
index dae5d19..3d61ff4 100644
--- a/tensorflow/core/common_runtime/eager/kernel_and_device.cc
+++ b/tensorflow/core/common_runtime/eager/kernel_and_device.cc
@@ -60,12 +60,22 @@
   return s;
 }
 
-Status KernelAndDevice::Run(std::vector<Tensor>* input_tensors,
-                            std::vector<Tensor>* output_tensors,
+Status KernelAndDevice::Run(std::vector<Tensor>* inputs,
+                            std::vector<Tensor>* outputs,
                             NodeExecStats* stats) {
-  gtl::InlinedVector<TensorValue, 4> inputs;
-  for (Tensor& t : *input_tensors) {
-    inputs.push_back(TensorValue(&t));
+  ScopedStepContainer step_container(0, [this](const string& name) {
+    device_->resource_manager()->Cleanup(name).IgnoreError();
+  });
+  return this->Run(&step_container, inputs, outputs, stats);
+}
+
+Status KernelAndDevice::Run(ScopedStepContainer* step_container,
+                            std::vector<Tensor>* inputs,
+                            std::vector<Tensor>* outputs,
+                            NodeExecStats* stats) {
+  gtl::InlinedVector<TensorValue, 4> input_vector;
+  for (Tensor& t : *inputs) {
+    input_vector.push_back(TensorValue(&t));
   }
 
   std::vector<AllocatorAttributes> out_attrs(kernel_->num_outputs());
@@ -77,7 +87,7 @@
   OpKernelContext::Params params;
   params.device = device_;
   params.frame_iter = FrameAndIter(0, 0);
-  params.inputs = &inputs;
+  params.inputs = &input_vector;
   params.op_kernel = kernel_.get();
   params.resource_manager = device_->resource_manager();
   params.output_attr_array = gtl::vector_as_array(&out_attrs);
@@ -94,10 +104,7 @@
     params.runner = runner_;
   }
 
-  ScopedStepContainer step_container(0, [this](const string& name) {
-    device_->resource_manager()->Cleanup(name).IgnoreError();
-  });
-  params.step_container = &step_container;
+  params.step_container = step_container;
 
   OpKernelContext context(&params);
 
@@ -114,9 +121,9 @@
   }
   if (!context.status().ok()) return context.status();
 
-  output_tensors->clear();
+  outputs->clear();
   for (int i = 0; i < context.num_outputs(); ++i) {
-    output_tensors->push_back(Tensor(*context.mutable_output(i)));
+    outputs->push_back(Tensor(*context.mutable_output(i)));
   }
   if (stats != nullptr) {
     for (const auto& allocator_pair : context.wrapped_allocators()) {
diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device.h b/tensorflow/core/common_runtime/eager/kernel_and_device.h
index c0b676b..0ef419c 100644
--- a/tensorflow/core/common_runtime/eager/kernel_and_device.h
+++ b/tensorflow/core/common_runtime/eager/kernel_and_device.h
@@ -49,13 +49,6 @@
   //
   // The provided FunctionLibraryRuntime MUST outlive all calls to
   // Run() on the returned KernelAndDevice.
-  //
-  // TODO(ashankar): Figure out thread-safety concerns around
-  // FunctionLibraryRuntime (in particular, how the underlying
-  // FunctionLibraryDefinition might be mutated by another thread as new
-  // functions are registered with it).  Conservatively, thread-safe usage of
-  // the FunctionLibraryRuntime is pushed on to the caller (see locking in
-  // c_api.cc).
   static Status Init(const NodeDef& ndef, FunctionLibraryRuntime* flib,
                      std::function<void(std::function<void()>)>* runner,
                      KernelAndDevice* out);
@@ -70,6 +63,9 @@
   Status Run(std::vector<Tensor>* inputs, std::vector<Tensor>* outputs,
              NodeExecStats* stats);
 
+  Status Run(ScopedStepContainer* step_container, std::vector<Tensor>* inputs,
+             std::vector<Tensor>* outputs, NodeExecStats* stats);
+
   const OpKernel* kernel() const { return kernel_.get(); }
 
   Device* device() const { return device_; }
diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc
index c2fac4c..63ed860 100644
--- a/tensorflow/core/common_runtime/executor.cc
+++ b/tensorflow/core/common_runtime/executor.cc
@@ -72,141 +72,58 @@
   return node->op_def().allows_uninitialized_input();
 }
 
-// Sets the timeline_label field of *node_stats, using data from *node.
-// Returns true iff the node is a transfer node.
-// TODO(tucker): merge with the DetailText function in session.cc
-// in a common location.
-bool SetTimelineLabel(const Node* node, NodeExecStatsWrapper* stats) {
-  bool is_transfer_node = false;
-  if (!stats) {
-    return is_transfer_node;
-  }
-  string memory;
-  for (auto& all : stats->stats()->memory()) {
-    int64 tot = all.total_bytes();
-    if (tot >= 0.1 * 1048576.0) {
-      int64 peak = all.peak_bytes();
-      if (peak > 0) {
-        memory =
-            strings::StrCat(memory, "[", all.allocator_name(),
-                            strings::Printf(" %.1fMB %.1fMB] ", tot / 1048576.0,
-                                            peak / 1048576.0));
-      } else {
-        memory = strings::StrCat(memory, "[", all.allocator_name(),
-                                 strings::Printf(" %.1fMB] ", tot / 1048576.0));
-      }
-    }
-  }
-  const AttrSlice attrs = node->attrs();
-  string text;
-  if (IsSend(node)) {
-    string tensor_name;
-    TF_CHECK_OK(GetNodeAttr(attrs, "tensor_name", &tensor_name));
-    string recv_device;
-    TF_CHECK_OK(GetNodeAttr(attrs, "recv_device", &recv_device));
-    text = strings::StrCat(memory, node->name(), " = ", node->type_string(),
-                           "(", tensor_name, " @", recv_device);
-    is_transfer_node = true;
-  } else if (IsRecv(node)) {
-    string tensor_name;
-    TF_CHECK_OK(GetNodeAttr(attrs, "tensor_name", &tensor_name));
-    string send_device;
-    TF_CHECK_OK(GetNodeAttr(attrs, "send_device", &send_device));
-    text = strings::StrCat(memory, node->name(), " = ", node->type_string(),
-                           "(", tensor_name, " @", send_device);
-    is_transfer_node = true;
-  } else {
-    text =
-        strings::StrCat(memory, node->name(), " = ", node->type_string(), "(",
-                        str_util::Join(node->requested_inputs(), ", "), ")");
-  }
-  stats->stats()->set_timeline_label(text);
-  return is_transfer_node;
-}
-
 // Helper routines for collecting step stats.
 namespace nodestats {
-inline int64 NowInUsec() { return Env::Default()->NowMicros(); }
 inline int64 NowInNsec() { return Env::Default()->NowNanos(); }
 
-void SetScheduled(NodeExecStatsWrapper* stats, int64 nanos) {
+void SetScheduled(NodeExecStatsWrapper* stats, int64 micros) {
   if (!stats) return;
-  stats->stats()->set_scheduled_micros(nanos / EnvTime::kMicrosToNanos);
-  stats->stats()->set_scheduled_nanos(nanos);
+  stats->SetScheduled(micros * EnvTime::kMicrosToNanos);
 }
 
 void SetAllStart(NodeExecStatsWrapper* stats) {
   if (!stats) return;
-  int64 now_nanos = NowInNsec();
-  stats->stats()->set_all_start_micros(now_nanos / EnvTime::kMicrosToNanos);
-  stats->stats()->set_all_start_nanos(now_nanos);
+  stats->RecordExecutorStarted();
 }
 
 void SetOpStart(NodeExecStatsWrapper* stats) {
   if (!stats) return;
-  NodeExecStats* nt = stats->stats();
-  DCHECK_NE(nt->all_start_micros(), 0);
-  DCHECK_NE(nt->all_start_nanos(), 0);
-  int64 now_nanos = NowInNsec();
-  nt->set_op_start_rel_micros(now_nanos / EnvTime::kMicrosToNanos -
-                              nt->all_start_micros());
-  nt->set_op_start_rel_nanos(now_nanos - nt->all_start_nanos());
+  stats->RecordComputeStarted();
 }
 
 void SetOpEnd(NodeExecStatsWrapper* stats) {
   if (!stats) return;
-  NodeExecStats* nt = stats->stats();
-  DCHECK_NE(nt->all_start_micros(), 0);
-  DCHECK_NE(nt->all_start_nanos(), 0);
-  int64 now_nanos = NowInNsec();
-  nt->set_op_end_rel_micros(now_nanos / EnvTime::kMicrosToNanos -
-                            nt->all_start_micros());
-  nt->set_op_end_rel_nanos(now_nanos - nt->all_start_nanos());
+  stats->RecordComputeEnded();
 }
 
 void SetAllEnd(NodeExecStatsWrapper* stats) {
   if (!stats) return;
-  NodeExecStats* nt = stats->stats();
-  DCHECK_NE(nt->all_start_micros(), 0);
-  DCHECK_NE(nt->all_start_nanos(), 0);
-  int64 now_nanos = NowInNsec();
-  nt->set_all_end_rel_micros(now_nanos / EnvTime::kMicrosToNanos -
-                             nt->all_start_micros());
-  nt->set_all_end_rel_nanos(now_nanos - nt->all_start_nanos());
+  stats->RecordExecutorEnded();
 }
 
 void SetOutput(NodeExecStatsWrapper* stats, int slot, const Tensor* v) {
   if (!stats) return;
-  DCHECK(v);
-  NodeOutput* no = stats->stats()->add_output();
-  no->set_slot(slot);
-  v->FillDescription(no->mutable_tensor_description());
+  stats->SetOutput(slot, v);
 }
 
 void SetMemory(NodeExecStatsWrapper* stats, OpKernelContext* ctx) {
   if (!stats) return;
-
-  for (const auto& allocator_pair : ctx->wrapped_allocators()) {
-    stats->AddAllocation(allocator_pair.first, allocator_pair.second);
-  }
-  auto* ms = stats->stats()->mutable_memory_stats();
-  ms->set_temp_memory_size(ctx->temp_memory_allocated());
-  for (const auto& alloc_id : ctx->persistent_alloc_ids()) {
-    ms->mutable_persistent_tensor_alloc_ids()->Add(alloc_id);
-  }
-  ms->set_persistent_memory_size(ctx->persistent_memory_allocated());
+  stats->SetMemory(ctx);
 }
 
 void SetReferencedTensors(NodeExecStatsWrapper* stats,
                           const TensorReferenceVector& tensors) {
   if (!stats) return;
-  // be careful not to increment the reference count on any tensor
-  // while recording the information
-  for (size_t i = 0; i < tensors.size(); ++i) {
-    AllocationDescription* description =
-        stats->stats()->add_referenced_tensor();
-    tensors.at(i).FillDescription(description);
+  stats->SetReferencedTensors(tensors);
+}
+
+// Sets the timeline_label field of *stats, using data from *node.
+// Returns true iff the node is a transfer node.
+bool SetTimelineLabel(const Node* node, NodeExecStatsWrapper* stats) {
+  if (!stats) {
+    return false;
   }
+  return stats->SetTimelineLabel(node);
 }
 
 }  // namespace nodestats
@@ -1319,7 +1236,7 @@
   TensorStore* tensor_store_;
   // Step-local container.
   ScopedStepContainer* step_container_;
-  StepStatsCollector* stats_collector_;
+  StepStatsCollectorInterface* const stats_collector_;
   // QUESTION: Make it a checkpoint::TensorSliceReaderCacheWrapper
   // instead of a pointer?  (avoids having to delete).
   checkpoint::TensorSliceReaderCacheWrapper* slice_reader_cache_;
@@ -1694,8 +1611,7 @@
     if (stats_collector_ && !tagged_node.is_dead) {
       // track allocations if and only if we are collecting statistics
       params.track_allocations = true;
-      stats = new NodeExecStatsWrapper;
-      stats->stats()->set_node_name(node->name());
+      stats = new NodeExecStatsWrapper(node->name());
       nodestats::SetScheduled(stats, scheduled_nsec);
       nodestats::SetAllStart(stats);
     }
@@ -2165,7 +2081,8 @@
                              NodeExecStatsWrapper* stats,
                              TaggedNodeReadyQueue* inline_ready) {
   nodestats::SetAllEnd(stats);
-  if (stats_collector_ != nullptr && !SetTimelineLabel(node, stats)) {
+  if (stats_collector_ != nullptr &&
+      !nodestats::SetTimelineLabel(node, stats)) {
     // Only record non-transfer nodes.
     // Transfers 'stats' ownership to 'stats_collector_'.
     stats_collector_->Save(impl_->params_.device->name(), stats);
diff --git a/tensorflow/core/common_runtime/executor.h b/tensorflow/core/common_runtime/executor.h
index cd01b43..a238a67 100644
--- a/tensorflow/core/common_runtime/executor.h
+++ b/tensorflow/core/common_runtime/executor.h
@@ -83,7 +83,7 @@
   struct Args {
     int64 step_id = 0;
     Rendezvous* rendezvous = nullptr;
-    StepStatsCollector* stats_collector = nullptr;
+    StepStatsCollectorInterface* stats_collector = nullptr;
     CallFrameInterface* call_frame = nullptr;
     CancellationManager* cancellation_manager = nullptr;
     SessionState* session_state = nullptr;
diff --git a/tensorflow/core/common_runtime/graph_execution_state.cc b/tensorflow/core/common_runtime/graph_execution_state.cc
index 9c9eacb..c23b7d3 100644
--- a/tensorflow/core/common_runtime/graph_execution_state.cc
+++ b/tensorflow/core/common_runtime/graph_execution_state.cc
@@ -643,10 +643,9 @@
     for (const FunctionDef& fdef : new_graph.library().function()) {
       const string& func_name = fdef.signature().name();
 
-      if ((*optimized_flib)->Find(func_name)) {
+      if ((*optimized_flib)->Contains(func_name)) {
         VLOG(3) << "Replace function: name=" << func_name;
-        TF_RETURN_IF_ERROR((*optimized_flib)->RemoveFunction(func_name));
-        TF_RETURN_IF_ERROR((*optimized_flib)->AddFunctionDef(fdef));
+        TF_RETURN_IF_ERROR((*optimized_flib)->ReplaceFunction(func_name, fdef));
       } else {
         VLOG(3) << "Add new function: name=" << func_name;
         TF_RETURN_IF_ERROR((*optimized_flib)->AddFunctionDef(fdef));
diff --git a/tensorflow/core/common_runtime/mkl_cpu_allocator.h b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
index 94e10db..99bd43e 100644
--- a/tensorflow/core/common_runtime/mkl_cpu_allocator.h
+++ b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
@@ -28,7 +28,7 @@
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/mem.h"
 
-#ifndef DO_NOT_USE_ML
+#ifndef INTEL_MKL_DNN_ONLY
 #include "i_malloc.h"
 #endif
 
@@ -98,7 +98,7 @@
     VLOG(1) << "MklCPUAllocator: Setting max_mem_bytes: " << max_mem_bytes;
     allocator_ = new BFCAllocator(new MklSubAllocator, max_mem_bytes,
                                   kAllowGrowth, kName);
-#ifndef DO_NOT_USE_ML
+#ifndef INTEL_MKL_DNN_ONLY
     // For redirecting all allocations from MKL to this allocator
     // From: http://software.intel.com/en-us/node/528565
     i_malloc = MallocHook;
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.cc b/tensorflow/core/common_runtime/process_function_library_runtime.cc
index 729312a..6dac4c3 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime.cc
+++ b/tensorflow/core/common_runtime/process_function_library_runtime.cc
@@ -145,12 +145,11 @@
   }
   Device* device = flr->device();
   string device_type = device->parsed_name().type;
-  if (device_type == "CPU" || device_type == "TPU_SYSTEM" ||
-      device_type == "TPU") {
+  if (device_type == "CPU" || device_type == "TPU_SYSTEM") {
     // "TPU_SYSTEM" indicates that `device` is a CPU.
     return Status::OK();
   }
-  if (device_type == "GPU") {
+  if (device_type == "GPU" || device_type == "TPU") {
     auto* dev_info = flr->device()->tensorflow_gpu_device_info();
     if (dev_info) {
       *device_context = dev_info->default_context;
diff --git a/tensorflow/core/common_runtime/step_stats_collector.cc b/tensorflow/core/common_runtime/step_stats_collector.cc
index af6880c..9c2510e 100644
--- a/tensorflow/core/common_runtime/step_stats_collector.cc
+++ b/tensorflow/core/common_runtime/step_stats_collector.cc
@@ -16,12 +16,16 @@
 #include "tensorflow/core/common_runtime/step_stats_collector.h"
 #include "tensorflow/core/common_runtime/costmodel_manager.h"
 #include "tensorflow/core/framework/allocation_description.pb.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_description.pb.h"
 #include "tensorflow/core/framework/tracking_allocator.h"
 #include "tensorflow/core/graph/costmodel.h"
+#include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/scanner.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace tensorflow {
@@ -36,11 +40,89 @@
 };
 }  // namespace
 
-NodeExecStatsWrapper::NodeExecStatsWrapper()
-    : NodeExecStatsWrapper(new NodeExecStats) {}
+NodeExecStatsWrapper::NodeExecStatsWrapper(const string& node_name)
+    : NodeExecStatsWrapper(new NodeExecStats) {
+  stats_->set_node_name(node_name);
+}
 NodeExecStatsWrapper::NodeExecStatsWrapper(NodeExecStats* stats)
     : stats_(stats) {}
 
+void NodeExecStatsWrapper::SetOutput(int slot, const Tensor* v) {
+  DCHECK(v);
+  NodeOutput* no = stats_->add_output();
+  no->set_slot(slot);
+  v->FillDescription(no->mutable_tensor_description());
+}
+
+void NodeExecStatsWrapper::SetMemory(OpKernelContext* ctx) {
+  for (const auto& allocator_pair : ctx->wrapped_allocators()) {
+    AddAllocation(allocator_pair.first, allocator_pair.second);
+  }
+  auto* ms = stats_->mutable_memory_stats();
+  ms->set_temp_memory_size(ctx->temp_memory_allocated());
+  for (const auto& alloc_id : ctx->persistent_alloc_ids()) {
+    ms->mutable_persistent_tensor_alloc_ids()->Add(alloc_id);
+  }
+  ms->set_persistent_memory_size(ctx->persistent_memory_allocated());
+}
+
+void NodeExecStatsWrapper::SetReferencedTensors(
+    const TensorReferenceVector& tensors) {
+  // be careful not to increment the reference count on any tensor
+  // while recording the information
+  for (size_t i = 0; i < tensors.size(); ++i) {
+    AllocationDescription* description = stats_->add_referenced_tensor();
+    tensors.at(i).FillDescription(description);
+  }
+}
+
+// TODO(tucker): merge with the DetailText function in session.cc
+// in a common location.
+bool NodeExecStatsWrapper::SetTimelineLabel(const Node* node) {
+  bool is_transfer_node = false;
+  string memory;
+  for (auto& all : stats_->memory()) {
+    int64 tot = all.total_bytes();
+    if (tot >= 0.1 * 1048576.0) {
+      int64 peak = all.peak_bytes();
+      if (peak > 0) {
+        memory =
+            strings::StrCat(memory, "[", all.allocator_name(),
+                            strings::Printf(" %.1fMB %.1fMB] ", tot / 1048576.0,
+                                            peak / 1048576.0));
+      } else {
+        memory = strings::StrCat(memory, "[", all.allocator_name(),
+                                 strings::Printf(" %.1fMB] ", tot / 1048576.0));
+      }
+    }
+  }
+  const AttrSlice attrs = node->attrs();
+  string text;
+  if (IsSend(node)) {
+    string tensor_name;
+    TF_CHECK_OK(GetNodeAttr(attrs, "tensor_name", &tensor_name));
+    string recv_device;
+    TF_CHECK_OK(GetNodeAttr(attrs, "recv_device", &recv_device));
+    text = strings::StrCat(memory, node->name(), " = ", node->type_string(),
+                           "(", tensor_name, " @", recv_device);
+    is_transfer_node = true;
+  } else if (IsRecv(node)) {
+    string tensor_name;
+    TF_CHECK_OK(GetNodeAttr(attrs, "tensor_name", &tensor_name));
+    string send_device;
+    TF_CHECK_OK(GetNodeAttr(attrs, "send_device", &send_device));
+    text = strings::StrCat(memory, node->name(), " = ", node->type_string(),
+                           "(", tensor_name, " @", send_device);
+    is_transfer_node = true;
+  } else {
+    text =
+        strings::StrCat(memory, node->name(), " = ", node->type_string(), "(",
+                        str_util::Join(node->requested_inputs(), ", "), ")");
+  }
+  stats_->set_timeline_label(text);
+  return is_transfer_node;
+}
+
 void NodeExecStatsWrapper::AddAllocation(
     Allocator* allocator, TrackingAllocator* tracking_allocator) {
   AllocatorMemoryUsed* memory = stats_->add_memory();
diff --git a/tensorflow/core/common_runtime/step_stats_collector.h b/tensorflow/core/common_runtime/step_stats_collector.h
index 996dbb5..7206fbf 100644
--- a/tensorflow/core/common_runtime/step_stats_collector.h
+++ b/tensorflow/core/common_runtime/step_stats_collector.h
@@ -12,14 +12,16 @@
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_STEP_STATS_COLLECTOR_H_
-#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_STEP_STATS_COLLECTOR_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_STEP_STATS_COLLECTOR_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_STEP_STATS_COLLECTOR_H_
 
 #include <memory>
 #include <unordered_map>
 #include <vector>
 #include "tensorflow/core/framework/step_stats.pb.h"
+#include "tensorflow/core/framework/tensor_reference.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/thread_annotations.h"
 #include "tensorflow/core/platform/types.h"
@@ -30,42 +32,127 @@
 class AllocatorMemoryUsed;
 class CostModelManager;
 class Graph;
+class Node;
 class NodeExecStats;
+class OpKernelContext;
 class StepStats;
+class Tensor;
 class TrackingAllocator;
 
 // Wraps NodeExecStats and adds allocation to it.
 class NodeExecStatsWrapper {
  public:
-  NodeExecStatsWrapper();
+  NodeExecStatsWrapper(const string& node_name);
   // Owns 'stats'.
   NodeExecStatsWrapper(NodeExecStats* stats);
 
   // Destructor calls Finalize() to release the TrackingAllocators.
   ~NodeExecStatsWrapper() { Finalize(); }
 
-  NodeExecStats* stats() { return stats_.get(); }
+  // Records the absolute time in nanoseconds at which this node became
+  // runnable (i.e. was scheduled for execution).
+  void SetScheduled(int64 nanos) {
+    stats_->set_scheduled_micros(nanos / EnvTime::kMicrosToNanos);
+    stats_->set_scheduled_nanos(nanos);
+  }
 
-  // "Does not take ownership of the 'allocator'.
-  // Transfers ownership of the 'tracking_allocator' to *this."
-  void AddAllocation(Allocator* allocator,
-                     TrackingAllocator* tracking_allocator);
+  // Called immediately after this node starts being processed by the executor.
+  void RecordExecutorStarted() {
+    int64 now_nanos = Env::Default()->NowNanos();
+    stats_->set_all_start_micros(now_nanos / EnvTime::kMicrosToNanos);
+    stats_->set_all_start_nanos(now_nanos);
+  }
+
+  // Called immediately before this node's `Compute()` or `ComputeAsync()`
+  // method is called.
+  void RecordComputeStarted() {
+    int64 now_nanos = Env::Default()->NowNanos();
+    DCHECK_NE(stats_->all_start_micros(), 0);
+    DCHECK_NE(stats_->all_start_nanos(), 0);
+    stats_->set_op_start_rel_micros(now_nanos / EnvTime::kMicrosToNanos -
+                                    stats_->all_start_micros());
+    stats_->set_op_start_rel_nanos(now_nanos - stats_->all_start_nanos());
+  }
+
+  // Called immediately after this node's `Compute()` method returned (or, for
+  // asynchronous operations, the callback passed to its `ComputeAsync()` method
+  // was called).
+  void RecordComputeEnded() {
+    int64 now_nanos = Env::Default()->NowNanos();
+    DCHECK_NE(stats_->all_start_micros(), 0);
+    DCHECK_NE(stats_->all_start_nanos(), 0);
+    stats_->set_op_end_rel_micros(now_nanos / EnvTime::kMicrosToNanos -
+                                  stats_->all_start_micros());
+    stats_->set_op_end_rel_nanos(now_nanos - stats_->all_start_nanos());
+  }
+
+  // Called immediately after this executor finishes processing this node.
+  void RecordExecutorEnded() {
+    int64 now_nanos = Env::Default()->NowNanos();
+    DCHECK_NE(stats_->all_start_micros(), 0);
+    DCHECK_NE(stats_->all_start_nanos(), 0);
+    stats_->set_all_end_rel_micros(now_nanos / EnvTime::kMicrosToNanos -
+                                   stats_->all_start_micros());
+    stats_->set_all_end_rel_nanos(now_nanos - stats_->all_start_nanos());
+  }
+
+  // Records information about the tensor produced by this node at the given
+  // output slot.
+  void SetOutput(int slot, const Tensor* v);
+
+  // Records information about the memory allocated during the execution of this
+  // node.
+  void SetMemory(OpKernelContext* ctx);
+
+  // Records information about the tensors that were accessed during the
+  // execution of this node.
+  void SetReferencedTensors(const TensorReferenceVector& tensors);
+
+  // Sets the timeline_label field of the wrapped NodeExecStats, using data
+  // from *node. Returns true iff the node is a transfer node.
+  bool SetTimelineLabel(const Node* node);
 
  private:
   friend class StepStatsCollector;
 
+  NodeExecStats* stats() { return stats_.get(); }
+
   // Populates stats_ and releases TrackingAllocator.
   void Finalize();
 
+  // Does not take ownership of the `allocator`.
+  // Takes ownership of `tracking_allocator`.
+  void AddAllocation(Allocator* allocator,
+                     TrackingAllocator* tracking_allocator);
+
   gtl::InlinedVector<std::pair<AllocatorMemoryUsed*, TrackingAllocator*>, 2>
       allocations_;
   std::unique_ptr<NodeExecStats> stats_;
 };
 
+// Statistics collection interface for individual node execution.
+//
+// See `StepStatsCollector` for a concrete implementation of this interface
+// that interfaces with the `Session` layer.
+class StepStatsCollectorInterface {
+ public:
+  virtual ~StepStatsCollectorInterface() {}
+
+  // Saves `stats` to the collector.
+  virtual void Save(const string& device, NodeExecStatsWrapper* stats) = 0;
+
+  // Generates a string reporting the currently used memory based
+  // on ResourceExhausted OOM `err` message.
+  // `err` message needs to contain device name and allocator name, e.g.:
+  // "ResourceExhaustedError: OOM when allocating tensor ...
+  // on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc"
+  virtual string ReportAllocsOnResourceExhausted(const string& err) = 0;
+};
+
 // StepStatsCollector manages the collection of a StepStats object.
 // The StepStats object holds multiple DeviceStats.
 // Each DeviceStats object holds multiple NodeExecStats.
-class StepStatsCollector {
+class StepStatsCollector : public StepStatsCollectorInterface {
  public:
   // Does not take ownership of `ss`.
   explicit StepStatsCollector(StepStats* ss);
@@ -80,14 +167,9 @@
   // Save saves nt to the DeviceStats object associated with device.
   // Should be called before Finalize.
   void Save(const string& device, NodeExecStats* nt);
-  void Save(const string& device, NodeExecStatsWrapper* stats);
+  void Save(const string& device, NodeExecStatsWrapper* stats) override;
 
-  // Generates a string reporting the currently used memory based
-  // on ResourceExhausted OOM `err` message.
-  // `err` message needs to contain device name and allocator name, E.g.:
-  // "ResourceExhaustedError: OOM when allocating tensor ...
-  // on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc"
-  string ReportAllocsOnResourceExhausted(const string& err);
+  string ReportAllocsOnResourceExhausted(const string& err) override;
 
   // The following 2 Finalize methods populate the StepStats passed
   // from the constructor. Calling it more than once won't have any effect.
@@ -112,4 +194,4 @@
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_STEP_STATS_COLLECTOR_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_STEP_STATS_COLLECTOR_H_
diff --git a/tensorflow/core/common_runtime/sycl/sycl_allocator.h b/tensorflow/core/common_runtime/sycl/sycl_allocator.h
index 550f193..cc5909d 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_allocator.h
+++ b/tensorflow/core/common_runtime/sycl/sycl_allocator.h
@@ -17,8 +17,8 @@
 #error This file must only be included when building TensorFlow with SYCL support
 #endif
 
-#ifndef TENSORFLOW_COMMON_RUNTIME_SYCL_SYCL_ALLOCATOR_H_
-#define TENSORFLOW_COMMON_RUNTIME_SYCL_SYCL_ALLOCATOR_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_SYCL_SYCL_ALLOCATOR_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_SYCL_SYCL_ALLOCATOR_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/allocator.h"
@@ -72,4 +72,4 @@
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_COMMON_RUNTIME_SYCL_SYCL_ALLOCATOR_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_SYCL_SYCL_ALLOCATOR_H_
diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc b/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc
index 916c872..b8af637 100644
--- a/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc
+++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc
@@ -126,7 +126,9 @@
     do {
       context_id = random::New64();
     } while (contexts_.find(context_id) != contexts_.end());
-    contexts_.emplace(context_id, new ServerContext(std::move(ctx)));
+    contexts_.emplace(
+        context_id,
+        new ServerContext(std::move(ctx), request->keep_alive_secs(), env_));
   }
   response->set_context_id(context_id);
 
@@ -231,9 +233,11 @@
 
 Status EagerServiceImpl::KeepAlive(const KeepAliveRequest* request,
                                    KeepAliveResponse* response) {
-  // TODO(nareshmodi): Automated context_id cleaning is not implemented
-  return errors::Unimplemented(
-      "EagerServiceImpl::KeepAlive is not implemented.");
+  ServerContext* context = nullptr;
+  TF_RETURN_IF_ERROR(GetServerContext(request->context_id(), &context));
+  core::ScopedUnref context_unref(context);
+
+  return Status::OK();
 }
 
 Status EagerServiceImpl::CloseContext(const CloseContextRequest* request,
@@ -304,12 +308,15 @@
     *server_context = nullptr;
     return errors::InvalidArgument(strings::Printf(
         "Unable to find a context_id matching the specified one "
-        "(%lld). Perhaps the worker was restarted?",
+        "(%lld). Perhaps the worker was restarted, or the context was GC'd?",
         context_id));
   }
 
   *server_context = iter->second;
   (*server_context)->Ref();
+
+  (*server_context)->RecordAccess();
+
   return Status::OK();
 }
 
diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl.h b/tensorflow/core/distributed_runtime/eager/eager_service_impl.h
index 718b4e2..2784c5d 100644
--- a/tensorflow/core/distributed_runtime/eager/eager_service_impl.h
+++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl.h
@@ -38,8 +38,41 @@
 // over this (e.g. gRPC).
 class EagerServiceImpl {
  public:
-  explicit EagerServiceImpl(const WorkerEnv* env) : env_(env) {}
+  explicit EagerServiceImpl(const WorkerEnv* env) : env_(env) {
+    gc_thread_.reset(
+        env_->env->StartThread({}, "EagerServiceContextGC", [this]() {
+          while (true) {
+            {
+              mutex_lock l(gc_thread_shutdown_mu_);
+              gc_thread_cv_.wait_for(l, std::chrono::seconds(1));
+
+              if (shutting_down_) {
+                return;
+              }
+            }
+            {
+              mutex_lock l(contexts_mu_);
+              for (auto it = contexts_.begin(); it != contexts_.end();) {
+                if (it->second->IsStale()) {
+                  it->second->Unref();
+                  it = contexts_.erase(it);
+                } else {
+                  it++;
+                }
+              }
+            }
+          }
+        }));
+  }
   virtual ~EagerServiceImpl() {
+    {
+      mutex_lock l(gc_thread_shutdown_mu_);
+      shutting_down_ = true;
+      gc_thread_cv_.notify_all();
+    }
+    gc_thread_.reset();
+
+    mutex_lock l(contexts_mu_);
     for (auto& entry : contexts_) {
       entry.second->Unref();
     }
@@ -71,8 +104,13 @@
   // and the EagerContext).
   class ServerContext : public core::RefCounted {
    public:
-    explicit ServerContext(std::unique_ptr<tensorflow::EagerContext> ctx)
-        : ctx_(std::move(ctx)) {}
+    explicit ServerContext(std::unique_ptr<tensorflow::EagerContext> ctx,
+                           int64 destroy_after_secs, const WorkerEnv* env)
+        : ctx_(std::move(ctx)), env_(env) {
+      destroy_after_micros_ =
+          destroy_after_secs * tensorflow::EnvTime::kSecondsToMicros;
+      RecordAccess();
+    }
     ~ServerContext() {
       for (const auto& entry : tensors_) {
         entry.second->Unref();
@@ -122,6 +160,18 @@
       return Status::OK();
     }
 
+    void RecordAccess() {
+      mutex_lock l(last_accessed_mu_);
+      last_accessed_micros_ = env_->env->NowMicros();
+    }
+
+    bool IsStale() {
+      mutex_lock l(last_accessed_mu_);
+      return (destroy_after_micros_ > 0 &&
+              (env_->env->NowMicros() - last_accessed_micros_) >
+                  destroy_after_micros_);
+    }
+
    private:
     using RemoteTensorHandleMap =
         gtl::FlatMap<RemoteTensorHandleInternal, tensorflow::TensorHandle*,
@@ -131,8 +181,15 @@
     // The context for this execution.
     std::unique_ptr<tensorflow::EagerContext> ctx_;
 
+    // The state related to the context for this execution.
     mutex tensors_mu_;
     RemoteTensorHandleMap tensors_ GUARDED_BY(tensors_mu_);
+
+    const WorkerEnv* const env_;  // Not owned.
+
+    mutex last_accessed_mu_;
+    int64 last_accessed_micros_ GUARDED_BY(last_accessed_mu_);
+    int64 destroy_after_micros_;
   };
   // The returned ServerContext will need to be Unrefed.
   tensorflow::Status GetServerContext(uint64, ServerContext**);
@@ -145,6 +202,11 @@
   mutex contexts_mu_;
   std::unordered_map<uint64, ServerContext*> contexts_ GUARDED_BY(contexts_mu_);
 
+  std::unique_ptr<Thread> gc_thread_;
+  mutex gc_thread_shutdown_mu_;
+  condition_variable gc_thread_cv_;
+  bool shutting_down_ GUARDED_BY(gc_thread_shutdown_mu_) = false;
+
   TF_DISALLOW_COPY_AND_ASSIGN(EagerServiceImpl);
 };
 
diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc b/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc
index d1f2a6d..5c9b33b 100644
--- a/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc
+++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc
@@ -365,6 +365,47 @@
                                                &close_context_response));
 }
 
+TEST_F(EagerServiceImplTest, KeepAliveTest) {
+  TestEagerServiceImpl eager_service_impl(&worker_env_);
+
+  CreateContextRequest request;
+  request.mutable_server_def()->set_job_name("localhost");
+  request.mutable_server_def()->set_task_index(0);
+  request.set_rendezvous_id(random::New64());
+  request.set_keep_alive_secs(3);
+  CreateContextResponse response;
+
+  TF_ASSERT_OK(eager_service_impl.CreateContext(&request, &response));
+
+  worker_env_.env->SleepForMicroseconds(5 *
+                                        tensorflow::EnvTime::kSecondsToMicros);
+
+  KeepAliveRequest keep_alive_request;
+  KeepAliveResponse keep_alive_response;
+
+  keep_alive_request.set_context_id(response.context_id());
+
+  Status status =
+      eager_service_impl.KeepAlive(&keep_alive_request, &keep_alive_response);
+
+  EXPECT_EQ(status.code(), error::INVALID_ARGUMENT);
+  EXPECT_PRED_FORMAT2(::testing::IsSubstring, "Unable to find a context_id",
+                      status.error_message());
+
+  // Create a new context.
+  request.set_rendezvous_id(random::New64());
+  TF_ASSERT_OK(eager_service_impl.CreateContext(&request, &response));
+
+  // The context should not be GC'd.
+  worker_env_.env->SleepForMicroseconds(1 *
+                                        tensorflow::EnvTime::kSecondsToMicros);
+
+  keep_alive_request.set_context_id(response.context_id());
+
+  TF_ASSERT_OK(
+      eager_service_impl.KeepAlive(&keep_alive_request, &keep_alive_response));
+}
+
 }  // namespace
 }  // namespace eager
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
index 8a6903b..bcd46a4 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
@@ -120,27 +120,8 @@
   master_env_.env = env_;
   worker_env_.env = env_;
 
-  SessionOptions sess_opts;
-  ConfigProto config = server_def_.default_session_config();
-  sess_opts.config = config;
-
-  // Configure shared devices between master and worker.
-  string name_prefix =
-      strings::StrCat("/job:", server_def_.job_name(), "/replica:0",
-                      "/task:", server_def_.task_index());
-  TF_RETURN_IF_ERROR(DeviceFactory::AddDevices(sess_opts, name_prefix,
-                                               &master_env_.local_devices));
-  worker_env_.local_devices = master_env_.local_devices;
-  worker_env_.device_mgr = new DeviceMgr(worker_env_.local_devices);
-  worker_env_.rendezvous_mgr = rendezvous_mgr_func == nullptr
-                                   ? new RpcRendezvousMgr(&worker_env_)
-                                   : rendezvous_mgr_func(&worker_env_);
-  string unused;
-  string default_worker_name;
-  if (!DeviceNameUtils::SplitDeviceName(master_env_.local_devices[0]->name(),
-                                        &default_worker_name, &unused)) {
-    return errors::Internal("Could not parse worker name.");
-  }
+  // Check parameters before DeviceFactory::AddDevices,
+  // otherwise if 'task_index=-1' the program will abort.
 
   // Look up the port that has been requested for this task in `server_def_`.
   int requested_port = -1;
@@ -167,6 +148,28 @@
                             "\" was not defined in cluster");
   }
 
+  SessionOptions sess_opts;
+  ConfigProto config = server_def_.default_session_config();
+  sess_opts.config = config;
+
+  // Configure shared devices between master and worker.
+  string name_prefix =
+      strings::StrCat("/job:", server_def_.job_name(), "/replica:0",
+                      "/task:", server_def_.task_index());
+  TF_RETURN_IF_ERROR(DeviceFactory::AddDevices(sess_opts, name_prefix,
+                                               &master_env_.local_devices));
+  worker_env_.local_devices = master_env_.local_devices;
+  worker_env_.device_mgr = new DeviceMgr(worker_env_.local_devices);
+  worker_env_.rendezvous_mgr = rendezvous_mgr_func == nullptr
+                                   ? new RpcRendezvousMgr(&worker_env_)
+                                   : rendezvous_mgr_func(&worker_env_);
+  string unused;
+  string default_worker_name;
+  if (!DeviceNameUtils::SplitDeviceName(master_env_.local_devices[0]->name(),
+                                        &default_worker_name, &unused)) {
+    return errors::Internal("Could not parse worker name.");
+  }
+
   // N.B. The order of initialization here is intricate, because we
   // wish to allow `requested_port == 0` (for choosing any port,
   // mostly for testing). Therefore, the construction of the channel
diff --git a/tensorflow/core/framework/dataset.cc b/tensorflow/core/framework/dataset.cc
index 62a9d57..f3c7189 100644
--- a/tensorflow/core/framework/dataset.cc
+++ b/tensorflow/core/framework/dataset.cc
@@ -74,18 +74,18 @@
 }  // namespace
 
 Status GraphDefBuilderWrapper::AddDataset(
-    const GraphDatasetBase* dataset,
+    const DatasetBase* dataset,
     const std::vector<std::pair<size_t, Node*>>& inputs,
     const std::vector<std::pair<size_t, gtl::ArraySlice<Node*>>>& list_inputs,
     const std::vector<std::pair<StringPiece, AttrValue>>& attrs,
     Node** output) {
-  const string& op_type_name = dataset->op_name();
+  const string& name = dataset->name();
   std::unique_ptr<const GraphDefBuilder::Options> opts(
       new GraphDefBuilder::Options(b_->opts()));
   // TODO(srbs|mrry): Not all datasets have output_types and output_shapes
   // attributes defined. It will be nice to have a consistent pattern.
-  bool has_output_types_attr = HasAttr(op_type_name, "output_types");
-  bool has_output_shapes_attr = HasAttr(op_type_name, "output_shapes");
+  bool has_output_types_attr = HasAttr(name, "output_types");
+  bool has_output_shapes_attr = HasAttr(name, "output_shapes");
   if (has_output_shapes_attr) {
     opts.reset(new GraphDefBuilder::Options(
         opts->WithAttr("output_shapes", dataset->output_shapes())));
@@ -102,8 +102,7 @@
     return errors::Internal("AddDataset: Failed to build Options with error ",
                             opts->StatusToString());
   }
-  NodeBuilder node_builder(opts->GetNameForOp(op_type_name), op_type_name,
-                           opts->op_registry());
+  NodeBuilder node_builder(opts->GetNameForOp(name), name, opts->op_registry());
   {
     size_t total_size = inputs.size() + list_inputs.size();
     auto inputs_iter = inputs.begin();
@@ -128,30 +127,28 @@
   }
   *output = opts->FinalizeBuilder(&node_builder);
   if (*output == nullptr) {
-    return errors::Internal("AddDataset: Failed to build ", op_type_name,
+    return errors::Internal("AddDataset: Failed to build ", name,
                             " op with error ", opts->StatusToString());
   }
   return Status::OK();
 }
 
-Status GraphDefBuilderWrapper::AddFunction(OpKernelContext* ctx,
-                                           const string& function_name) {
+Status GraphDefBuilderWrapper::AddFunction(
+    const FunctionLibraryDefinition& flib_def, const string& function_name) {
   if (b_->HasFunction(function_name)) {
-    LOG(INFO) << "Function with name " << function_name << "already exists in"
-              << " the graph. It will not be added again.";
+    VLOG(1) << "Function with name " << function_name << "already exists in"
+            << " the graph. It will not be added again.";
     return Status::OK();
   }
-  TF_RETURN_IF_ERROR(EnsureFunctionIsStateless(ctx, function_name));
-  const FunctionLibraryDefinition* flib_def =
-      ctx->function_library()->GetFunctionLibraryDefinition();
-  const FunctionDef* f_def = flib_def->Find(function_name);
+  TF_RETURN_IF_ERROR(EnsureFunctionIsStateless(flib_def, function_name));
+  const FunctionDef* f_def = flib_def.Find(function_name);
   if (f_def == nullptr) {
     return errors::InvalidArgument("Unable to find FunctionDef for ",
                                    function_name, " in the registry.");
   }
   FunctionDefLibrary def;
   *def.add_function() = *f_def;
-  const string gradient_func = flib_def->FindGradient(function_name);
+  const string gradient_func = flib_def.FindGradient(function_name);
   if (!gradient_func.empty()) {
     GradientDef* g_def = def.add_gradient();
     g_def->set_function_name(function_name);
@@ -162,19 +159,19 @@
   // Recursively add functions in inputs of function_name.
   for (const NodeDef& node_def : f_def->node_def()) {
     const OpRegistrationData* op_reg_data = nullptr;
-    TF_RETURN_IF_ERROR(flib_def->LookUp(node_def.op(), &op_reg_data));
+    TF_RETURN_IF_ERROR(flib_def.LookUp(node_def.op(), &op_reg_data));
     if (op_reg_data->is_function_op) {
-      TF_RETURN_IF_ERROR(AddFunction(ctx, op_reg_data->op_def.name()));
+      TF_RETURN_IF_ERROR(AddFunction(flib_def, op_reg_data->op_def.name()));
     }
     // Recursively add functions in attrs of this NodeDef.
     for (const auto& pair : node_def.attr()) {
-      TF_RETURN_IF_ERROR(AddAttrFunctions(pair.second, ctx));
+      TF_RETURN_IF_ERROR(AddAttrFunctions(pair.second, flib_def));
     }
   }
 
   // Recursively add functions in attrs of function_name.
   for (auto iter = f_def->attr().begin(); iter != f_def->attr().end(); iter++) {
-    TF_RETURN_IF_ERROR(AddAttrFunctions(iter->second, ctx));
+    TF_RETURN_IF_ERROR(AddAttrFunctions(iter->second, flib_def));
   }
   return Status::OK();
 }
@@ -186,27 +183,32 @@
       b_->opts().WithAttr("dtype", val.dtype()).WithAttr("value", val));
 }
 
-bool GraphDefBuilderWrapper::HasAttr(const string& op_type_name,
+bool GraphDefBuilderWrapper::HasAttr(const string& name,
                                      const string& attr_name) const {
   const OpDef* op_def = nullptr;
-  Status s = b_->opts().op_registry()->LookUpOpDef(op_type_name, &op_def);
+  Status s = b_->opts().op_registry()->LookUpOpDef(name, &op_def);
   if (!s.ok() || op_def == nullptr) {
     return false;
   }
   return HasAttr(op_def, attr_name);
 }
 
-Status GraphDatasetBase::Serialize(OpKernelContext* ctx,
-                                   string* serialized_graph_def,
-                                   string* output_node) const {
+Status DatasetBase::Save(SerializationContext* ctx,
+                         IteratorStateWriter* writer) const {
+  string serialized_graph_def;
+  string output_node;
   GraphDefBuilder b;
   DatasetGraphDefBuilder db(&b);
   Node* node = nullptr;
   TF_RETURN_IF_ERROR(AsGraphDefInternal(ctx, &db, &node));
-  *output_node = node->name();
+  output_node = node->name();
   GraphDef graph_def;
   TF_RETURN_IF_ERROR(b.ToGraphDef(&graph_def));
-  graph_def.SerializeToString(serialized_graph_def);
+  graph_def.SerializeToString(&serialized_graph_def);
+  TF_RETURN_IF_ERROR(
+      writer->WriteScalar(kDatasetGraphKey, serialized_graph_def));
+  TF_RETURN_IF_ERROR(
+      writer->WriteScalar(kDatasetGraphOutputNodeKey, output_node));
   return Status::OK();
 }
 
@@ -266,26 +268,55 @@
   MakeDataset(ctx, input, another_input, output);
 }
 
-const char GraphDatasetBase::kDatasetGraphKey[] = "_DATASET_GRAPH";
-const char GraphDatasetBase::kDatasetGraphOutputNodeKey[] =
+const char DatasetBase::kDatasetGraphKey[] = "_DATASET_GRAPH";
+const char DatasetBase::kDatasetGraphOutputNodeKey[] =
     "_DATASET_GRAPH_OUTPUT_NODE";
 
-namespace dataset {
-
-IteratorContext MakeIteratorContext(OpKernelContext* ctx) {
-  IteratorContext::Params params;
-  params.env = ctx->env();
-  params.runner = *(ctx->runner());
-  params.lib = ctx->function_library();
-  // Note: must use reinterpret_cast because function.h forward-declares Device.
-  DeviceBase* device =
-      reinterpret_cast<DeviceBase*>(ctx->function_library()->device());
-  params.allocator_getter = [device](AllocatorAttributes attrs) {
-    return device->GetAllocator(attrs);
-  };
-  return IteratorContext(params);
+BackgroundWorker::BackgroundWorker(Env* env, const string& name) {
+  thread_.reset(env->StartThread({} /* thread_options */, name,
+                                 [this]() { WorkerLoop(); }));
 }
 
-}  // namespace dataset
+BackgroundWorker::~BackgroundWorker() {
+  {
+    mutex_lock l(mu_);
+    cancelled_ = true;
+  }
+  cond_var_.notify_one();
+  // Block until the background thread has terminated.
+  //
+  // NOTE(mrry): We explicitly free and join the thread here because
+  // `WorkerLoop()` uses other members of this object, and so we must join
+  // the thread before destroying them.
+  thread_.reset();
+}
+
+void BackgroundWorker::Schedule(std::function<void()> work_item) {
+  {
+    mutex_lock l(mu_);
+    work_queue_.push_back(std::move(work_item));
+  }
+  cond_var_.notify_one();
+}
+
+void BackgroundWorker::WorkerLoop() {
+  while (true) {
+    std::function<void()> work_item = nullptr;
+    {
+      mutex_lock l(mu_);
+      while (!cancelled_ && work_queue_.empty()) {
+        cond_var_.wait(l);
+      }
+      if (cancelled_) {
+        return;
+      }
+      DCHECK(!work_queue_.empty());
+      work_item = std::move(work_queue_.front());
+      work_queue_.pop_front();
+    }
+    DCHECK(work_item != nullptr);
+    work_item();
+  }
+}
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/dataset.h b/tensorflow/core/framework/dataset.h
index 8cf84af..e0c26d9 100644
--- a/tensorflow/core/framework/dataset.h
+++ b/tensorflow/core/framework/dataset.h
@@ -15,6 +15,7 @@
 #ifndef TENSORFLOW_CORE_FRAMEWORK_DATASET_H_
 #define TENSORFLOW_CORE_FRAMEWORK_DATASET_H_
 
+#include <deque>
 #include <memory>
 
 #include "tensorflow/core/framework/attr_value.pb.h"
@@ -39,6 +40,8 @@
 
 namespace tensorflow {
 
+class DatasetBase;
+
 // Interface for reading values from a key-value store.
 // Used for restoring iterator state.
 class IteratorStateReader {
@@ -65,7 +68,6 @@
 // Forward declarations to avoid introducing a dependency on headers in
 // "tensorflow/core/graph/...".
 class GraphDefBuilder;
-class GraphDatasetBase;
 class Node;
 
 // Wrapper around GraphDefBuilder. Used to serialize Dataset graph.
@@ -119,7 +121,7 @@
     return Status::OK();
   }
 
-  Status AddDataset(const GraphDatasetBase* dataset,
+  Status AddDataset(const DatasetBase* dataset,
                     const std::vector<Node*>& inputs, Node** output) {
     return AddDataset(dataset, inputs, {}, output);
   }
@@ -132,7 +134,7 @@
   // `*output` contains a pointer to the output `Node`. It is guaranteed to be
   // non-null if the method returns with an OK status.
   // The returned Node pointer is owned by the backing Graph of GraphDefBuilder.
-  Status AddDataset(const GraphDatasetBase* dataset,
+  Status AddDataset(const DatasetBase* dataset,
                     const std::vector<Node*>& inputs,
                     const std::vector<std::pair<StringPiece, AttrValue>>& attrs,
                     Node** output) {
@@ -144,7 +146,7 @@
   }
 
   Status AddDataset(
-      const GraphDatasetBase* dataset,
+      const DatasetBase* dataset,
       const std::vector<std::pair<size_t, Node*>>& inputs,
       const std::vector<std::pair<size_t, gtl::ArraySlice<Node*>>>& list_inputs,
       const std::vector<std::pair<StringPiece, AttrValue>>& attrs,
@@ -156,7 +158,8 @@
   // name `function_name` is not found in the FunctionLibraryDefinition, returns
   // an InvalidArgumentError. If the function with name `function_name` or any
   // of its dependent functions are stateful, returns an InvalidArgument error.
-  Status AddFunction(OpKernelContext* ctx, const string& function_name);
+  Status AddFunction(const FunctionLibraryDefinition& flib_def,
+                     const string& function_name);
 
   template <typename T>
   void BuildAttrValue(const T& value, AttrValue* attr) {
@@ -166,18 +169,16 @@
  private:
   void AddTensorInternal(const Tensor& val, Node** output);
 
-  Status EnsureFunctionIsStateless(OpKernelContext* ctx,
+  Status EnsureFunctionIsStateless(const FunctionLibraryDefinition& flib_def,
                                    const string& function_name) const {
-    const FunctionLibraryDefinition* lib_def =
-        ctx->function_library()->GetFunctionLibraryDefinition();
-    const FunctionDef* function_def = lib_def->Find(function_name);
+    const FunctionDef* function_def = flib_def.Find(function_name);
     if (!function_def) {
       return errors::InvalidArgument("Unable to find FunctionDef for ",
                                      function_name, " in registry.");
     }
     for (const NodeDef& node_def : function_def->node_def()) {
       const OpDef* op_def;
-      TF_RETURN_IF_ERROR(lib_def->LookUpOpDef(node_def.op(), &op_def));
+      TF_RETURN_IF_ERROR(flib_def.LookUpOpDef(node_def.op(), &op_def));
       // TODO(b/65524810): Hack to allow functions to capture Dataset op
       // nodes needed for FlatMap. Currently, source datasets nodes have been
       // marked stateful to avoid constant folding since we do not have a
@@ -219,12 +220,13 @@
     return false;
   }
 
-  Status AddAttrFunctions(const AttrValue& attr_value, OpKernelContext* ctx) {
+  Status AddAttrFunctions(const AttrValue& attr_value,
+                          const FunctionLibraryDefinition& flib_def) {
     if (attr_value.has_func()) {
-      TF_RETURN_IF_ERROR(AddFunction(ctx, attr_value.func().name()));
+      TF_RETURN_IF_ERROR(AddFunction(flib_def, attr_value.func().name()));
     } else if (attr_value.has_list()) {
       for (const NameAttrList& name_attr_list : attr_value.list().func()) {
-        TF_RETURN_IF_ERROR(AddFunction(ctx, name_attr_list.name()));
+        TF_RETURN_IF_ERROR(AddFunction(flib_def, name_attr_list.name()));
       }
     }
     return Status::OK();
@@ -235,21 +237,17 @@
 
 class StatsAggregator;
 
-// A cut-down version of OpKernelContext for running computations in
-// iterators. Note that we cannot simply use OpKernelContext here
-// because we might run computation in an iterator whose lifetime is
-// not nested within the lifetime of a single OpKernelContext
-// (e.g. asynchronous prefetching).
+// A cut-down version of `OpKernelContext` for running computations in
+// iterators. Note that we cannot simply use `OpKernelContext` here because we
+// might run computation in an iterator whose lifetime is not nested within the
+// lifetime of a single `OpKernelContext` (e.g. asynchronous prefetching).
 //
-// TODO(mrry): We will probably need to support more of
-// OpKernelContext here. For example, should allocation be handled by
-// the IteratorContext?
-// TODO(mrry): We're making some daring assumptions about the lifetime
-// of the runner passed in here. A runner will be deleted when the original
-// step ends, but all existing runners only close over session-lifetime (or
-// longer-lived) state, so we can make a copy of the function. There's nothing
-// in the definition of the API from which we took the runner to guarantee that
-// what we are doing is safe. We should formalize the properties here.
+// TODO(mrry): We're making some daring assumptions about the lifetime of the
+// runner passed in here. A runner will be deleted when the original step ends,
+// but all existing runners only close over session-lifetime (or longer-lived)
+// state, so we can make a copy of the function. There's nothing in the
+// definition of the API from which we took the runner to guarantee that what we
+// are doing is safe. We should formalize the properties here.
 class IteratorContext {
  public:
   struct Params {
@@ -279,6 +277,19 @@
 
   explicit IteratorContext(Params params) : params_(std::move(params)) {}
 
+  explicit IteratorContext(OpKernelContext* ctx) {
+    params_.env = ctx->env();
+    params_.runner = *(ctx->runner());
+    params_.lib = ctx->function_library();
+    // NOTE: must use reinterpret_cast because function.h forward-declares
+    // Device.
+    DeviceBase* device =
+        reinterpret_cast<DeviceBase*>(ctx->function_library()->device());
+    params_.allocator_getter = [device](AllocatorAttributes attrs) {
+      return device->GetAllocator(attrs);
+    };
+  }
+
   Env* env() const { return params_.env; }
 
   std::function<void(std::function<void()>)>* runner() {
@@ -317,6 +328,23 @@
   Params params_;
 };
 
+// Aggregates runtime support needed for dataset and iterator serialization.
+class SerializationContext {
+ public:
+  struct Params {
+    const FunctionLibraryDefinition* flib_def;  // Not owned.
+  };
+
+  explicit SerializationContext(Params params) : params_(std::move(params)) {}
+
+  const FunctionLibraryDefinition& flib_def() { return *params_.flib_def; }
+
+ private:
+  Params params_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(SerializationContext);
+};
+
 // Represents the current position in a range of outputs, where the
 // range of outputs is typically represented by an `DatasetBase`,
 // defined below.
@@ -341,6 +369,11 @@
   virtual Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
                          bool* end_of_sequence) = 0;
 
+  Status GetNext(IteratorContext&& ctx, std::vector<Tensor>* out_tensors,
+                 bool* end_of_sequence) {
+    return GetNext(&ctx, out_tensors, end_of_sequence);
+  }
+
   // Returns a vector of DataType values, representing the respective
   // element types of each tuple component in the outputs of this
   // iterator.
@@ -356,7 +389,7 @@
   virtual Status Initialize(IteratorContext* ctx) { return Status::OK(); }
 
   // Saves the state of this iterator.
-  virtual Status Save(OpKernelContext* ctx, IteratorStateWriter* writer) {
+  virtual Status Save(SerializationContext* ctx, IteratorStateWriter* writer) {
     return SaveInternal(writer);
   }
 
@@ -367,19 +400,17 @@
 
  protected:
   // This is needed so that sub-classes of IteratorBase can call
-  // `SaveInternal` on their parent iterators, e.g., in
-  // `RepeatDatasetOp::Dataset`.
-  Status SaveParent(IteratorStateWriter* writer,
-                    const std::unique_ptr<IteratorBase>& parent) {
-    return parent->SaveInternal(writer);
+  // `SaveInternal` on their input iterators.
+  Status SaveInput(IteratorStateWriter* writer,
+                   const std::unique_ptr<IteratorBase>& input) {
+    return input->SaveInternal(writer);
   }
 
   // This is needed so that sub-classes of IteratorBase can call
-  // `RestoreInternal` on their parent iterators, e.g., in
-  // `RepeatDatasetOp::Dataset`.
-  Status RestoreParent(IteratorContext* ctx, IteratorStateReader* reader,
-                       const std::unique_ptr<IteratorBase>& parent) {
-    return parent->RestoreInternal(ctx, reader);
+  // `RestoreInternal` on their input iterators.
+  Status RestoreInput(IteratorContext* ctx, IteratorStateReader* reader,
+                      const std::unique_ptr<IteratorBase>& input) {
+    return input->RestoreInternal(ctx, reader);
   }
 
   // Saves the state of this iterator recursively.
@@ -394,10 +425,40 @@
   }
 };
 
+// Represents runtime information needed to construct a dataset.
+class DatasetContext {
+ public:
+  struct Params {
+    string name;
+  };
+
+  explicit DatasetContext(Params params) : params_(std::move(params)) {}
+
+  explicit DatasetContext(OpKernelContext* ctx) {
+    params_.name = ctx->op_kernel().type_string();
+  }
+
+  const string& name() const { return params_.name; }
+
+ private:
+  Params params_;
+};
+
 // Represents a (potentially infinite) range of outputs, where each
 // output is a tuple of tensors.
 class DatasetBase : public core::RefCounted {
  public:
+  // Key for storing the Dataset graph in the serialized format.
+  TF_EXPORT static const char kDatasetGraphKey[];
+
+  // Key for storing the output node of the Dataset graph in the serialized
+  // format.
+  TF_EXPORT static const char kDatasetGraphOutputNodeKey[];
+
+  explicit DatasetBase(DatasetContext&& ctx) : name_(ctx.name()) {}
+
+  const string& name() const { return name_; }
+
   // Returns a new iterator for iterating over the range of elements in
   // this dataset.
   //
@@ -414,6 +475,11 @@
     return (*iterator)->Initialize(ctx);
   }
 
+  Status MakeIterator(IteratorContext&& ctx, const string& prefix,
+                      std::unique_ptr<IteratorBase>* iterator) const {
+    return MakeIterator(&ctx, prefix, iterator);
+  }
+
   // Returns a vector of DataType values, representing the respective
   // element types of each tuple component in the outputs of this
   // dataset.
@@ -428,76 +494,34 @@
   virtual string DebugString() const = 0;
 
   // Serializes the dataset and writes it to the `writer`.
-  virtual Status Save(OpKernelContext* ctx, IteratorStateWriter* writer) const {
-    return errors::Unimplemented("DatasetBase::Save");
-  }
+  virtual Status Save(SerializationContext* ctx,
+                      IteratorStateWriter* writer) const;
 
  protected:
-  // TODO(srbs): Ideally all graph related logic should reside in
-  // GraphDatasetBase. However, that would require Datasets defined in all ops
-  // to derive from GraphDatasetBase. Once that is done we can move
-  // DatasetGraphDefBuilder and AsGraphDefInternal to GraphDatasetBase.
   class DatasetGraphDefBuilder : public GraphDefBuilderWrapper {
    public:
     DatasetGraphDefBuilder(GraphDefBuilder* b) : GraphDefBuilderWrapper(b) {}
-    Status AddParentDataset(OpKernelContext* ctx, const DatasetBase* dataset,
-                            Node** output) {
+    Status AddInputDataset(SerializationContext* ctx,
+                           const DatasetBase* dataset, Node** output) {
       return dataset->AsGraphDefInternal(ctx, this, output);
     }
   };
 
-  virtual Status AsGraphDefInternal(OpKernelContext* ctx,
+  // TODO(jsimsa): Consolidate overloading into a single method.
+  virtual Status AsGraphDefInternal(SerializationContext* ctx,
                                     DatasetGraphDefBuilder* b,
-                                    Node** node) const {
-    return AsGraphDefInternal(b, node);
-  }
-
-  virtual Status AsGraphDefInternal(DatasetGraphDefBuilder* b,
-                                    Node** node) const {
-    return errors::Unimplemented("AsGraphDefInternal");
-  }
+                                    Node** node) const = 0;
 
   virtual std::unique_ptr<IteratorBase> MakeIteratorInternal(
       const string& prefix) const = 0;
 
   friend class DatasetToGraphOp;  // For access to graph related members.
-};
-
-// Base-class for datasets that are built by ops.
-class GraphDatasetBase : public DatasetBase {
- public:
-  GraphDatasetBase(OpKernelContext* ctx)
-      : op_name_(ctx->op_kernel().type_string()) {}
-
-  const string op_name() const { return op_name_; }
-
-  Status Save(OpKernelContext* ctx,
-              IteratorStateWriter* writer) const override {
-    string serialized_graph_def;
-    string output_node;
-    TF_RETURN_IF_ERROR(Serialize(ctx, &serialized_graph_def, &output_node));
-    TF_RETURN_IF_ERROR(
-        writer->WriteScalar(kDatasetGraphKey, serialized_graph_def));
-    TF_RETURN_IF_ERROR(
-        writer->WriteScalar(kDatasetGraphOutputNodeKey, output_node));
-    return Status::OK();
-  }
-
-  // Key for storing the Dataset graph in the serialized format.
-  TF_EXPORT static const char kDatasetGraphKey[];
-
-  // Key for storing the output node of the Dataset graph in the serialized
-  // format.
-  TF_EXPORT static const char kDatasetGraphOutputNodeKey[];
 
  private:
-  Status Serialize(OpKernelContext* ctx, string* serialized_graph_def,
-                   string* output_node) const;
-
-  const string op_name_;
+  const string name_;
 };
 
-// Represents an iterator that is associated with a particular parent dataset.
+// Represents an iterator that is associated with a particular dataset.
 class DatasetBaseIterator : public IteratorBase {
  public:
   struct BaseParams {
@@ -540,7 +564,7 @@
     return s;
   }
 
-  Status Save(OpKernelContext* ctx, IteratorStateWriter* writer) final {
+  Status Save(SerializationContext* ctx, IteratorStateWriter* writer) final {
     TF_RETURN_IF_ERROR(params_.dataset->Save(ctx, writer));
     return IteratorBase::Save(ctx, writer);
   }
@@ -559,13 +583,13 @@
   BaseParams params_;
 };
 
-// Represents an iterator that is associated with a particular parent dataset
+// Represents an iterator that is associated with a particular dataset
 // with a particular type.
 template <class DatasetType>
 class DatasetIterator : public DatasetBaseIterator {
  public:
   struct Params {
-    // Borrowed pointer to the parent dataset.
+    // Borrowed pointer to the dataset.
     const DatasetType* dataset;
 
     // Identifies the sequence of iterators leading up to this iterator.
@@ -671,11 +695,36 @@
 // The ownership of `dataset` is transferred to `tensor`.
 Status StoreDatasetInVariantTensor(DatasetBase* dataset, Tensor* tensor);
 
-namespace dataset {
+// A simple background worker that executes closures asynchronously and without
+// blocking.
+//
+// A `BackgroundWorker` is used to offload blocking work from an `AsyncOpKernel`
+// to avoid blocking an executor thread that may be required by the blocking
+// work.
+//
+// NOTE(mrry): We do not use a regular `tensorflow::thread::ThreadPool` for this
+// purpose because its current implementation (in Eigen) uses a finite-length
+// queue and will block the caller when full. This can lead to deadlock under
+// heavy load. Since the number of concurrent work items in each user of a
+// `BackgroundWorker` is at most one per op invocation, the dynamic allocation
+// overhead is tolerable.
+class BackgroundWorker {
+ public:
+  BackgroundWorker(Env* env, const string& name);
 
-IteratorContext MakeIteratorContext(OpKernelContext* ctx);
+  ~BackgroundWorker();
 
-}  // namespace dataset
+  void Schedule(std::function<void()> work_item);
+
+ private:
+  void WorkerLoop();
+
+  std::unique_ptr<Thread> thread_;
+  mutex mu_;
+  condition_variable cond_var_;
+  bool cancelled_ GUARDED_BY(mu_) = false;
+  std::deque<std::function<void()>> work_queue_ GUARDED_BY(mu_);
+};
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/framework/function.cc b/tensorflow/core/framework/function.cc
index 57bcc0f..6b92e10 100644
--- a/tensorflow/core/framework/function.cc
+++ b/tensorflow/core/framework/function.cc
@@ -920,10 +920,12 @@
 
 FunctionLibraryDefinition::FunctionLibraryDefinition(
     const FunctionLibraryDefinition& other)
-    : default_registry_(other.default_registry_), func_grad_(other.func_grad_) {
+    : default_registry_(other.default_registry_) {
+  tf_shared_lock l(other.mu_);
   for (const auto& it : other.function_defs_) {
     TF_CHECK_OK(AddFunctionDef(it.second->fdef));
   }
+  func_grad_ = other.func_grad_;
 }
 
 FunctionLibraryDefinition::FunctionLibraryDefinition(
@@ -943,8 +945,19 @@
 
 FunctionLibraryDefinition::~FunctionLibraryDefinition() {}
 
-const FunctionDef* FunctionLibraryDefinition::Find(const string& name) const {
-  auto iter = function_defs_.find(name);
+bool FunctionLibraryDefinition::Contains(const string& func) const {
+  tf_shared_lock l(mu_);
+  return function_defs_.find(func) != function_defs_.end();
+}
+
+const FunctionDef* FunctionLibraryDefinition::Find(const string& func) const {
+  tf_shared_lock l(mu_);
+  return FindHelper(func);
+}
+
+const FunctionDef* FunctionLibraryDefinition::FindHelper(
+    const string& func) const {
+  auto iter = function_defs_.find(func);
   if (iter == function_defs_.end()) {
     return nullptr;
   } else {
@@ -953,6 +966,7 @@
 }
 
 Status FunctionLibraryDefinition::AddFunctionDef(const FunctionDef& fdef) {
+  mutex_lock l(mu_);
   bool added;
   return AddFunctionDefHelper(fdef, &added);
 }
@@ -984,6 +998,7 @@
 }
 
 Status FunctionLibraryDefinition::AddGradientDef(const GradientDef& grad) {
+  mutex_lock l(mu_);
   bool added;
   return AddGradientDefHelper(grad, &added);
 }
@@ -1009,13 +1024,17 @@
 
 Status FunctionLibraryDefinition::AddLibrary(
     const FunctionLibraryDefinition& other) {
+  // Clone `other` to ensure thread-safety (grabbing `other`'s lock for
+  // the duration of the function could lead to deadlock).
+  FunctionLibraryDefinition clone(other);
+  mutex_lock l(mu_);
   // Remember the funcs and grads that we added successfully so that
   // we can roll them back on error.
   std::vector<string> funcs;
   std::vector<string> funcs_with_grads;
   Status s;
   bool added;
-  for (auto iter : other.function_defs_) {
+  for (auto iter : clone.function_defs_) {
     s = AddFunctionDefHelper(iter.second->fdef, &added);
     if (!s.ok()) {
       Remove(funcs, funcs_with_grads);
@@ -1025,7 +1044,7 @@
       funcs.push_back(iter.second->fdef.signature().name());
     }
   }
-  for (auto iter : other.func_grad_) {
+  for (auto iter : clone.func_grad_) {
     GradientDef grad;
     grad.set_function_name(iter.first);
     grad.set_gradient_func(iter.second);
@@ -1045,6 +1064,7 @@
     const FunctionDefLibrary& lib_def) {
   // Remember the funcs and grads that we added successfully so that
   // we can roll them back on error.
+  mutex_lock l(mu_);
   std::vector<string> funcs;
   std::vector<string> funcs_with_grads;
   Status s;
@@ -1072,6 +1092,15 @@
   return Status::OK();
 }
 
+Status FunctionLibraryDefinition::ReplaceFunction(const string& func,
+                                                  const FunctionDef& fdef) {
+  mutex_lock l(mu_);
+  bool added;
+  TF_RETURN_IF_ERROR(RemoveFunction(func));
+  TF_RETURN_IF_ERROR(AddFunctionDefHelper(fdef, &added));
+  return Status::OK();
+}
+
 Status FunctionLibraryDefinition::RemoveFunction(const string& func) {
   const auto& i = function_defs_.find(func);
   if (i == function_defs_.end()) {
@@ -1106,11 +1135,17 @@
 }
 
 string FunctionLibraryDefinition::FindGradient(const string& func) const {
+  tf_shared_lock l(mu_);
+  return gtl::FindWithDefault(func_grad_, func, "");
+}
+
+string FunctionLibraryDefinition::FindGradientHelper(const string& func) const {
   return gtl::FindWithDefault(func_grad_, func, "");
 }
 
 Status FunctionLibraryDefinition::LookUp(
     const string& op, const OpRegistrationData** op_reg_data) const {
+  tf_shared_lock l(mu_);
   auto iter = function_defs_.find(op);
   if (iter != function_defs_.end()) {
     *op_reg_data = &iter->second->op_registration_data;
@@ -1134,18 +1169,22 @@
     return nullptr;
   }
   const string& func_name = forward_func_attrs->name();
-  const string& grad_name = FindGradient(func_name);
-  // If 'func' has a user-defined gradient function, uses the grad
-  // function's attrs to see if noinline is specified. Otherwise,
-  // uses func's attrs.
-  if (!grad_name.empty()) {
-    return Find(grad_name);
+  {
+    tf_shared_lock l(mu_);
+    const string& grad_name = FindGradientHelper(func_name);
+    // If 'func' has a user-defined gradient function, uses the grad
+    // function's attrs to see if noinline is specified. Otherwise,
+    // uses func's attrs.
+    if (!grad_name.empty()) {
+      return FindHelper(grad_name);
+    }
+    return FindHelper(func_name);
   }
-  return Find(func_name);
 }
 
 FunctionDefLibrary FunctionLibraryDefinition::ToProto() const {
   FunctionDefLibrary lib;
+  tf_shared_lock l(mu_);
   for (const auto& f : function_defs_) {
     *lib.add_function() = f.second->fdef;
   }
diff --git a/tensorflow/core/framework/function.h b/tensorflow/core/framework/function.h
index 5da9af7..edb7ed0 100644
--- a/tensorflow/core/framework/function.h
+++ b/tensorflow/core/framework/function.h
@@ -28,6 +28,7 @@
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/protobuf.h"
 
 namespace tensorflow {
@@ -40,7 +41,7 @@
 class ResourceMgr;
 class Rendezvous;
 class ScopedStepContainer;
-class StepStatsCollector;
+class StepStatsCollectorInterface;
 class Node;
 
 // FunctionDefHelper::Create is a convenient helper to construct a
@@ -288,8 +289,11 @@
 
 // Helper to maintain a map between function names in a given
 // FunctionDefLibrary and function definitions.
+//
+// This class is thread-safe.
 class FunctionLibraryDefinition : public OpRegistryInterface {
  public:
+  // Note: This constructor grabs `lib_def`'s lock in shared mode.
   explicit FunctionLibraryDefinition(const FunctionLibraryDefinition& lib_def);
   FunctionLibraryDefinition(const OpRegistryInterface* default_registry,
                             const FunctionDefLibrary& lib_def);
@@ -298,9 +302,15 @@
   FunctionLibraryDefinition& operator=(const FunctionLibraryDefinition&) =
       delete;
 
+  // Returns true if the library contains `func`, false otherwise.
+  bool Contains(const string& func) const;
+
   // Returns nullptr if "func" is not defined in "lib_def". Otherwise,
   // returns its definition proto.
-  const FunctionDef* Find(const string& func) const;
+  //
+  // NB: This function returns a borrowed pointer, which can be invalidated by a
+  // subsequent call to `ReplaceFunction()` with the given name.
+  const FunctionDef* Find(const string& func) const LOCKS_EXCLUDED(mu_);
 
   // Adds function definition 'fdef' to this function library.
   // Returns status 'ok' on success, or error otherwise. This is a no-op if
@@ -308,45 +318,45 @@
   // If 'fdef' is successfully added to the library, it will be accessible
   // from 'LookUp' and included in the proto returned by 'ToProto'.
   // This operation is atomic.
-  Status AddFunctionDef(const FunctionDef& fdef);
+  Status AddFunctionDef(const FunctionDef& fdef) LOCKS_EXCLUDED(mu_);
 
   // Adds gradient definition 'grad' to this function library.
   // This is a no-op if 'grad' already exists in this function library.
   // If 'grad' is successfully added, it will be accessible via 'FindGradient'
   // and included in the proto returned by 'ToProto'.
   // This operation is atomic.
-  Status AddGradientDef(const GradientDef& grad);
+  Status AddGradientDef(const GradientDef& grad) LOCKS_EXCLUDED(mu_);
 
-  // Remove function `func` from the library. Returns non-OK Status unless
-  // `func` is in the library.
-  Status RemoveFunction(const string& func);
-
-  // Remove gradient of function `func` from the library. Returns non-OK Status
-  // unless `func` has a gradient.
-  Status RemoveGradient(const string& func);
+  // Replaces the function corresponding to `func` with `fdef`. Returns
+  // a non-OK status if `func` was not found in the library, OK otherwise.
+  Status ReplaceFunction(const string& func, const FunctionDef& fdef);
 
   // Adds the functions and gradients in 'other' to this function library.
   // Duplicate functions and gradients are ignored.
   // This operation is atomic.
-  Status AddLibrary(const FunctionLibraryDefinition& other);
+  Status AddLibrary(const FunctionLibraryDefinition& other) LOCKS_EXCLUDED(mu_);
 
   // Adds the functions and gradients in 'lib_def' to this function library.
   // Duplicate functions and gradients are ignored.
   // This operation is atomic.
-  Status AddLibrary(const FunctionDefLibrary& lib_def);
+  Status AddLibrary(const FunctionDefLibrary& lib_def) LOCKS_EXCLUDED(mu_);
 
   // If the gradient function for 'func' is specified explicitly in
   // the library, returns the gradient function name.  Otherwise,
   // returns an empty string.
-  string FindGradient(const string& func) const;
+  string FindGradient(const string& func) const LOCKS_EXCLUDED(mu_);
 
   // OpRegistryInterface method. Useful for constructing a Graph.
   //
   // If "op" is defined in the library, returns its signature.
   // Otherwise, assume "op" is a primitive op and returns its op
   // signature and shape inference function.
+  //
+  // NB: This function outputs a borrowed pointer, which can be invalidated by a
+  // subsequent call to `ReplaceFunction()` with the given name.
   Status LookUp(const string& op_type_name,
-                const OpRegistrationData** op_reg_data) const override;
+                const OpRegistrationData** op_reg_data) const override
+      LOCKS_EXCLUDED(mu_);
 
   // Ops created for function arguments bear the name given by `kArgOp`; those
   // created for return values bear the name given by `kRetOp`.
@@ -370,9 +380,12 @@
   Status GetAttr(const Node& node, const string& attr, T* value) const;
 
   // Returns a proto representation of the state of this function library.
-  FunctionDefLibrary ToProto() const;
+  FunctionDefLibrary ToProto() const LOCKS_EXCLUDED(mu_);
 
-  size_t num_functions() const { return function_defs_.size(); }
+  size_t num_functions() const {
+    tf_shared_lock l(mu_);
+    return function_defs_.size();
+  }
 
   const OpRegistryInterface* default_registry() const {
     return default_registry_;
@@ -388,24 +401,42 @@
     OpRegistrationData op_registration_data;
   };
 
+  const FunctionDef* FindHelper(const string& func) const
+      SHARED_LOCKS_REQUIRED(mu_);
+  string FindGradientHelper(const string& func) const
+      SHARED_LOCKS_REQUIRED(mu_);
+
   // Same as AddFunctionDef/AddGradientDef except these methods set
   // `added` to true if the `fdef`/`grad` were actually added to this.
-  Status AddFunctionDefHelper(const FunctionDef& fdef, bool* added);
-  Status AddGradientDefHelper(const GradientDef& grad, bool* added);
+  Status AddFunctionDefHelper(const FunctionDef& fdef, bool* added)
+      EXCLUSIVE_LOCKS_REQUIRED(mu_);
+  Status AddGradientDefHelper(const GradientDef& grad, bool* added)
+      EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
+  mutable mutex mu_;
   const OpRegistryInterface* const default_registry_;
   gtl::FlatMap<string, std::unique_ptr<FunctionDefAndOpRegistration>>
-      function_defs_;
-  gtl::FlatMap<string, string> func_grad_;
+      function_defs_ GUARDED_BY(mu_);
+  gtl::FlatMap<string, string> func_grad_ GUARDED_BY(mu_);
 
   // Helper function for GetAttr. Returns the FunctionDef* to get the
   // attr from.
-  const FunctionDef* GetAttrImpl(const NodeDef& ndef) const;
+  const FunctionDef* GetAttrImpl(const NodeDef& ndef) const LOCKS_EXCLUDED(mu_);
 
-  // Remove all functions in `funcs` and all gradients of
-  // functions in `funcs_with_grads` from this library.
+  // Remove all functions in `funcs` and all gradients of functions in
+  // `funcs_with_grads` from this library.
   void Remove(const std::vector<string>& funcs,
-              const std::vector<string>& funcs_with_grads);
+              const std::vector<string>& funcs_with_grads)
+      EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
+  // Remove `func` from the library. Returns non-OK Status unless `func` is in
+  // the library. This should only be called when there is a guarantee that the
+  // function being removed hasn't been retrieved with `Find`.
+  Status RemoveFunction(const string& func) EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
+  // Remove gradient of function `func` from the library. Returns non-OK Status
+  // unless `func` has a gradient.
+  Status RemoveGradient(const string& func) EXCLUSIVE_LOCKS_REQUIRED(mu_);
 };
 
 // Forward declare. Defined in common_runtime/function.h
@@ -456,7 +487,7 @@
 
     // This interface is EXPERIMENTAL and subject to change.
     //
-    // Instatiates the function using an executor of the given type. If empty,
+    // Instantiates the function using an executor of the given type. If empty,
     // the default TensorFlow executor will be used.
     string executor_type;
   };
@@ -496,7 +527,7 @@
     CancellationManager* cancellation_manager = nullptr;
     CollectiveExecutor* collective_executor = nullptr;
     ScopedStepContainer* step_container = nullptr;
-    StepStatsCollector* stats_collector = nullptr;
+    StepStatsCollectorInterface* stats_collector = nullptr;
 
     std::function<void(std::function<void()>)>* runner = nullptr;
 
diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h
index aab95b7..e752599 100644
--- a/tensorflow/core/framework/op_kernel.h
+++ b/tensorflow/core/framework/op_kernel.h
@@ -70,7 +70,7 @@
 class ResourceMgr;
 class ScopedStepContainer;
 class CollectiveExecutor;
-class StepStatsCollector;
+class StepStatsCollectorInterface;
 
 class OpKernel {
  public:
@@ -569,7 +569,7 @@
     CallFrameInterface* call_frame = nullptr;
     FunctionLibraryRuntime* function_library = nullptr;
     std::function<void(std::function<void()>)>* runner = nullptr;
-    StepStatsCollector* stats_collector = nullptr;
+    StepStatsCollectorInterface* stats_collector = nullptr;
 
     // TensorSliceReaderCache support.
     checkpoint::TensorSliceReaderCacheWrapper* slice_reader_cache = nullptr;
@@ -984,7 +984,7 @@
   std::function<void(std::function<void()>)>* runner() const {
     return params_->runner;
   }
-  StepStatsCollector* stats_collector() const {
+  StepStatsCollectorInterface* stats_collector() const {
     return params_->stats_collector;
   }
 
diff --git a/tensorflow/core/framework/shape_inference.cc b/tensorflow/core/framework/shape_inference.cc
index 8d597e1..3e77028 100644
--- a/tensorflow/core/framework/shape_inference.cc
+++ b/tensorflow/core/framework/shape_inference.cc
@@ -950,8 +950,7 @@
     *val = t->scalar<int64>()();
     return Status::OK();
   } else {
-    return errors::InvalidArgument(
-        "Scalar input for dim size must be int32 or int64");
+    return errors::InvalidArgument("Scalar input must be int32 or int64.");
   }
 }
 
diff --git a/tensorflow/core/framework/tensor.cc b/tensorflow/core/framework/tensor.cc
index 5f805f6..a82beb7 100644
--- a/tensorflow/core/framework/tensor.cc
+++ b/tensorflow/core/framework/tensor.cc
@@ -919,7 +919,13 @@
   // We have reached the right-most dimension of the tensor.
   if (dim_index == shape_size - 1) {
     for (int64 i = 0; i < element_count; i++) {
-      if (*data_index >= limit) return;
+      if (*data_index >= limit) {
+        // If not enough elements have been printed, append "...".
+        if (dim_index != 0 && i < element_count) {
+          strings::StrAppend(result, "...");
+        }
+        return;
+      }
       if (i > 0) strings::StrAppend(result, " ");
       strings::StrAppend(result, PrintOneElement(data[(*data_index)++]));
     }
diff --git a/tensorflow/core/framework/tensor_test.cc b/tensorflow/core/framework/tensor_test.cc
index 80e168d..84a373c 100644
--- a/tensorflow/core/framework/tensor_test.cc
+++ b/tensorflow/core/framework/tensor_test.cc
@@ -1260,6 +1260,13 @@
   EXPECT_EQ("", x.SummarizeValue(16));
 }
 
+TEST(SummarizeValue, INT32Dims) {
+  Tensor x = MkTensor<int>(DT_INT32, TensorShape({3, 4}),
+                           {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  EXPECT_EQ("[1 2 3...]...", x.SummarizeValue(3));
+  EXPECT_EQ("[1 2 3 4][5 6 7 8][9 10...]...", x.SummarizeValue(10));
+}
+
 TEST(SummarizeValue, FLOAT) {
   Tensor x = MkTensor<float>(DT_FLOAT, TensorShape({5}), {1, 2, 3, 4, 0});
   EXPECT_EQ("1 2 3 4 0", x.SummarizeValue(16));
diff --git a/tensorflow/core/graph/gradients.cc b/tensorflow/core/graph/gradients.cc
index c1a8a63..bec4171 100644
--- a/tensorflow/core/graph/gradients.cc
+++ b/tensorflow/core/graph/gradients.cc
@@ -65,16 +65,37 @@
 static Node* AddZerosLike(Graph* g, NodeOut input) {
   DCHECK_LT(0, input.dtype());
   DCHECK_LT(input.dtype(), DT_FLOAT_REF);
-  NodeDef ndef;
-  ndef.set_name(g->NewName(kNodeLabel));
-  ndef.set_op("ZerosLike");
-  ndef.add_input(input.name());
-  AddNodeAttr("T", input.dtype(), &ndef);
-  Status s;
-  Node* ret = g->AddNode(ndef, &s);
-  TF_CHECK_OK(s);
-  g->AddEdge(input.node, input.index, ret, 0);
-  return ret;
+  if (input.dtype() == DT_RESOURCE) {
+    NodeDef read_def;
+    read_def.set_name(g->NewName("Read"));
+    read_def.set_op("ReadVariableOp");
+    read_def.add_input(input.name());
+    AddNodeAttr("dtype", DT_FLOAT, &read_def);
+    Status s;
+    Node* read = g->AddNode(read_def, &s);
+    TF_CHECK_OK(s);
+    g->AddEdge(input.node, input.index, read, 0);
+    NodeDef ndef;
+    ndef.set_name(g->NewName(kNodeLabel));
+    ndef.set_op("ZerosLike");
+    ndef.add_input(read_def.name());
+    AddNodeAttr("T", DT_FLOAT, &ndef);
+    Node* ret = g->AddNode(ndef, &s);
+    TF_CHECK_OK(s);
+    g->AddEdge(read, 0, ret, 0);
+    return ret;
+  } else {
+    NodeDef ndef;
+    ndef.set_name(g->NewName(kNodeLabel));
+    ndef.set_op("ZerosLike");
+    ndef.add_input(input.name());
+    AddNodeAttr("T", input.dtype(), &ndef);
+    Status s;
+    Node* ret = g->AddNode(ndef, &s);
+    TF_CHECK_OK(s);
+    g->AddEdge(input.node, input.index, ret, 0);
+    return ret;
+  }
 }
 
 static Node* AddSymGrad(Graph* g, Node* n, gtl::ArraySlice<NodeOut> grads) {
diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc
index c22e0a3..833592c 100644
--- a/tensorflow/core/graph/mkl_layout_pass.cc
+++ b/tensorflow/core/graph/mkl_layout_pass.cc
@@ -43,7 +43,7 @@
 
 namespace tensorflow {
 
-#ifdef INTEL_MKL_ML
+#ifdef INTEL_MKL_ML_ONLY
 
 // This pass implements rewriting of graph to support following scenarios:
 // (A) Merging nodes in the graph
@@ -2211,7 +2211,7 @@
   return Status::OK();
 }
 
-#else   // INTEL_MKL_ML
+#else   // INTEL_MKL_ML_ONLY
 
 // This pass implements rewriting of graph to support following scenarios:
 // (A) Merging nodes in the graph
@@ -2418,6 +2418,9 @@
     csinfo_.conv2d_grad_filter = "Conv2DBackpropFilter";
     csinfo_.conv2d_grad_filter_with_bias =
         "__MklDummyConv2DBackpropFilterWithBias";
+    csinfo_.conv3d = "Conv3D";
+    csinfo_.conv3d_grad_input = "Conv3DBackpropInputV2";
+    csinfo_.conv3d_grad_filter = "Conv3DBackpropFilterV2";
     csinfo_.fused_batch_norm = "FusedBatchNorm";
     csinfo_.fused_batch_norm_grad = "FusedBatchNormGrad";
     csinfo_.identity = "Identity";
@@ -2468,18 +2471,27 @@
                       CopyAttrsConcatV2, AlwaysRewrite});
     rinfo_.push_back({csinfo_.conv2d,
                       mkl_op_registry::GetMklOpName(csinfo_.conv2d),
-                      CopyAttrsConv2D, AlwaysRewrite});
+                      CopyAttrsConv, AlwaysRewrite});
     rinfo_.push_back({csinfo_.conv2d_with_bias, csinfo_.mkl_conv2d_with_bias,
-                      CopyAttrsConv2D, AlwaysRewrite});
+                      CopyAttrsConv, AlwaysRewrite});
     rinfo_.push_back({csinfo_.conv2d_grad_filter,
                       mkl_op_registry::GetMklOpName(csinfo_.conv2d_grad_filter),
-                      CopyAttrsConv2D, AlwaysRewrite});
+                      CopyAttrsConv, AlwaysRewrite});
     rinfo_.push_back({csinfo_.conv2d_grad_filter_with_bias,
-                      csinfo_.mkl_conv2d_grad_filter_with_bias, CopyAttrsConv2D,
+                      csinfo_.mkl_conv2d_grad_filter_with_bias, CopyAttrsConv,
                       AlwaysRewrite});
     rinfo_.push_back({csinfo_.conv2d_grad_input,
                       mkl_op_registry::GetMklOpName(csinfo_.conv2d_grad_input),
-                      CopyAttrsConv2D, AlwaysRewrite});
+                      CopyAttrsConv, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.conv3d,
+                      mkl_op_registry::GetMklOpName(csinfo_.conv3d),
+                      CopyAttrsConv, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.conv3d_grad_filter,
+                      mkl_op_registry::GetMklOpName(csinfo_.conv3d_grad_filter),
+                      CopyAttrsConv, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.conv3d_grad_input,
+                      mkl_op_registry::GetMklOpName(csinfo_.conv3d_grad_input),
+                      CopyAttrsConv, AlwaysRewrite});
     rinfo_.push_back({csinfo_.fused_batch_norm,
                       mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm),
                       CopyAttrsFusedBatchNorm, AlwaysRewrite});
@@ -2614,6 +2626,9 @@
     string conv2d_grad_input;
     string conv2d_grad_filter;
     string conv2d_grad_filter_with_bias;
+    string conv3d;
+    string conv3d_grad_input;
+    string conv3d_grad_filter;
     string fused_batch_norm;
     string fused_batch_norm_grad;
     string identity;
@@ -3086,7 +3101,7 @@
   static void CopyAttrsBiasAddGrad(const Node* orig_node, NodeBuilder* nb);
   static void CopyAttrsConcat(const Node* orig_node, NodeBuilder* nb);
   static void CopyAttrsConcatV2(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsConv2D(const Node* orig_node, NodeBuilder* nb);
+  static void CopyAttrsConv(const Node* orig_node, NodeBuilder* nb);
   static void CopyAttrsDataType(const Node* orig_node, NodeBuilder* nb);
   static void CopyAttrsFusedBatchNorm(const Node* orig_node, NodeBuilder* nb);
   static void CopyAttrsLRN(const Node* orig_node, NodeBuilder* nb);
@@ -3571,14 +3586,13 @@
 // Op-specific functions to copy attributes from old node to new node
 //////////////////////////////////////////////////////////////////////////
 
-void MklLayoutRewritePass::CopyAttrsConv2D(const Node* orig_node,
-                                           NodeBuilder* nb) {
+void MklLayoutRewritePass::CopyAttrsConv(const Node* orig_node,
+                                         NodeBuilder* nb) {
   DataType T;
   string data_format;
   string padding;
   std::vector<int32> strides;
   std::vector<int32> dilations;
-  bool use_cudnn_on_gpu;
 
   // Get all attributes from old node.
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
@@ -3586,8 +3600,6 @@
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "dilations", &dilations));
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "padding", &padding));
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
-  TF_CHECK_OK(
-      GetNodeAttr(orig_node->def(), "use_cudnn_on_gpu", &use_cudnn_on_gpu));
 
   // Add attributes to new node.
   nb->Attr("T", T);
@@ -3595,7 +3607,6 @@
   nb->Attr("dilations", dilations);
   nb->Attr("padding", padding);
   nb->Attr("data_format", data_format);
-  nb->Attr("use_cudnn_on_gpu", use_cudnn_on_gpu);
 }
 
 void MklLayoutRewritePass::CopyAttrsAddN(const Node* orig_node,
@@ -3896,7 +3907,7 @@
   nb.Input(succ_in[1].first, succ_in[1].second);  // In2 of BiasAdd
 
   // Copy attributes from Conv2D to Conv2DWithBias.
-  CopyAttrsConv2D(const_cast<const Node*>(pred), &nb);
+  CopyAttrsConv(const_cast<const Node*>(pred), &nb);
 
   // Copy the device assigned to old node to new node.
   nb.Device(succ->def().device());
@@ -4007,7 +4018,7 @@
   }
 
   // Copy attributes from Conv2DBackpropFilter.
-  CopyAttrsConv2D(const_cast<const Node*>(fltr), &nb);
+  CopyAttrsConv(const_cast<const Node*>(fltr), &nb);
 
   // Copy the device assigned to old node to new node.
   nb.Device(fltr->def().device());
@@ -4474,7 +4485,7 @@
 
   return Status::OK();
 }
-#endif  // INTEL_MKL_ML
+#endif  // INTEL_MKL_ML_ONLY
 }  // namespace tensorflow
 
 #endif
diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc
index a41f586..e8bac84 100644
--- a/tensorflow/core/graph/mkl_layout_pass_test.cc
+++ b/tensorflow/core/graph/mkl_layout_pass_test.cc
@@ -37,7 +37,7 @@
 
 namespace tensorflow {
 
-#ifdef INTEL_MKL_ML
+#ifdef INTEL_MKL_ML_ONLY
 
 namespace {
 
@@ -1898,7 +1898,7 @@
 
 }  // namespace
 
-#else  // INTEL_MKL_ML
+#else  // INTEL_MKL_ML_ONLY
 
 // NOTE: Unit tests in this file rely on a topological sorted graph for
 // printing. But since sibling nodes of a node in the topologically sorted graph
@@ -3582,7 +3582,7 @@
 
 }  // namespace
 
-#endif  // INTEL_MKL_ML
+#endif  // INTEL_MKL_ML_ONLY
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/graph/testlib.cc b/tensorflow/core/graph/testlib.cc
index 67b252c..ea7788f 100644
--- a/tensorflow/core/graph/testlib.cc
+++ b/tensorflow/core/graph/testlib.cc
@@ -21,39 +21,14 @@
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/node_builder.h"
-#include "tensorflow/core/kernels/constant_op.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace tensorflow {
-
-// HostConst: forced to generate output on the host.
-// Only used by testlib; no op is registered for this kernel
-// externally (i.e., in array_ops.cc)
-REGISTER_KERNEL_BUILDER(Name("HostConst").Device(DEVICE_CPU), HostConstantOp);
-REGISTER_KERNEL_BUILDER(
-    Name("HostConst").Device(DEVICE_GPU).HostMemory("output"), HostConstantOp);
-#ifdef TENSORFLOW_USE_SYCL
-REGISTER_KERNEL_BUILDER(
-    Name("HostConst").Device(DEVICE_SYCL).HostMemory("output"), HostConstantOp);
-#endif  // TENSORFLOW_USE_SYCL
-
-// Register the HostConst Op
-// Returns a constant tensor on the host.  Useful for writing C++ tests
-// and benchmarks which run on GPU but require arguments pinned to the host.
-// Used by test::graph::HostConstant.
-// value: Attr `value` is the tensor to return.
-REGISTER_OP("HostConst")
-    .Output("output: dtype")
-    .Attr("value: tensor")
-    .Attr("dtype: type")
-    .SetShapeFn(shape_inference::UnknownShape);
-
 namespace test {
 namespace graph {
 
diff --git a/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc b/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc
index f241922..a9a1abf 100644
--- a/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc
+++ b/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc
@@ -103,6 +103,9 @@
   TF_ASSERT_OK(estimator.PredictCosts(item.graph, &cost_graph, &summary));
 
   EXPECT_EQ(Costs::NanoSeconds(9151), summary.execution_time);
+  // Note there are 17 nodes in total (RandomUniform creates 2 nodes), but
+  // grappler will not process "label", therefore we have 15 here instead.
+  EXPECT_EQ(15, summary.num_ops_total);
 
   // Make this estimate accurate:
   // TODO(http://b/70031255): Accurate estimator for RandomUniform op needed
@@ -110,6 +113,7 @@
   //
   // Change to EXPECT_FALSE when the above TODOs are done:
   EXPECT_TRUE(summary.inaccurate);
+  EXPECT_EQ(0, summary.num_ops_with_unknown_shapes);
 }
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/costs/cost_estimator.h b/tensorflow/core/grappler/costs/cost_estimator.h
index fe8a876..e91f0cc 100644
--- a/tensorflow/core/grappler/costs/cost_estimator.h
+++ b/tensorflow/core/grappler/costs/cost_estimator.h
@@ -109,8 +109,16 @@
   int64 max_per_op_buffers;    // Sum of all buffers used by the ops.
   int64 max_per_op_streaming;  // Ignore largest input buffer, assuming it
                                // streams from main memory.
+
+  // Number of ops included in this Costs in total.
+  // Default-initialized to one.
+  int64 num_ops_total = 1;
   // If the time estimation is inaccurate.
   bool inaccurate = false;
+  // Number of ops that are estimated with unknown shapes.
+  int64 num_ops_with_unknown_shapes = 0;
+  // TODO(pcma): include a counter for total inaccurate ops and counters for
+  // other reasons causing the inaccuracy
 
   // Max possible memory usage per device.
   std::unordered_map<string, uint64> estimated_max_memory_per_device;
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
index 5b303f6..0341d7f 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
@@ -175,14 +175,24 @@
 TensorShapeProto MaybeGetMinimumShape(const TensorShapeProto& original_shape,
                                       int rank, bool* found_unknown_shapes) {
   auto shape = original_shape;
-  if (shape.unknown_rank() || shape.dim_size() < rank) {
+  bool is_scalar = !shape.unknown_rank() && shape.dim_size() == 0;
+
+  if (shape.unknown_rank() || (!is_scalar && shape.dim_size() < rank)) {
     *found_unknown_shapes = true;
-    TensorShapeProto::Dim dim;
     VLOG(2) << "Use minimum shape because the rank is unknown.";
     // The size of each dimension is at least 1, if unknown.
-    dim.set_size(1);
+    for (int i = shape.dim_size(); i < rank; i++) {
+      shape.add_dim()->set_size(1);
+    }
+  } else if (is_scalar) {
     for (int i = 0; i < rank; i++) {
-      *shape.add_dim() = dim;
+      shape.add_dim()->set_size(1);
+    }
+  } else if (shape.dim_size() > rank) {
+    *found_unknown_shapes = true;
+    shape.clear_dim();
+    for (int i = 0; i < rank; i++) {
+      shape.add_dim()->set_size(original_shape.dim(i).size());
     }
   } else {
     for (int i = 0; i < shape.dim_size(); i++) {
@@ -449,6 +459,7 @@
   if (found_unknown_shapes || !is_known_elementwise_op) {
     costs.inaccurate = true;
   }
+  costs.num_ops_with_unknown_shapes = found_unknown_shapes;
   return costs;
 }
 
@@ -469,6 +480,7 @@
   const double total_io_bytes = input_size + output_size;
   Costs costs = PredictOpCountBasedCost(operations, total_io_bytes, op_info);
   costs.inaccurate = unknown_shapes;
+  costs.num_ops_with_unknown_shapes = unknown_shapes;
   costs.max_memory = output_size;
   return costs;
 }
@@ -627,6 +639,7 @@
 
   if (op_features.inputs_size() < 2) {
     LOG(ERROR) << "Need 2 inputs but got " << op_features.inputs_size();
+    // TODO(pcma): Try to separate invalid inputs from unknown shapes
     *found_unknown_shapes = true;
     return 0;
   }
@@ -694,11 +707,13 @@
     const OpInfo& op_features, bool* found_unknown_shapes) const {
   if (op_features.op() != kBatchMatMul) {
     LOG(ERROR) << "Invalid Operation: " << op_features.op();
+    // TODO(pcma): Try to separate invalid inputs from unknown shapes
     *found_unknown_shapes = true;
     return 0;
   }
   if (op_features.inputs_size() != 2) {
     LOG(ERROR) << "Expected 2 inputs but got " << op_features.inputs_size();
+    // TODO(pcma): Try to separate invalid inputs from unknown shapes
     *found_unknown_shapes = true;
     return 0;
   }
@@ -858,6 +873,7 @@
          "kDepthwiseConv2dNativeBackpropInput";
 
   if (op_features.inputs_size() < 2) {
+    // TODO(pcma): Try to separate invalid inputs from unknown shapes
     *found_unknown_shapes = true;
     return ops;
   }
@@ -935,6 +951,7 @@
   }
 
   if (op_features.inputs_size() < 1) {
+    // TODO(pcma): Try to separate invalid inputs from unknown shapes
     *found_unknown_shapes = true;
     return ops;
   }
@@ -1037,6 +1054,7 @@
   auto costs = PredictOpCountBasedCost(
       CountConv2DOperations(op_features, &found_unknown_shapes), op_features);
   costs.inaccurate = found_unknown_shapes;
+  costs.num_ops_with_unknown_shapes = found_unknown_shapes;
   return costs;
 }
 
@@ -1049,6 +1067,7 @@
                                   op_features, nullptr, &found_unknown_shapes),
                               op_features);
   costs.inaccurate = found_unknown_shapes;
+  costs.num_ops_with_unknown_shapes = found_unknown_shapes;
   return costs;
 }
 
@@ -1061,6 +1080,7 @@
                                   op_features, nullptr, &found_unknown_shapes),
                               op_features);
   costs.inaccurate = found_unknown_shapes;
+  costs.num_ops_with_unknown_shapes = found_unknown_shapes;
   return costs;
 }
 
@@ -1148,6 +1168,7 @@
   // Construct component operations and run the cost computation.
   auto costs = PredictFusedOp(op_context_with_output, component_ops);
   costs.inaccurate |= found_unknown_shapes;
+  costs.num_ops_with_unknown_shapes = costs.inaccurate;
   return costs;
 }
 
@@ -1157,6 +1178,7 @@
   auto costs = PredictOpCountBasedCost(
       CountMatMulOperations(op_features, &found_unknown_shapes), op_features);
   costs.inaccurate = found_unknown_shapes;
+  costs.num_ops_with_unknown_shapes = found_unknown_shapes;
   return costs;
 }
 
@@ -1171,6 +1193,7 @@
   VLOG(1) << "Op:" << op_features.op() << " Execution Time 0 (ns)";
   Costs result = Costs::ZeroCosts();
   result.max_memory = CalculateOutputSize(op_features, &result.inaccurate);
+  result.num_ops_with_unknown_shapes = result.inaccurate;
   // Assign the minimum amount of time we can represent to the identity op since
   // it tends to be really cheap.
   result.compute_time = kMinComputeTime;
@@ -1184,6 +1207,7 @@
   Costs result = Costs::ZeroCosts();
   result.persistent_memory =
       CalculateOutputSize(op_features, &result.inaccurate);
+  result.num_ops_with_unknown_shapes = result.inaccurate;
 
   result.compute_time = kMinComputeTime;
   result.execution_time = result.execution_time;
@@ -1198,6 +1222,7 @@
       CountBatchMatMulOperations(op_features, &found_unknown_shapes),
       op_features);
   costs.inaccurate = found_unknown_shapes;
+  costs.num_ops_with_unknown_shapes = found_unknown_shapes;
   return costs;
 }
 
@@ -1205,6 +1230,7 @@
   const auto& op_features = op_context.op_info;
   Costs costs = Costs::ZeroCosts();
   costs.max_memory = CalculateOutputSize(op_features, &costs.inaccurate);
+  costs.num_ops_with_unknown_shapes = costs.inaccurate;
   // Metadata operations are so cheap we assume they take the minimum amount of
   // time we can represent (1 ns).
   costs.compute_time = kMinComputeTime;
@@ -1249,6 +1275,7 @@
   const double total_io = input_size + output_size;
   Costs costs = PredictOpCountBasedCost(op_count, total_io, op_info);
   costs.inaccurate = unknown_shapes;
+  costs.num_ops_with_unknown_shapes = unknown_shapes;
   costs.max_memory = output_size;
 
   return costs;
@@ -1390,6 +1417,7 @@
   Costs costs = PredictOpCountBasedCost(
       ops, total_input_size + total_output_size, op_info);
   costs.inaccurate = found_unknown_shapes;
+  costs.num_ops_with_unknown_shapes = found_unknown_shapes;
   costs.max_memory = total_output_size;
   return costs;
 }
@@ -1432,6 +1460,7 @@
   Costs costs = PredictOpCountBasedCost(
       ops, total_input_size + total_output_size, op_info);
   costs.inaccurate = found_unknown_shapes;
+  costs.num_ops_with_unknown_shapes = found_unknown_shapes;
   costs.max_memory = total_output_size;
   return costs;
 }
@@ -1464,6 +1493,7 @@
   Costs costs = PredictOpCountBasedCost(
       ops, total_input_size + total_output_size, op_info);
   costs.inaccurate = found_unknown_shapes;
+  costs.num_ops_with_unknown_shapes = found_unknown_shapes;
   costs.max_memory = total_output_size;
   return costs;
 }
@@ -1516,6 +1546,7 @@
   Costs costs = PredictOpCountBasedCost(
       ops, total_input_size + total_output_size, op_info);
   costs.inaccurate = found_unknown_shapes;
+  costs.num_ops_with_unknown_shapes = found_unknown_shapes;
   costs.max_memory = total_output_size;
   return costs;
 }
@@ -1562,6 +1593,7 @@
       ops, total_input_size + total_output_size + total_internal_read_size,
       op_info);
   costs.inaccurate = found_unknown_shapes;
+  costs.num_ops_with_unknown_shapes = found_unknown_shapes;
   costs.max_memory = total_output_size;
   return costs;
 }
@@ -1595,6 +1627,7 @@
       ops, total_input_size + total_output_size + total_internal_read_size,
       op_info);
   costs.inaccurate = found_unknown_shapes;
+  costs.num_ops_with_unknown_shapes = found_unknown_shapes;
   costs.max_memory = total_output_size;
   return costs;
 }
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
index 77352f6..9e57909 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
@@ -488,7 +488,9 @@
   EXPECT_EQ(Costs::Duration(130), cost.memory_time);
   EXPECT_EQ(Costs::Duration(16), cost.compute_time);
   EXPECT_EQ(Costs::Duration(146), cost.execution_time);
+  EXPECT_EQ(1, cost.num_ops_total);
   EXPECT_FALSE(cost.inaccurate);
+  EXPECT_EQ(0, cost.num_ops_with_unknown_shapes);
 }
 
 TEST_F(OpLevelCostEstimatorTest, TestGatherCostsWithoutOutput) {
@@ -504,7 +506,9 @@
   EXPECT_EQ(Costs::Duration(0), cost.memory_time);
   EXPECT_EQ(Costs::Duration(0), cost.compute_time);
   EXPECT_EQ(Costs::Duration(0), cost.execution_time);
+  EXPECT_EQ(1, cost.num_ops_total);
   EXPECT_TRUE(cost.inaccurate);
+  EXPECT_EQ(0, cost.num_ops_with_unknown_shapes);
 }
 
 TEST_F(OpLevelCostEstimatorTest, TestSliceCosts) {
@@ -522,7 +526,9 @@
   EXPECT_EQ(Costs::Duration(81), cost.memory_time);
   EXPECT_EQ(Costs::Duration(10), cost.compute_time);
   EXPECT_EQ(Costs::Duration(91), cost.execution_time);
+  EXPECT_EQ(1, cost.num_ops_total);
   EXPECT_FALSE(cost.inaccurate);
+  EXPECT_EQ(0, cost.num_ops_with_unknown_shapes);
 }
 
 TEST_F(OpLevelCostEstimatorTest, BiasAddExecutionTime) {
@@ -530,7 +536,9 @@
   EXPECT_EQ(Costs::Duration(8400), cost.memory_time);
   EXPECT_EQ(Costs::Duration(1000), cost.compute_time);
   EXPECT_EQ(Costs::Duration(9400), cost.execution_time);
+  EXPECT_EQ(1, cost.num_ops_total);
   EXPECT_FALSE(cost.inaccurate);
+  EXPECT_EQ(0, cost.num_ops_with_unknown_shapes);
 }
 
 TEST_F(OpLevelCostEstimatorTest, Conv2DExecutionTime) {
@@ -538,7 +546,9 @@
   EXPECT_EQ(Costs::Duration(233780), cost.memory_time);
   EXPECT_EQ(Costs::Duration(354877440), cost.compute_time);
   EXPECT_EQ(Costs::Duration(355111220), cost.execution_time);
+  EXPECT_EQ(1, cost.num_ops_total);
   EXPECT_FALSE(cost.inaccurate);
+  EXPECT_EQ(0, cost.num_ops_with_unknown_shapes);
 }
 
 TEST_F(OpLevelCostEstimatorTest, DepthwiseConv2dNativeExecutionTime) {
@@ -547,7 +557,9 @@
   EXPECT_EQ(Costs::Duration(112340), cost.memory_time);
   EXPECT_EQ(Costs::Duration(4158720), cost.compute_time);
   EXPECT_EQ(Costs::Duration(4271060), cost.execution_time);
+  EXPECT_EQ(1, cost.num_ops_total);
   EXPECT_FALSE(cost.inaccurate);
+  EXPECT_EQ(0, cost.num_ops_with_unknown_shapes);
 }
 
 TEST_F(OpLevelCostEstimatorTest, DummyExecutionTime) {
@@ -555,7 +567,9 @@
   EXPECT_EQ(Costs::Duration(2000), cost.memory_time);
   EXPECT_EQ(Costs::Duration(0), cost.compute_time);
   EXPECT_EQ(Costs::Duration(2000), cost.execution_time);
+  EXPECT_EQ(1, cost.num_ops_total);
   EXPECT_TRUE(cost.inaccurate);
+  EXPECT_EQ(0, cost.num_ops_with_unknown_shapes);
 }
 
 TEST_F(OpLevelCostEstimatorTest, ExecutionTimeSumOrMax) {
@@ -564,7 +578,9 @@
   EXPECT_EQ(Costs::Duration(2000), cost.memory_time);
   EXPECT_EQ(Costs::Duration(0), cost.compute_time);
   EXPECT_EQ(Costs::Duration(2000), cost.execution_time);  // max(2000, 200)
+  EXPECT_EQ(1, cost.num_ops_total);
   EXPECT_TRUE(cost.inaccurate);
+  EXPECT_EQ(0, cost.num_ops_with_unknown_shapes);
   SetComputeMemoryOverlap(false);  // Set it back to default.
 }
 
@@ -576,7 +592,9 @@
   EXPECT_EQ(Costs::Duration(825345), cost.memory_time);
   EXPECT_EQ(Costs::Duration(355321038), cost.compute_time);
   EXPECT_EQ(Costs::Duration(356146383), cost.execution_time);
+  EXPECT_EQ(1, cost.num_ops_total);
   EXPECT_FALSE(cost.inaccurate);
+  EXPECT_EQ(0, cost.num_ops_with_unknown_shapes);
 }
 
 TEST_F(OpLevelCostEstimatorTest, FusedConv2DBiasActivationNCHW_HWIO) {
@@ -586,7 +604,9 @@
   EXPECT_EQ(Costs::Duration(1416808), cost.memory_time);
   EXPECT_EQ(Costs::Duration(355616770), cost.compute_time);
   EXPECT_EQ(Costs::Duration(357033578), cost.execution_time);
+  EXPECT_EQ(1, cost.num_ops_total);
   EXPECT_FALSE(cost.inaccurate);
+  EXPECT_EQ(0, cost.num_ops_with_unknown_shapes);
 }
 
 TEST_F(OpLevelCostEstimatorTest, FusedConv2DBiasActivationNCHW_OIHW) {
@@ -596,7 +616,9 @@
   EXPECT_EQ(Costs::Duration(1416808), cost.memory_time);
   EXPECT_EQ(Costs::Duration(355616770), cost.compute_time);
   EXPECT_EQ(Costs::Duration(357033578), cost.execution_time);
+  EXPECT_EQ(1, cost.num_ops_total);
   EXPECT_FALSE(cost.inaccurate);
+  EXPECT_EQ(0, cost.num_ops_with_unknown_shapes);
 }
 
 TEST_F(OpLevelCostEstimatorTest, FusedConv2DBiasActivationNHWC_HWIO) {
@@ -606,7 +628,9 @@
   EXPECT_EQ(Costs::Duration(1416808), cost.memory_time);
   EXPECT_EQ(Costs::Duration(355616770), cost.compute_time);
   EXPECT_EQ(Costs::Duration(357033578), cost.execution_time);
+  EXPECT_EQ(1, cost.num_ops_total);
   EXPECT_FALSE(cost.inaccurate);
+  EXPECT_EQ(0, cost.num_ops_with_unknown_shapes);
 }
 
 TEST_F(OpLevelCostEstimatorTest, FusedConv2DBiasActivationNHWC_OIHW) {
@@ -616,7 +640,9 @@
   EXPECT_EQ(Costs::Duration(1416808), cost.memory_time);
   EXPECT_EQ(Costs::Duration(355616770), cost.compute_time);
   EXPECT_EQ(Costs::Duration(357033578), cost.execution_time);
+  EXPECT_EQ(1, cost.num_ops_total);
   EXPECT_FALSE(cost.inaccurate);
+  EXPECT_EQ(0, cost.num_ops_with_unknown_shapes);
 }
 
 // TODO(yaozhang): Update once NCHW_VECT_C is supported.
@@ -627,7 +653,9 @@
   EXPECT_EQ(Costs::Duration(0), cost.memory_time);
   EXPECT_EQ(Costs::Duration(0), cost.compute_time);
   EXPECT_EQ(Costs::Duration(0), cost.execution_time);
+  EXPECT_EQ(1, cost.num_ops_total);
   EXPECT_TRUE(cost.inaccurate);
+  EXPECT_EQ(0, cost.num_ops_with_unknown_shapes);
 }
 
 // TODO(yaozhang): Update once OIHW_VECT_I is supported.
@@ -638,7 +666,9 @@
   EXPECT_EQ(Costs::Duration(0), cost.memory_time);
   EXPECT_EQ(Costs::Duration(0), cost.compute_time);
   EXPECT_EQ(Costs::Duration(0), cost.execution_time);
+  EXPECT_EQ(1, cost.num_ops_total);
   EXPECT_TRUE(cost.inaccurate);
+  EXPECT_EQ(0, cost.num_ops_with_unknown_shapes);
 }
 
 TEST_F(OpLevelCostEstimatorTest, MulExecutionTime) {
@@ -646,7 +676,9 @@
   EXPECT_EQ(Costs::Duration(2000), cost.memory_time);
   EXPECT_EQ(Costs::Duration(200), cost.compute_time);
   EXPECT_EQ(Costs::Duration(2200), cost.execution_time);
+  EXPECT_EQ(1, cost.num_ops_total);
   EXPECT_FALSE(cost.inaccurate);
+  EXPECT_EQ(0, cost.num_ops_with_unknown_shapes);
 }
 
 TEST_F(OpLevelCostEstimatorTest, MulBroadcastExecutionTime) {
@@ -654,7 +686,9 @@
   EXPECT_EQ(Costs::Duration(3600), cost.memory_time);
   EXPECT_EQ(Costs::Duration(400), cost.compute_time);
   EXPECT_EQ(Costs::Duration(4000), cost.execution_time);
+  EXPECT_EQ(1, cost.num_ops_total);
   EXPECT_FALSE(cost.inaccurate);
+  EXPECT_EQ(0, cost.num_ops_with_unknown_shapes);
 }
 
 TEST_F(OpLevelCostEstimatorTest, ModExecutionTime) {
@@ -662,7 +696,9 @@
   EXPECT_EQ(Costs::Duration(2000), cost.memory_time);
   EXPECT_EQ(Costs::Duration(1600), cost.compute_time);
   EXPECT_EQ(Costs::Duration(3600), cost.execution_time);
+  EXPECT_EQ(1, cost.num_ops_total);
   EXPECT_FALSE(cost.inaccurate);
+  EXPECT_EQ(0, cost.num_ops_with_unknown_shapes);
 }
 
 TEST_F(OpLevelCostEstimatorTest, ReluExecutionTime) {
@@ -670,28 +706,77 @@
   EXPECT_EQ(Costs::Duration(800), cost.memory_time);
   EXPECT_EQ(Costs::Duration(100), cost.compute_time);
   EXPECT_EQ(Costs::Duration(900), cost.execution_time);
+  EXPECT_EQ(1, cost.num_ops_total);
   EXPECT_FALSE(cost.inaccurate);
+  EXPECT_EQ(0, cost.num_ops_with_unknown_shapes);
 }
 
 TEST_F(OpLevelCostEstimatorTest, UnknownOrPartialShape) {
-  EXPECT_FALSE(PredictCosts(DescribeMatMul(2, 4, 7, 7)).inaccurate);
-  EXPECT_TRUE(PredictCosts(DescribeMatMul(-1, 4, 7, 7)).inaccurate);
-  EXPECT_TRUE(PredictCosts(DescribeMatMul(2, 4, -1, 7)).inaccurate);
-
-  EXPECT_FALSE(PredictCosts(DescribeConvolution(16, 19, 19, 48, 48, 5, 5, 256))
-                   .inaccurate);
-  EXPECT_TRUE(PredictCosts(DescribeConvolution(16, -1, 19, 48, 48, 5, 5, 256))
-                  .inaccurate);
+  {
+    auto cost = PredictCosts(DescribeMatMul(2, 4, 7, 7));
+    EXPECT_EQ(1, cost.num_ops_total);
+    EXPECT_FALSE(cost.inaccurate);
+    EXPECT_EQ(0, cost.num_ops_with_unknown_shapes);
+  }
+  {
+    auto cost = PredictCosts(DescribeMatMul(-1, 4, 7, 7));
+    EXPECT_EQ(1, cost.num_ops_total);
+    EXPECT_TRUE(cost.inaccurate);
+    EXPECT_EQ(1, cost.num_ops_with_unknown_shapes);
+  }
+  {
+    auto cost = PredictCosts(DescribeMatMul(2, 4, -1, 7));
+    EXPECT_EQ(1, cost.num_ops_total);
+    EXPECT_TRUE(cost.inaccurate);
+    EXPECT_EQ(1, cost.num_ops_with_unknown_shapes);
+  }
+  {
+    auto cost =
+        PredictCosts(DescribeConvolution(16, 19, 19, 48, 48, 5, 5, 256));
+    EXPECT_EQ(1, cost.num_ops_total);
+    EXPECT_FALSE(cost.inaccurate);
+    EXPECT_EQ(0, cost.num_ops_with_unknown_shapes);
+  }
+  {
+    auto cost =
+        PredictCosts(DescribeConvolution(16, -1, 19, 48, 48, 5, 5, 256));
+    EXPECT_EQ(1, cost.num_ops_total);
+    EXPECT_TRUE(cost.inaccurate);
+    EXPECT_EQ(1, cost.num_ops_with_unknown_shapes);
+  }
 }
 
 TEST_F(OpLevelCostEstimatorTest, BatchMatMul) {
-  EXPECT_TRUE(PredictCosts(DescribeBatchMatMul({}, {})).inaccurate);
-  EXPECT_TRUE(PredictCosts(DescribeBatchMatMul({2, 4}, {})).inaccurate);
-  EXPECT_FALSE(PredictCosts(DescribeBatchMatMul({2, 4}, {4, 2})).inaccurate);
-  EXPECT_FALSE(
-      PredictCosts(DescribeBatchMatMul({1, 2, 4}, {1, 4, 2})).inaccurate);
-  EXPECT_FALSE(
-      PredictCosts(DescribeBatchMatMul({2, 4}, {1, 3, 4, 2})).inaccurate);
+  {
+    auto cost = PredictCosts(DescribeBatchMatMul({}, {}));
+    EXPECT_EQ(1, cost.num_ops_total);
+    EXPECT_TRUE(cost.inaccurate);
+    EXPECT_EQ(1, cost.num_ops_with_unknown_shapes);
+  }
+  {
+    auto cost = PredictCosts(DescribeBatchMatMul({2, 4}, {}));
+    EXPECT_EQ(1, cost.num_ops_total);
+    EXPECT_TRUE(cost.inaccurate);
+    EXPECT_EQ(1, cost.num_ops_with_unknown_shapes);
+  }
+  {
+    auto cost = PredictCosts(DescribeBatchMatMul({2, 4}, {4, 2}));
+    EXPECT_EQ(1, cost.num_ops_total);
+    EXPECT_FALSE(cost.inaccurate);
+    EXPECT_EQ(0, cost.num_ops_with_unknown_shapes);
+  }
+  {
+    auto cost = PredictCosts(DescribeBatchMatMul({1, 2, 4}, {1, 4, 2}));
+    EXPECT_EQ(1, cost.num_ops_total);
+    EXPECT_FALSE(cost.inaccurate);
+    EXPECT_EQ(0, cost.num_ops_with_unknown_shapes);
+  }
+  {
+    auto cost = PredictCosts(DescribeBatchMatMul({2, 4}, {1, 3, 4, 2}));
+    EXPECT_EQ(1, cost.num_ops_total);
+    EXPECT_FALSE(cost.inaccurate);
+    EXPECT_EQ(0, cost.num_ops_with_unknown_shapes);
+  }
   bool matmul_inaccurate = false;
   bool batch_matmul_inaccurate = false;
   EXPECT_EQ(
@@ -813,7 +898,9 @@
     EXPECT_EQ(Costs::Duration(1075200), costs.execution_time);
     EXPECT_EQ(Costs::Duration(307200), costs.compute_time);
     EXPECT_EQ(Costs::Duration(768000), costs.memory_time);
+    EXPECT_EQ(1, costs.num_ops_total);
     EXPECT_FALSE(costs.inaccurate);
+    EXPECT_EQ(0, costs.num_ops_with_unknown_shapes);
   }
   {
     // 1x1 window with 2x2 stride: used for shortcut in resnet-50.
@@ -821,7 +908,9 @@
     EXPECT_EQ(Costs::Duration(499200), costs.execution_time);
     EXPECT_EQ(Costs::Duration(38400), costs.compute_time);
     EXPECT_EQ(Costs::Duration(460800), costs.memory_time);
+    EXPECT_EQ(1, costs.num_ops_total);
     EXPECT_FALSE(costs.inaccurate);
+    EXPECT_EQ(0, costs.num_ops_with_unknown_shapes);
   }
   {
     // 2x2 window with 3x3 stride.
@@ -829,7 +918,9 @@
     EXPECT_EQ(Costs::Duration(561792), costs.execution_time);
     EXPECT_EQ(Costs::Duration(56448), costs.compute_time);
     EXPECT_EQ(Costs::Duration(505344), costs.memory_time);
+    EXPECT_EQ(1, costs.num_ops_total);
     EXPECT_FALSE(costs.inaccurate);
+    EXPECT_EQ(0, costs.num_ops_with_unknown_shapes);
   }
 }
 
@@ -849,7 +940,9 @@
     EXPECT_EQ(Costs::Duration(1996800), costs.execution_time);
     EXPECT_EQ(Costs::Duration(614400), costs.compute_time);
     EXPECT_EQ(Costs::Duration(1382400), costs.memory_time);
+    EXPECT_EQ(1, costs.num_ops_total);
     EXPECT_FALSE(costs.inaccurate);
+    EXPECT_EQ(0, costs.num_ops_with_unknown_shapes);
   }
   {
     // 1x1 window with 2x2 stride: used for shortcut in resnet-50.
@@ -857,7 +950,9 @@
     EXPECT_EQ(Costs::Duration(1536000), costs.execution_time);
     EXPECT_EQ(Costs::Duration(153600), costs.compute_time);
     EXPECT_EQ(Costs::Duration(1382400), costs.memory_time);
+    EXPECT_EQ(1, costs.num_ops_total);
     EXPECT_FALSE(costs.inaccurate);
+    EXPECT_EQ(0, costs.num_ops_with_unknown_shapes);
   }
   {
     // 2x2 window with 3x3 stride.
@@ -865,7 +960,9 @@
     EXPECT_EQ(Costs::Duration(1514112), costs.execution_time);
     EXPECT_EQ(Costs::Duration(210048), costs.compute_time);
     EXPECT_EQ(Costs::Duration(1304064), costs.memory_time);
+    EXPECT_EQ(1, costs.num_ops_total);
     EXPECT_FALSE(costs.inaccurate);
+    EXPECT_EQ(0, costs.num_ops_with_unknown_shapes);
   }
 }
 
@@ -884,7 +981,9 @@
     EXPECT_EQ(Costs::Duration(1113600), costs.execution_time);
     EXPECT_EQ(Costs::Duration(345600), costs.compute_time);
     EXPECT_EQ(Costs::Duration(768000), costs.memory_time);
+    EXPECT_EQ(1, costs.num_ops_total);
     EXPECT_FALSE(costs.inaccurate);
+    EXPECT_EQ(0, costs.num_ops_with_unknown_shapes);
   }
   {
     // 1x1 window with 2x2 stride: used for shortcut in resnet-50.
@@ -892,7 +991,9 @@
     EXPECT_EQ(Costs::Duration(499200), costs.execution_time);
     EXPECT_EQ(Costs::Duration(38400), costs.compute_time);
     EXPECT_EQ(Costs::Duration(460800), costs.memory_time);
+    EXPECT_EQ(1, costs.num_ops_total);
     EXPECT_FALSE(costs.inaccurate);
+    EXPECT_EQ(0, costs.num_ops_with_unknown_shapes);
   }
   {
     // 2x2 window with 3x3 stride.
@@ -900,7 +1001,9 @@
     EXPECT_EQ(Costs::Duration(580608), costs.execution_time);
     EXPECT_EQ(Costs::Duration(75264), costs.compute_time);
     EXPECT_EQ(Costs::Duration(505344), costs.memory_time);
+    EXPECT_EQ(1, costs.num_ops_total);
     EXPECT_FALSE(costs.inaccurate);
+    EXPECT_EQ(0, costs.num_ops_with_unknown_shapes);
   }
 }
 
@@ -920,7 +1023,9 @@
     EXPECT_EQ(Costs::Duration(1305602), costs.execution_time);
     EXPECT_EQ(Costs::Duration(537600), costs.compute_time);
     EXPECT_EQ(Costs::Duration(768002), costs.memory_time);
+    EXPECT_EQ(1, costs.num_ops_total);
     EXPECT_FALSE(costs.inaccurate);
+    EXPECT_EQ(0, costs.num_ops_with_unknown_shapes);
   }
   {
     // 1x1 window with 2x2 stride: used for shortcut in resnet-50.
@@ -928,7 +1033,9 @@
     EXPECT_EQ(Costs::Duration(960002), costs.execution_time);
     EXPECT_EQ(Costs::Duration(192000), costs.compute_time);
     EXPECT_EQ(Costs::Duration(768002), costs.memory_time);
+    EXPECT_EQ(1, costs.num_ops_total);
     EXPECT_FALSE(costs.inaccurate);
+    EXPECT_EQ(0, costs.num_ops_with_unknown_shapes);
   }
   {
     // 2x2 window with 3x3 stride.
@@ -936,7 +1043,9 @@
     EXPECT_EQ(Costs::Duration(862082), costs.execution_time);
     EXPECT_EQ(Costs::Duration(172416), costs.compute_time);
     EXPECT_EQ(Costs::Duration(689666), costs.memory_time);
+    EXPECT_EQ(1, costs.num_ops_total);
     EXPECT_FALSE(costs.inaccurate);
+    EXPECT_EQ(0, costs.num_ops_with_unknown_shapes);
   }
 }
 
@@ -953,7 +1062,9 @@
     EXPECT_EQ(Costs::Duration(614737), costs.execution_time);
     EXPECT_EQ(Costs::Duration(153706), costs.compute_time);
     EXPECT_EQ(Costs::Duration(461031), costs.memory_time);
+    EXPECT_EQ(1, costs.num_ops_total);
     EXPECT_FALSE(costs.inaccurate);
+    EXPECT_EQ(0, costs.num_ops_with_unknown_shapes);
   }
 
   {
@@ -961,7 +1072,9 @@
     EXPECT_EQ(Costs::Duration(204913), costs.execution_time);
     EXPECT_EQ(Costs::Duration(51236), costs.compute_time);
     EXPECT_EQ(Costs::Duration(153677), costs.memory_time);
+    EXPECT_EQ(1, costs.num_ops_total);
     EXPECT_FALSE(costs.inaccurate);
+    EXPECT_EQ(0, costs.num_ops_with_unknown_shapes);
   }
 
   {
@@ -969,7 +1082,9 @@
     EXPECT_EQ(Costs::Duration(384154), costs.execution_time);
     EXPECT_EQ(Costs::Duration(76800), costs.compute_time);
     EXPECT_EQ(Costs::Duration(307354), costs.memory_time);
+    EXPECT_EQ(1, costs.num_ops_total);
     EXPECT_FALSE(costs.inaccurate);
+    EXPECT_EQ(0, costs.num_ops_with_unknown_shapes);
   }
 
   {
@@ -978,6 +1093,8 @@
     EXPECT_EQ(Costs::Duration(25600), costs.compute_time);
     EXPECT_EQ(Costs::Duration(102452), costs.memory_time);
     EXPECT_FALSE(costs.inaccurate);
+    EXPECT_EQ(1, costs.num_ops_total);
+    EXPECT_EQ(0, costs.num_ops_with_unknown_shapes);
   }
 }
 
@@ -994,7 +1111,9 @@
     EXPECT_EQ(Costs::Duration(1037050), costs.execution_time);
     EXPECT_EQ(Costs::Duration(422496), costs.compute_time);
     EXPECT_EQ(Costs::Duration(614554), costs.memory_time);
+    EXPECT_EQ(1, costs.num_ops_total);
     EXPECT_FALSE(costs.inaccurate);
+    EXPECT_EQ(0, costs.num_ops_with_unknown_shapes);
   }
 
   {
@@ -1002,7 +1121,81 @@
     EXPECT_EQ(Costs::Duration(6503809), costs.execution_time);
     EXPECT_EQ(Costs::Duration(2649677), costs.compute_time);
     EXPECT_EQ(Costs::Duration(3854132), costs.memory_time);
+    EXPECT_EQ(1, costs.num_ops_total);
     EXPECT_FALSE(costs.inaccurate);
+    EXPECT_EQ(0, costs.num_ops_with_unknown_shapes);
+  }
+}
+
+TEST_F(OpLevelCostEstimatorTest, MaybeGetMinimumShape) {
+  {
+    TensorShapeProto x;
+    x.set_unknown_rank(true);
+    bool unknown_shapes = false;
+    TensorShapeProto y = MaybeGetMinimumShape(x, 4, &unknown_shapes);
+    EXPECT_TRUE(unknown_shapes);
+    ExpectTensorShape({1, 1, 1, 1}, y);
+  }
+
+  {
+    TensorShapeProto x;
+    x.set_unknown_rank(false);
+    bool unknown_shapes = false;
+    TensorShapeProto y = MaybeGetMinimumShape(x, 1, &unknown_shapes);
+    EXPECT_FALSE(unknown_shapes);
+    ExpectTensorShape({1}, y);
+  }
+
+  {
+    TensorShapeProto x;
+    x.set_unknown_rank(false);
+    bool unknown_shapes = false;
+    TensorShapeProto y = MaybeGetMinimumShape(x, 2, &unknown_shapes);
+    EXPECT_FALSE(unknown_shapes);
+    ExpectTensorShape({1, 1}, y);
+  }
+
+  {
+    TensorShapeProto x;
+    x.set_unknown_rank(false);
+    x.add_dim()->set_size(10);
+    x.add_dim()->set_size(20);
+    bool unknown_shapes = false;
+    TensorShapeProto y = MaybeGetMinimumShape(x, 2, &unknown_shapes);
+    EXPECT_FALSE(unknown_shapes);
+    ExpectTensorShape({10, 20}, y);
+
+    unknown_shapes = false;
+    TensorShapeProto z = MaybeGetMinimumShape(x, 4, &unknown_shapes);
+    EXPECT_TRUE(unknown_shapes);
+    EXPECT_EQ(4, z.dim_size());
+    ExpectTensorShape({10, 20, 1, 1}, z);
+  }
+
+  {
+    TensorShapeProto x;
+    x.set_unknown_rank(false);
+    x.add_dim()->set_size(10);
+    x.add_dim()->set_size(20);
+    x.add_dim()->set_size(-1);
+    x.add_dim()->set_size(20);
+    bool unknown_shapes = false;
+    TensorShapeProto y = MaybeGetMinimumShape(x, 4, &unknown_shapes);
+    EXPECT_TRUE(unknown_shapes);
+    ExpectTensorShape({10, 20, 1, 20}, y);
+  }
+
+  {
+    TensorShapeProto x;
+    x.set_unknown_rank(false);
+    x.add_dim()->set_size(10);
+    x.add_dim()->set_size(20);
+    x.add_dim()->set_size(30);
+    x.add_dim()->set_size(20);
+    bool unknown_shapes = false;
+    TensorShapeProto y = MaybeGetMinimumShape(x, 2, &unknown_shapes);
+    EXPECT_TRUE(unknown_shapes);
+    ExpectTensorShape({10, 20}, y);
   }
 }
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.cc b/tensorflow/core/grappler/costs/virtual_scheduler.cc
index f31d22e..6e3ebde 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler.cc
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.cc
@@ -47,9 +47,11 @@
   result.execution_time += right.execution_time;
   result.compute_time += right.compute_time;
   result.memory_time += right.memory_time;
-  if (right.inaccurate) {
-    result.inaccurate = true;
-  }
+
+  result.num_ops_total += right.num_ops_total;
+  if (right.inaccurate) result.inaccurate = true;
+  result.num_ops_with_unknown_shapes += right.num_ops_with_unknown_shapes;
+
   if (right.max_memory != kMemoryUnknown) {
     result.max_memory += right.max_memory;
   }
@@ -283,6 +285,7 @@
       grappler_item_(grappler_item),
       use_static_shapes_(use_static_shapes),
       placer_(cluster) {
+  graph_costs_.num_ops_total = 0;
   initialized_ = false;
 }
 
@@ -845,6 +848,11 @@
 }
 
 Costs VirtualScheduler::Summary() const {
+  // Overall statement about accuracy
+  VLOG(1) << graph_costs_.num_ops_total << " ops processed in total, with "
+          << graph_costs_.num_ops_with_unknown_shapes
+          << " having unknown shapes";
+
   // Print out basic execution summary.
   VLOG(1) << "Expected execution time: " << graph_costs_.execution_time.count();
   VLOG(1) << "Expected compute time: " << graph_costs_.compute_time.count();
@@ -906,6 +914,12 @@
             << ", at the end: "
             << strings::HumanReadableNumBytes(state.memory_usage);
 
+    // Overall statement about accuracy
+    VLOG(1) << state.device_costs.num_ops_total
+            << " ops processed in total, with "
+            << state.device_costs.num_ops_with_unknown_shapes
+            << " having unknown shapes";
+
     VLOG(1) << "Per-op execution time / compute time / memory time "
                "(and memory usage at peak memory usage):";
 
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.h b/tensorflow/core/grappler/costs/virtual_scheduler.h
index 353ca6f..0e66e8a 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler.h
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.h
@@ -114,6 +114,7 @@
 
   DeviceState() {
     device_costs = Costs::ZeroCosts();
+    device_costs.num_ops_total = 0;
     memory_usage = 0;
     max_memory_usage = 0;
   }
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler_test.cc b/tensorflow/core/grappler/costs/virtual_scheduler_test.cc
index f9154e4..b1373d8 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler_test.cc
+++ b/tensorflow/core/grappler/costs/virtual_scheduler_test.cc
@@ -942,7 +942,6 @@
   // target_node.
   std::unordered_map<string, OpContext> RunScheduler(
       const string& target_node) {
-    Costs zero_costs = Costs::ZeroCosts();
     std::unordered_map<string, OpContext> ops_executed;
     bool more_nodes = true;
     do {
@@ -1632,6 +1631,9 @@
   // Misc - 5 * 1us
   // Total: 13000005
   EXPECT_EQ(13000005, c.execution_time.asMicroSeconds().count());
+  EXPECT_EQ(grappler_item_->graph.node_size(), c.num_ops_total);
+  EXPECT_FALSE(c.inaccurate);
+  EXPECT_EQ(0, c.num_ops_with_unknown_shapes);
 }
 
 // Like the above SummaryCostTest, but makes sure the stepstats timeline is
@@ -1645,6 +1647,9 @@
   Costs c = scheduler_->Summary(&metadata);
   StepStats stepstats = metadata.step_stats();
   EXPECT_EQ(13000005, c.execution_time.asMicroSeconds().count());
+  EXPECT_EQ(grappler_item_->graph.node_size(), c.num_ops_total);
+  EXPECT_FALSE(c.inaccurate);
+  EXPECT_EQ(0, c.num_ops_with_unknown_shapes);
 
   // Should only be 1 device!
   EXPECT_EQ(1, stepstats.dev_stats().size());
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
index 96f6fe1..e778b78 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
@@ -35,6 +35,7 @@
 #include "tensorflow/core/grappler/utils/functions.h"
 #include "tensorflow/core/grappler/utils/topological_sort.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -102,57 +103,57 @@
 Status MetaOptimizer::InitializeOptimizers(
     std::vector<std::unique_ptr<GraphOptimizer>>* optimizers) const {
   if (!cfg_.disable_model_pruning()) {
-    optimizers->emplace_back(new ModelPruner());
+    optimizers->push_back(MakeUnique<ModelPruner>());
   }
   if (cfg_.function_optimization() != RewriterConfig::OFF) {
-    optimizers->emplace_back(
-        new FunctionOptimizer(cfg_.function_optimization()));
+    optimizers->push_back(
+        MakeUnique<FunctionOptimizer>(cfg_.function_optimization()));
   }
   if (cfg_.debug_stripper() == RewriterConfig::ON) {
-    optimizers->emplace_back(new DebugStripper());
+    optimizers->push_back(MakeUnique<DebugStripper>());
   }
   if (cfg_.constant_folding() != RewriterConfig::OFF) {
-    optimizers->emplace_back(
-        new ConstantFolding(cfg_.constant_folding(), cpu_device_));
+    optimizers->push_back(
+        MakeUnique<ConstantFolding>(cfg_.constant_folding(), cpu_device_));
   }
   if (cfg_.shape_optimization() != RewriterConfig::OFF) {
-    optimizers->emplace_back(new ShapeOptimizer());
+    optimizers->push_back(MakeUnique<ShapeOptimizer>());
   }
   if (cfg_.remapping() != RewriterConfig::OFF) {
-    optimizers->emplace_back(new Remapper(cfg_.remapping()));
+    optimizers->push_back(MakeUnique<Remapper>(cfg_.remapping()));
   }
   if (cfg_.arithmetic_optimization() != RewriterConfig::OFF) {
-    optimizers->emplace_back(
-        new ArithmeticOptimizer(cfg_.arithmetic_optimization()));
+    optimizers->push_back(
+        MakeUnique<ArithmeticOptimizer>(cfg_.arithmetic_optimization()));
   }
   if (cfg_.loop_optimization() != RewriterConfig::OFF) {
-    optimizers->emplace_back(
-        new LoopOptimizer(cfg_.loop_optimization(), cpu_device_));
+    optimizers->push_back(
+        MakeUnique<LoopOptimizer>(cfg_.loop_optimization(), cpu_device_));
   }
   if (cfg_.dependency_optimization() != RewriterConfig::OFF) {
-    optimizers->emplace_back(
-        new DependencyOptimizer(cfg_.dependency_optimization()));
+    optimizers->push_back(
+        MakeUnique<DependencyOptimizer>(cfg_.dependency_optimization()));
   }
   if (cfg_.layout_optimizer() != RewriterConfig::OFF) {
-    optimizers->emplace_back(new LayoutOptimizer());
+    optimizers->push_back(MakeUnique<LayoutOptimizer>());
   }
   if (cfg_.memory_optimization() != RewriterConfig::NO_MEM_OPT) {
     if (cfg_.memory_optimizer_target_node_name_scope().empty()) {
-      optimizers->emplace_back(
+      optimizers->push_back(
           // Use the default target node name prefix "gradients/"
-          new MemoryOptimizer(cfg_.memory_optimization()));
+          MakeUnique<MemoryOptimizer>(cfg_.memory_optimization()));
     } else {
-      optimizers->emplace_back(
-          new MemoryOptimizer(cfg_.memory_optimization(),
-                              cfg_.memory_optimizer_target_node_name_scope()));
+      optimizers->push_back(MakeUnique<MemoryOptimizer>(
+          cfg_.memory_optimization(),
+          cfg_.memory_optimizer_target_node_name_scope()));
     }
   }
   if (cfg_.auto_parallel().enable()) {
-    optimizers->emplace_back(
-        new AutoParallel(cfg_.auto_parallel().num_replicas()));
+    optimizers->push_back(
+        MakeUnique<AutoParallel>(cfg_.auto_parallel().num_replicas()));
   }
   if (cfg_.scoped_allocator_optimization()) {
-    optimizers->emplace_back(new ScopedAllocatorOptimizer(
+    optimizers->push_back(MakeUnique<ScopedAllocatorOptimizer>(
         cfg_.scoped_allocator_optimization(), cfg_.scoped_allocator_opts()));
   }
   return Status::OK();
@@ -382,8 +383,7 @@
       TF_RETURN_IF_ERROR(MakeFunctionDef(func_item, flib, &optimized_func));
 
       // Replace optimized function with a new FunctionDef.
-      TF_RETURN_IF_ERROR(flib.RemoveFunction(func_name));
-      TF_RETURN_IF_ERROR(flib.AddFunctionDef(optimized_func));
+      TF_RETURN_IF_ERROR(flib.ReplaceFunction(func_name, optimized_func));
     }
 
     // If optimized at least one function, update the graph library.
diff --git a/tensorflow/core/grappler/utils/functions.cc b/tensorflow/core/grappler/utils/functions.cc
index fd71406..462b752 100644
--- a/tensorflow/core/grappler/utils/functions.cc
+++ b/tensorflow/core/grappler/utils/functions.cc
@@ -303,12 +303,14 @@
 }
 
 GrapplerFunctionItem::GrapplerFunctionItem(
-    const string& func_name, const AttrValueMap& func_attr,
+    const string& func_name, const string& description,
+    const AttrValueMap& func_attr,
     const std::vector<InputArgExpansion>& input_arg_expansions,
     const std::vector<OutputArgExpansion>& output_arg_expansions,
     const std::vector<string>& keep_nodes, bool is_stateful,
     GraphDef&& function_body)
-    : func_attr_(func_attr),
+    : description_(description),
+      func_attr_(func_attr),
       input_arg_expansions_(input_arg_expansions),
       output_arg_expansions_(output_arg_expansions),
       is_stateful_(is_stateful) {
@@ -337,6 +339,8 @@
   }
 }
 
+const string& GrapplerFunctionItem::description() const { return description_; }
+
 const std::vector<InputArgExpansion>& GrapplerFunctionItem::inputs() const {
   return input_arg_expansions_;
 }
@@ -589,7 +593,7 @@
   bool is_stateful = signature.is_stateful();
 
   *item = GrapplerFunctionItem(
-      /*func_name=*/signature.name(),
+      /*func_name=*/signature.name(), /*description=*/signature.description(),
       /*func_attr=*/AttrValueMap(func.attr().begin(), func.attr().end()),
       inputs, outputs, keep_nodes, is_stateful, std::move(function_body));
   return Status::OK();
@@ -674,6 +678,7 @@
                        const FunctionLibraryDefinition& flib,
                        FunctionDef* func) {
   func->mutable_signature()->set_name(item.id);
+  func->mutable_signature()->set_description(item.description());
   func->mutable_signature()->set_is_stateful(item.is_stateful());
 
   // Build a GrapplerFunctionConnectivity from inputs and new function body.
diff --git a/tensorflow/core/grappler/utils/functions.h b/tensorflow/core/grappler/utils/functions.h
index 6227daa..9f607dc 100644
--- a/tensorflow/core/grappler/utils/functions.h
+++ b/tensorflow/core/grappler/utils/functions.h
@@ -137,12 +137,15 @@
  public:
   GrapplerFunctionItem() = default;
   GrapplerFunctionItem(
-      const string& func_name, const AttrValueMap& func_attr,
+      const string& func_name, const string& description,
+      const AttrValueMap& func_attr,
       const std::vector<InputArgExpansion>& input_arg_expansions,
       const std::vector<OutputArgExpansion>& output_arg_expansions,
       const std::vector<string>& keep_nodes, bool is_stateful,
       GraphDef&& function_body);
 
+  const string& description() const;
+
   bool IsInputPlaceholder(const string& node_name) const;
 
   const std::vector<InputArgExpansion>& inputs() const;
@@ -165,6 +168,7 @@
   friend Status ReplaceInputWithConst(const NodeDef&, int,
                                       GrapplerFunctionItem*);
 
+  string description_;
   AttrValueMap func_attr_;  // Attributes specific to function definition that
                             // produced this item (FuncDef.attr field).
 
diff --git a/tensorflow/core/grappler/utils/functions_test.cc b/tensorflow/core/grappler/utils/functions_test.cc
index 8c3cc70..b2d059e 100644
--- a/tensorflow/core/grappler/utils/functions_test.cc
+++ b/tensorflow/core/grappler/utils/functions_test.cc
@@ -734,6 +734,33 @@
   EXPECT_EQ("output:output:0", (*specialized.mutable_ret())["z"]);
 }
 
+TEST_F(FunctionsTest, FunctionDefGrapplerFunctionItemRoundTrip) {
+  FunctionDef func = FunctionDefHelper::Define(
+      // Name
+      "DoNothing",
+      // Args
+      {"i: int32"},
+      // Return values
+      {"o: int32"},
+      // Attr def
+      {},
+      // Nodes
+      {{{"o"}, "Identity", {"i"}, {{"T", DT_INT32}}}});
+
+  constexpr char description[] = "This is a helpful description.";
+  func.mutable_signature()->set_description(description);
+  FunctionLibraryDefinition flib(OpRegistry::Global(), FunctionDefLibrary());
+
+  GrapplerFunctionItem item;
+  std::unordered_map<string, AttrValue> func_attr;
+  func_attr["T"].set_type(DT_INT32);
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib, &item));
+
+  FunctionDef func2;
+  TF_EXPECT_OK(MakeFunctionDef(item, flib, &func2));
+  EXPECT_TRUE(FunctionDefsEqual(func, func2));
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index ed690fb..bb17511 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -52,6 +52,8 @@
 load(
     "//third_party/mkl:build_defs.bzl",
     "if_mkl",
+    "if_mkl_ml",
+    "mkl_deps",
 )
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
 
@@ -628,6 +630,7 @@
         ":gather_nd_op",
         ":gather_op",
         ":guarantee_const_op",
+        ":host_constant_op",
         ":identity_n_op",
         ":identity_op",
         ":inplace_ops",
@@ -650,7 +653,14 @@
         ":split_v_op",
         ":strided_slice_op",
         ":tile_ops",
-        ":transpose_op",
+    ] + if_mkl(
+        [
+            ":mkl_transpose_op",
+        ],
+        [
+            ":transpose_op",
+        ],
+    ) + [
         ":unique_op",
         ":unpack_op",
         ":unravel_index_op",
@@ -695,6 +705,12 @@
 )
 
 tf_kernel_library(
+    name = "host_constant_op",
+    prefix = "host_constant_op",
+    deps = ARRAY_DEPS,
+)
+
+tf_kernel_library(
     name = "diag_op",
     prefix = "diag_op",
     deps = ARRAY_DEPS,
@@ -887,18 +903,24 @@
     deps = ARRAY_DEPS,
 )
 
-tf_kernel_library(
-    name = "transpose_op",
-    srcs = [
-        "transpose_op.cc",
-    ] + if_mkl([
-        "mkl_transpose_op.cc",
-    ]),
-    hdrs = ["transpose_op.h"],
-    deps = ARRAY_DEPS + if_mkl([
-        "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ]),
+if_mkl(
+    [tf_mkl_kernel_library(
+        name = "mkl_transpose_op",
+        srcs = [
+            "mkl_transpose_op.cc",
+            "transpose_op.cc",
+        ],
+        hdrs = ["transpose_op.h"],
+        deps = ARRAY_DEPS + mkl_deps(),
+    )],
+    [tf_kernel_library(
+        name = "transpose_op",
+        srcs = [
+            "transpose_op.cc",
+        ],
+        hdrs = ["transpose_op.h"],
+        deps = ARRAY_DEPS,
+    )],
 )
 
 tf_kernel_library(
@@ -1286,6 +1308,7 @@
     srcs = ["gather_nd_op_test.cc"],
     deps = [
         ":gather_nd_op",
+        ":host_constant_op",
         ":ops_testutil",
         ":ops_util",
         "//tensorflow/core:core_cpu",
@@ -2542,6 +2565,7 @@
     # allow multiple definitions when linking this.
     linkopts = select({
         "//tensorflow:darwin": [],
+        "//tensorflow:windows": [],
         "//conditions:default": ["-Wl,-z,muldefs"],
     }),
     visibility = [":friends"],
@@ -2851,14 +2875,16 @@
 
 tf_kernel_library(
     name = "batch_matmul_op",
-    srcs = [] + if_mkl([
+    srcs = if_mkl_ml([
         "mkl_batch_matmul_op.cc",
     ]),
+    # <prefix>*impl.h are excluded by default from the CPU build, add explicitly.
+    hdrs = ["batch_matmul_op_impl.h"],
     # Override EIGEN_STRONG_INLINE to inline when --define=override_eigen_strong_inline=true,
     # to avoid long compiling time. See https://github.com/tensorflow/tensorflow/issues/10521
     copts = if_override_eigen_strong_inline(["/DEIGEN_STRONG_INLINE=inline"]),
     prefix = "batch_matmul_op",
-    deps = MATH_DEPS + if_mkl([
+    deps = MATH_DEPS + if_mkl_ml([
         "//third_party/mkl:intel_binary_blob",
     ]),
 )
@@ -2941,10 +2967,7 @@
             "@libxsmm_archive//:xsmm_avx",
         ],
         "//conditions:default": [],
-    }) + if_mkl([
-        "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ]) + if_cuda([
+    }) + mkl_deps() + if_cuda([
         "//tensorflow/core/platform/default/build_config:cublas_plugin",
     ]),
 )
@@ -3154,6 +3177,7 @@
         "//conditions:default": [],
     }),
     deps = [
+        ":host_constant_op",
         ":ops_testutil",
         ":ops_util",
         ":reduction_ops",
@@ -3289,6 +3313,7 @@
     srcs = ["diag_op_test.cc"],
     deps = [
         ":diag_op",
+        ":host_constant_op",
         ":ops_testutil",
         ":ops_util",
         "//tensorflow/core:core_cpu",
@@ -3616,6 +3641,7 @@
     name = "nn_ops_test",
     srcs = ["nn_ops_test.cc"],
     deps = [
+        ":host_constant_op",
         ":nn",
         ":ops_testutil",
         ":ops_util",
@@ -3763,6 +3789,7 @@
     srcs = ["spacetobatch_benchmark_test.cc"],
     deps = [
         ":batch_space_ops",
+        ":host_constant_op",
         ":ops_testutil",
         ":ops_util",
         "//tensorflow/core:core_cpu",
@@ -3902,6 +3929,7 @@
     size = "small",
     srcs = ["random_op_test.cc"],
     deps = [
+        ":host_constant_op",
         ":random_ops",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
@@ -4156,6 +4184,7 @@
         "sparse_xent_op_test.cc",
     ],
     deps = [
+        ":host_constant_op",
         ":ops_testutil",
         ":ops_util",
         ":sparse",
@@ -4369,6 +4398,7 @@
         ":regex_full_match_op",
         ":regex_replace_op",
         ":string_join_op",
+        ":string_length_op",
         ":string_split_op",
         ":string_strip_op",
         ":string_to_hash_bucket_op",
@@ -4404,6 +4434,12 @@
 )
 
 tf_kernel_library(
+    name = "string_length_op",
+    prefix = "string_length_op",
+    deps = STRING_DEPS,
+)
+
+tf_kernel_library(
     name = "regex_full_match_op",
     prefix = "regex_full_match_op",
     deps = STRING_DEPS + ["@com_googlesource_code_re2//:re2"],
@@ -4494,6 +4530,7 @@
     size = "small",
     srcs = ["multinomial_op_test.cc"],
     deps = [
+        ":host_constant_op",
         ":multinomial_op",
         ":ops_util",
         "//tensorflow/core:core_cpu",
@@ -4521,6 +4558,7 @@
     size = "small",
     srcs = ["parameterized_truncated_normal_op_test.cc"],
     deps = [
+        ":host_constant_op",
         ":ops_util",
         ":parameterized_truncated_normal_op",
         "//tensorflow/core:core_cpu",
@@ -4887,6 +4925,7 @@
         "fill_functor.cc",
         "fill_functor.h",
         "function_ops.cc",
+        "function_ops.h",
         "gather_functor.h",
         "gather_nd_op.cc",
         "gather_nd_op.h",
@@ -5378,6 +5417,18 @@
     alwayslink = 1,
 )
 
+cc_library(
+    name = "android_whole_file_read_ops",
+    srcs = if_android(["whole_file_read_ops.cc"]),
+    copts = tf_copts(),
+    linkopts = ["-ldl"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:android_tensorflow_lib_lite",
+    ],
+    alwayslink = 1,
+)
+
 #   Quantization-specific OpKernels
 
 tf_kernel_library(
@@ -6121,8 +6172,7 @@
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:nn_ops_op_lib",
-        "//third_party/mkl:intel_binary_blob",
-    ] + if_mkl(["@mkl_dnn"]),
+    ] + mkl_deps(),
 )
 
 tf_mkl_kernel_library(
@@ -6136,8 +6186,7 @@
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:nn_ops_op_lib",
-        "//third_party/mkl:intel_binary_blob",
-    ] + if_mkl(["@mkl_dnn"]),
+    ] + mkl_deps(),
 )
 
 tf_mkl_kernel_library(
@@ -6152,8 +6201,7 @@
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:nn_ops_op_lib",
-        "//third_party/mkl:intel_binary_blob",
-    ] + if_mkl(["@mkl_dnn"]),
+    ] + mkl_deps(),
 )
 
 tf_mkl_kernel_library(
@@ -6172,8 +6220,7 @@
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:nn_ops_op_lib",
-        "//third_party/mkl:intel_binary_blob",
-    ] + if_mkl(["@mkl_dnn"]),
+    ] + mkl_deps(),
 )
 
 tf_mkl_kernel_library(
@@ -6188,8 +6235,7 @@
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:nn_ops_op_lib",
         "//third_party/eigen3",
-        "//third_party/mkl:intel_binary_blob",
-    ] + if_mkl(["@mkl_dnn"]),
+    ] + mkl_deps(),
 )
 
 tf_mkl_kernel_library(
@@ -6204,56 +6250,43 @@
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:nn_ops_op_lib",
         "//third_party/eigen3",
-        "//third_party/mkl:intel_binary_blob",
-    ] + if_mkl(["@mkl_dnn"]),
+    ] + mkl_deps(),
 )
 
 tf_mkl_kernel_library(
     name = "mkl_fused_batch_norm_op",
     srcs = ["mkl_fused_batch_norm_op.cc"],
-    deps = NN_DEPS + [
-        "//third_party/mkl:intel_binary_blob",
-    ] + if_mkl(["@mkl_dnn"]),
+    deps = NN_DEPS + mkl_deps(),
 )
 
 tf_mkl_kernel_library(
     name = "mkl_aggregate_ops",
     prefix = "mkl_aggregate_ops",
-    deps = MATH_DEPS + [
-        "//third_party/mkl:intel_binary_blob",
-    ] + if_mkl(["@mkl_dnn"]),
+    deps = MATH_DEPS + mkl_deps(),
 )
 
 tf_mkl_kernel_library(
     name = "mkl_concat_op",
     prefix = "mkl_concat_op",
-    deps = ARRAY_DEPS + [
-        "//third_party/mkl:intel_binary_blob",
-    ] + if_mkl(["@mkl_dnn"]),
+    deps = ARRAY_DEPS + mkl_deps(),
 )
 
 tf_mkl_kernel_library(
     name = "mkl_reshape_op",
     prefix = "mkl_reshape_op",
-    deps = ARRAY_DEPS + [
-        "//third_party/mkl:intel_binary_blob",
-    ] + if_mkl(["@mkl_dnn"]),
+    deps = ARRAY_DEPS + mkl_deps(),
 )
 
 tf_mkl_kernel_library(
     name = "mkl_identity_op",
     prefix = "mkl_identity_op",
-    deps = ARRAY_DEPS + [
-        "//third_party/mkl:intel_binary_blob",
-    ] + if_mkl(["@mkl_dnn"]),
+    deps = ARRAY_DEPS + mkl_deps(),
 )
 
 tf_mkl_kernel_library(
     name = "mkl_lrn_op",
     prefix = "mkl_lrn_op",
-    deps = NN_DEPS + [
-        "//third_party/mkl:intel_binary_blob",
-    ] + if_mkl(["@mkl_dnn"]),
+    deps = NN_DEPS + mkl_deps(),
 )
 
 tf_mkl_kernel_library(
@@ -6264,10 +6297,7 @@
         "cwise_ops_gradients.h",
     ],
     prefix = "mkl_cwise_ops_common",
-    deps = NN_DEPS + [
-        "cwise_op",
-        "//third_party/mkl:intel_binary_blob",
-    ],
+    deps = NN_DEPS + mkl_deps() + [":cwise_op"],
 )
 
 # NOTE(lespeholt): This rule is deprecated, please use:
diff --git a/tensorflow/core/kernels/as_string_op.cc b/tensorflow/core/kernels/as_string_op.cc
index a7757d1..e6d6c40 100644
--- a/tensorflow/core/kernels/as_string_op.cc
+++ b/tensorflow/core/kernels/as_string_op.cc
@@ -47,6 +47,7 @@
       case DT_FLOAT:
       case DT_DOUBLE:
       case DT_COMPLEX64:
+      case DT_COMPLEX128:
         break;
       default:
         OP_REQUIRES(ctx, !(scientific || shortest),
@@ -83,6 +84,7 @@
       case DT_FLOAT:
       case DT_DOUBLE:
       case DT_COMPLEX64:
+      case DT_COMPLEX128:
         if (shortest) {
           strings::Appendf(&format_, "g");
         } else if (scientific) {
@@ -100,7 +102,7 @@
                                             DataTypeString(dtype)));
     }
 
-    if (dtype == DT_COMPLEX64) {
+    if (dtype == DT_COMPLEX64 || dtype == DT_COMPLEX128) {
       format_ = strings::Printf("(%s,%s)", format_.c_str(), format_.c_str());
     }
   }
@@ -144,6 +146,13 @@
               format_.c_str(), input_flat(i).real(), input_flat(i).imag());
         }
       } break;
+      case (DT_COMPLEX128): {
+        const auto& input_flat = input_tensor->flat<complex128>();
+        for (int i = 0; i < input_flat.size(); ++i) {
+          output_flat(i) = strings::Printf(
+              format_.c_str(), input_flat(i).real(), input_flat(i).imag());
+        }
+      } break;
       default:
         bool can_encode_type = false;
         OP_REQUIRES(context, can_encode_type,
diff --git a/tensorflow/core/kernels/batch_matmul_op_complex.cc b/tensorflow/core/kernels/batch_matmul_op_complex.cc
index b77c80c..54c45bf 100644
--- a/tensorflow/core/kernels/batch_matmul_op_complex.cc
+++ b/tensorflow/core/kernels/batch_matmul_op_complex.cc
@@ -17,7 +17,7 @@
 
 namespace tensorflow {
 
-#if !defined(INTEL_MKL) || defined(DO_NOT_USE_ML)
+#if !defined(INTEL_MKL) || defined(INTEL_MKL_DNN_ONLY)
 TF_CALL_complex64(REGISTER_BATCH_MATMUL_CPU);
 TF_CALL_complex128(REGISTER_BATCH_MATMUL_CPU);
 #endif
diff --git a/tensorflow/core/kernels/batch_matmul_op_impl.h b/tensorflow/core/kernels/batch_matmul_op_impl.h
index 475bda8..766713a 100644
--- a/tensorflow/core/kernels/batch_matmul_op_impl.h
+++ b/tensorflow/core/kernels/batch_matmul_op_impl.h
@@ -15,6 +15,9 @@
 
 // See docs in ../ops/math_ops.cc.
 
+#ifndef TENSORFLOW_CORE_KERNELS_BATCH_MATMUL_OP_IMPL_H_
+#define TENSORFLOW_CORE_KERNELS_BATCH_MATMUL_OP_IMPL_H_
+
 #define EIGEN_USE_THREADS
 
 #include <vector>
@@ -613,3 +616,5 @@
       BatchMatMul<SYCLDevice, TYPE>)
 #endif  // TENSORFLOW_USE_SYCL
 }  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_BATCH_MATMUL_OP_IMPL_H_
diff --git a/tensorflow/core/kernels/batch_matmul_op_real.cc b/tensorflow/core/kernels/batch_matmul_op_real.cc
index fe259c1..584b507 100644
--- a/tensorflow/core/kernels/batch_matmul_op_real.cc
+++ b/tensorflow/core/kernels/batch_matmul_op_real.cc
@@ -21,7 +21,7 @@
 
 namespace tensorflow {
 
-#if !defined(INTEL_MKL) || defined(DO_NOT_USE_ML)
+#if !defined(INTEL_MKL) || defined(INTEL_MKL_DNN_ONLY)
 TF_CALL_float(REGISTER_BATCH_MATMUL_CPU);
 TF_CALL_double(REGISTER_BATCH_MATMUL_CPU);
 #endif
@@ -31,8 +31,7 @@
 #if GOOGLE_CUDA
 TF_CALL_float(REGISTER_BATCH_MATMUL_GPU);
 TF_CALL_double(REGISTER_BATCH_MATMUL_GPU);
-// TODO(csigg): Implement Stream::ThenBlasGemv for Eigen::half and uncomment.
-// TF_CALL_half(REGISTER_BATCH_MATMUL_GPU);
+TF_CALL_half(REGISTER_BATCH_MATMUL_GPU);
 #endif  // GOOGLE_CUDA
 
 #ifdef TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/kernels/concat_lib_cpu.h b/tensorflow/core/kernels/concat_lib_cpu.h
index 720b506..29f3a42 100644
--- a/tensorflow/core/kernels/concat_lib_cpu.h
+++ b/tensorflow/core/kernels/concat_lib_cpu.h
@@ -13,6 +13,9 @@
 limitations under the License.
 ==============================================================================*/
 
+#ifndef TENSORFLOW_CORE_KERNELS_CONCAT_LIB_CPU_H_
+#define TENSORFLOW_CORE_KERNELS_CONCAT_LIB_CPU_H_
+
 #define EIGEN_USE_THREADS
 
 #include <vector>
@@ -162,3 +165,5 @@
 }
 #endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_CONCAT_LIB_CPU_H_
diff --git a/tensorflow/core/kernels/concat_op.cc b/tensorflow/core/kernels/concat_op.cc
index 902327a..ff62983 100644
--- a/tensorflow/core/kernels/concat_op.cc
+++ b/tensorflow/core/kernels/concat_op.cc
@@ -66,16 +66,17 @@
     // In case of ConcatV2, "axis" could be int32 or int64
     if (AxisArgName == NAME_IS_AXIS) {
       OP_REQUIRES(
-          c, (concat_dim_tensor->dtype() == DT_INT32 ||
-              concat_dim_tensor->dtype() == DT_INT64),
+          c,
+          (concat_dim_tensor->dtype() == DT_INT32 ||
+           concat_dim_tensor->dtype() == DT_INT64),
           errors::InvalidArgument(axis_attribute_name,
                                   " tensor should be int32 or int64, but got ",
-                                  concat_dim_tensor->dtype()));
+                                  DataTypeString(concat_dim_tensor->dtype())));
     } else {
       OP_REQUIRES(c, (concat_dim_tensor->dtype() == DT_INT32),
-                  errors::InvalidArgument(axis_attribute_name,
-                                          " tensor should be int32, but got ",
-                                          concat_dim_tensor->dtype()));
+                  errors::InvalidArgument(
+                      axis_attribute_name, " tensor should be int32, but got ",
+                      DataTypeString(concat_dim_tensor->dtype())));
     }
     if (concat_dim_tensor->dtype() == DT_INT32) {
       concat_dim =
diff --git a/tensorflow/core/kernels/constant_op.cc b/tensorflow/core/kernels/constant_op.cc
index a888422..426c404 100644
--- a/tensorflow/core/kernels/constant_op.cc
+++ b/tensorflow/core/kernels/constant_op.cc
@@ -140,44 +140,6 @@
 #undef REGISTER_SYCL_KERNEL
 #endif
 
-HostConstantOp::HostConstantOp(OpKernelConstruction* ctx)
-    : OpKernel(ctx), tensor_(ctx->output_type(0)) {
-  const TensorProto* proto = nullptr;
-  AllocatorAttributes alloc_attr;
-  alloc_attr.set_on_host(true);
-  OP_REQUIRES_OK(ctx, ctx->GetAttr("value", &proto));
-  OP_REQUIRES_OK(
-      ctx, ctx->device()->MakeTensorFromProto(*proto, alloc_attr, &tensor_));
-  OP_REQUIRES(
-      ctx, ctx->output_type(0) == tensor_.dtype(),
-      errors::InvalidArgument("Type mismatch between value (",
-                              DataTypeString(tensor_.dtype()), ") and dtype (",
-                              DataTypeString(ctx->output_type(0)), ")"));
-}
-
-void HostConstantOp::Compute(OpKernelContext* ctx) {
-  ctx->set_output(0, tensor_);
-}
-
-#if GOOGLE_CUDA
-// A special GPU kernel for int32.
-// TODO(b/25387198): Also enable int32 in device memory. This kernel
-// registration requires all int32 inputs and outputs to be in host memory.
-REGISTER_KERNEL_BUILDER(Name("Const")
-                            .Device(DEVICE_GPU)
-                            .HostMemory("output")
-                            .TypeConstraint<int32>("dtype"),
-                        HostConstantOp);
-#endif
-
-#ifdef TENSORFLOW_USE_SYCL
-REGISTER_KERNEL_BUILDER(Name("Const")
-                            .Device(DEVICE_SYCL)
-                            .HostMemory("output")
-                            .TypeConstraint<int32>("dtype"),
-                        HostConstantOp);
-#endif  // TENSORFLOW_USE_SYCL
-
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
@@ -297,8 +259,9 @@
           errors::InvalidArgument("ZerosLike non-scalar Tensor with "
                                   "dtype=DT_VARIANT is not supported."));
       const Variant& v = input.scalar<Variant>()();
-      Tensor out(ctx->device()->GetAllocator(AllocatorAttributes()), DT_VARIANT,
-                 TensorShape({}));
+      // DT_VARIANT tensors must be allocated on CPU since they wrap C++
+      // objects which can not be efficiently represented in GPU memory.
+      Tensor out(cpu_allocator(), DT_VARIANT, TensorShape({}));
       Variant* out_v = &(out.scalar<Variant>()());
       OP_REQUIRES_OK(ctx, UnaryOpVariant<Device>(
                               ctx, ZEROS_LIKE_VARIANT_UNARY_OP, v, out_v));
diff --git a/tensorflow/core/kernels/constant_op.h b/tensorflow/core/kernels/constant_op.h
index b98153e..77ba441 100644
--- a/tensorflow/core/kernels/constant_op.h
+++ b/tensorflow/core/kernels/constant_op.h
@@ -13,8 +13,8 @@
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_CONSTANT_OP_H_
-#define TENSORFLOW_KERNELS_CONSTANT_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_CONSTANT_OP_H_
+#define TENSORFLOW_CORE_KERNELS_CONSTANT_OP_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -36,20 +36,6 @@
   TF_DISALLOW_COPY_AND_ASSIGN(ConstantOp);
 };
 
-// HostConstantOp differs from ConstantOp in that its output is always
-// in host memory.
-class HostConstantOp : public OpKernel {
- public:
-  explicit HostConstantOp(OpKernelConstruction* ctx);
-  void Compute(OpKernelContext* ctx) override;
-  bool IsExpensive() override { return false; }
-  ~HostConstantOp() override {}
-
- private:
-  Tensor tensor_;
-  TF_DISALLOW_COPY_AND_ASSIGN(HostConstantOp);
-};
-
 class PlaceholderOp : public OpKernel {
  public:
   explicit PlaceholderOp(OpKernelConstruction* ctx);
@@ -61,4 +47,4 @@
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_CONSTANT_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_CONSTANT_OP_H_
diff --git a/tensorflow/core/kernels/constant_op_test.cc b/tensorflow/core/kernels/constant_op_test.cc
index a6baae7..0faad11 100644
--- a/tensorflow/core/kernels/constant_op_test.cc
+++ b/tensorflow/core/kernels/constant_op_test.cc
@@ -60,6 +60,7 @@
   std::unique_ptr<OpKernel> op(CreateOpKernel(device_type, device.get(),
                                               cpu_allocator(), const_node,
                                               TF_GRAPH_DEF_VERSION, &status));
+  TF_ASSERT_OK(status);
 
   OpKernelContext::Params params;
   params.device = device.get();
diff --git a/tensorflow/core/kernels/conv_grad_ops.cc b/tensorflow/core/kernels/conv_grad_ops.cc
index 5bf709a..fc0a2f1 100644
--- a/tensorflow/core/kernels/conv_grad_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_ops.cc
@@ -63,7 +63,7 @@
     return errors::InvalidArgument(
         label, ": Size of out_backprop doesn't match computed: ", "actual = ",
         dim->output_size, ", computed = ", out_size,
-        "spatial_dim: ", spatial_dim, " input: ", dim->input_size,
+        " spatial_dim: ", spatial_dim, " input: ", dim->input_size,
         " filter: ", dim->filter_size, " output: ", dim->output_size,
         " stride: ", dim->stride, " dilation: ", dim->dilation);
   }
diff --git a/tensorflow/core/kernels/cuda_solvers.h b/tensorflow/core/kernels/cuda_solvers.h
index b2e8ee2..2c30d03 100644
--- a/tensorflow/core/kernels/cuda_solvers.h
+++ b/tensorflow/core/kernels/cuda_solvers.h
@@ -14,6 +14,9 @@
 ==============================================================================
 */
 
+#ifndef TENSORFLOW_CORE_KERNELS_CUDA_SOLVERS_H_
+#define TENSORFLOW_CORE_KERNELS_CUDA_SOLVERS_H_
+
 // This header declares the class CudaSolver, which contains wrappers of linear
 // algebra solvers in the cuBlas and cuSolverDN libraries for use in TensorFlow
 // kernels.
@@ -433,3 +436,5 @@
 }  // namespace tensorflow
 
 #endif  // GOOGLE_CUDA
+
+#endif  // TENSORFLOW_CORE_KERNELS_CUDA_SOLVERS_H_
diff --git a/tensorflow/core/kernels/cwise_op_div.cc b/tensorflow/core/kernels/cwise_op_div.cc
index b12652f..d6a2403 100644
--- a/tensorflow/core/kernels/cwise_op_div.cc
+++ b/tensorflow/core/kernels/cwise_op_div.cc
@@ -24,6 +24,9 @@
           int32, int64);
 REGISTER6(BinaryOp, CPU, "RealDiv", functor::div, float, Eigen::half, double,
           bfloat16, complex64, complex128);
+REGISTER5(BinaryOp, CPU, "UnsafeDiv", functor::unsafe_div, float, double, int16,
+          int32, int64);
+
 #if GOOGLE_CUDA
 REGISTER9(BinaryOp, GPU, "Div", functor::div, float, Eigen::half, double, uint8,
           uint16, int16, int64, complex64, complex128);
diff --git a/tensorflow/core/kernels/cwise_op_select.cc b/tensorflow/core/kernels/cwise_op_select.cc
index e259daa..d6988a5 100644
--- a/tensorflow/core/kernels/cwise_op_select.cc
+++ b/tensorflow/core/kernels/cwise_op_select.cc
@@ -22,6 +22,7 @@
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/cwise_ops_common.h"
+#include "tensorflow/core/platform/prefetch.h"
 
 namespace tensorflow {
 
@@ -32,6 +33,11 @@
 typedef Eigen::SyclDevice SYCLDevice;
 #endif  // TENSORFLOW_USE_SYCL
 
+namespace functor {
+template <typename Device, typename T>
+struct SelectScalarHandler;
+}  // namespace functor
+
 template <typename Device, typename T>
 class SelectOp : public OpKernel {
  public:
@@ -130,16 +136,8 @@
             then->shape().DebugString(), " vs. ",
             else_->shape().DebugString()));
 
-    Tensor* output = nullptr;
-    OP_REQUIRES_OK(ctx, ctx->forward_input_or_allocate_output(
-                            {"t", "e"}, "output", then->shape(), &output));
-
-    if (output->NumElements() > 0) {
-      functor::SelectScalarFunctor<Device, T> func;
-      TTypes<bool>::ConstScalar cond_scalar = cond->scalar<bool>();
-      func(ctx->eigen_device<Device>(), output->flat<T>(), cond_scalar,
-           then->flat<T>(), else_->flat<T>());
-    }
+    functor::SelectScalarHandler<Device, T> handler;
+    handler(ctx, cond, then, else_);
   }
 
  private:
@@ -208,6 +206,40 @@
 #endif  // TENSORFLOW_USE_SYCL
 
 template <typename Device, typename T>
+struct SelectScalarHandler {
+  void operator()(OpKernelContext* ctx, const Tensor* cond, const Tensor* then,
+                  const Tensor* else_) {
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->forward_input_or_allocate_output(
+                            {"t", "e"}, "output", then->shape(), &output));
+
+    if (output->NumElements() > 0) {
+      functor::SelectScalarFunctor<Device, T> func;
+      TTypes<bool>::ConstScalar cond_scalar = cond->scalar<bool>();
+      func(ctx->eigen_device<Device>(), output->flat<T>(), cond_scalar,
+           then->flat<T>(), else_->flat<T>());
+    }
+  }
+};
+
+// Specilization for CPU device. Forward input to output depending on the `cond`
+// value.
+// TODO(sjhwang): Consider specializing for GPUDevice as well by using
+// GPUDevice::memcpyDeviceToHost() to fetch bool value.
+template <typename T>
+struct SelectScalarHandler<CPUDevice, T> {
+  void operator()(OpKernelContext* ctx, const Tensor* cond, const Tensor* then,
+                  const Tensor* else_) {
+    if (cond->scalar<bool>()()) {
+      OP_REQUIRES_OK(ctx, ctx->set_output("output", *then));
+    } else {
+      OP_REQUIRES_OK(ctx, ctx->set_output("output", *else_));
+    }
+  }
+};
+
+#ifdef TENSORFLOW_USE_SYCL
+template <typename Device, typename T>
 struct SelectScalarFunctorBase {
   void operator()(const Device& d, typename TTypes<T>::Flat out,
                   TTypes<bool>::ConstScalar cond,
@@ -217,11 +249,6 @@
   }
 };
 
-// CPU Specializations of Select functors with scalar
-template <typename T>
-struct SelectScalarFunctor<CPUDevice, T>
-    : SelectScalarFunctorBase<CPUDevice, T> {};
-#ifdef TENSORFLOW_USE_SYCL
 template <typename T>
 struct SelectScalarFunctor<SYCLDevice, T>
     : SelectScalarFunctorBase<SYCLDevice, T> {};
@@ -254,9 +281,48 @@
   }
 };
 
+// A fast implementation on CPU, using loop to get rid of broadcasting.
 template <typename T>
-struct BatchSelectFunctor<CPUDevice, T> : BatchSelectFunctorBase<CPUDevice, T> {
+struct BatchSelectFunctor<CPUDevice, T> {
+  void operator()(const CPUDevice& d,
+                  typename TTypes<T>::Matrix output_flat_outer_dims,
+                  TTypes<bool>::ConstVec cond_vec,
+                  typename TTypes<T>::ConstMatrix then_flat_outer_dims,
+                  typename TTypes<T>::ConstMatrix else_flat_outer_dims) {
+    const size_t batch = cond_vec.size();
+    const size_t batch_size = then_flat_outer_dims.size() / batch;
+    T* output = output_flat_outer_dims.data();
+    const bool* c = cond_vec.data();
+    const T* t = then_flat_outer_dims.data();
+    const T* e = else_flat_outer_dims.data();
+
+    auto work = [batch_size, output, c, t, e](int64 start, int64 end) {
+      for (size_t i = start; i < end; ++i) {
+        size_t offset = i * batch_size;
+        port::prefetch<port::PREFETCH_HINT_NTA>(
+            reinterpret_cast<const void*>(&t[offset + batch_size]));
+        port::prefetch<port::PREFETCH_HINT_NTA>(
+            reinterpret_cast<const void*>(&e[offset + batch_size]));
+        port::prefetch<port::PREFETCH_HINT_NTA>(
+            reinterpret_cast<const void*>(&c[i + 1]));
+        if (c[i]) {
+          for (size_t j = 0; j < batch_size; ++j) {
+            output[offset + j] = t[offset + j];
+          }
+        } else {
+          for (size_t j = 0; j < batch_size; ++j) {
+            output[offset + j] = e[offset + j];
+          }
+        }
+      }
+    };
+    auto cost = Eigen::TensorOpCost(sizeof(T) * batch_size * 2,  // ld bytes
+                                    sizeof(T) * batch_size,      // st bytes
+                                    batch_size);  // compute cycles
+    d.parallelFor(batch, cost, work);
+  }
 };
+
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T>
 struct BatchSelectFunctor<SYCLDevice, T>
diff --git a/tensorflow/core/kernels/cwise_ops.h b/tensorflow/core/kernels/cwise_ops.h
index 1b1a704..1014519 100644
--- a/tensorflow/core/kernels/cwise_ops.h
+++ b/tensorflow/core/kernels/cwise_ops.h
@@ -153,6 +153,27 @@
   };
 };
 
+template <typename T>
+struct unsafe_div_op {
+  EIGEN_EMPTY_STRUCT_CTOR(unsafe_div_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()(const T& a,
+                                                           const T& b) const {
+    if (b != 0) {
+      return scalar_quotient_op<T>()(a, b);
+    } else {
+      return 0;
+    }
+  }
+};
+
+template <typename T>
+struct functor_traits<unsafe_div_op<T>> {
+  enum {
+    Cost = functor_traits<scalar_quotient_op<T>>::Cost + NumTraits<T>::AddCost,
+    PacketAccess = false,
+  };
+};
+
 // scalar_left and scalar_right are template helpers to partially
 // apply a binary function.
 //
@@ -721,6 +742,9 @@
 };
 
 template <typename T>
+struct unsafe_div : base<T, Eigen::internal::unsafe_div_op<T>> {};
+
+template <typename T>
 struct fmod : base<T, Eigen::internal::scalar_fmod_op<T>> {};
 
 template <typename T>
diff --git a/tensorflow/core/kernels/cwise_ops_gpu_common.cu.h b/tensorflow/core/kernels/cwise_ops_gpu_common.cu.h
index 965e42d..cfae273 100644
--- a/tensorflow/core/kernels/cwise_ops_gpu_common.cu.h
+++ b/tensorflow/core/kernels/cwise_ops_gpu_common.cu.h
@@ -17,8 +17,8 @@
 #error This file must only be included when building with Cuda support
 #endif
 
-#ifndef TENSORFLOW_KERNELS_CWISE_OPS_GPU_COMMON_CU_H_
-#define TENSORFLOW_KERNELS_CWISE_OPS_GPU_COMMON_CU_H_
+#ifndef TENSORFLOW_CORE_KERNELS_CWISE_OPS_GPU_COMMON_CU_H_
+#define TENSORFLOW_CORE_KERNELS_CWISE_OPS_GPU_COMMON_CU_H_
 
 #define EIGEN_USE_GPU
 
@@ -188,4 +188,4 @@
 }  // end namespace functor
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_CWISE_OPS_GPU_COMMON_CU_H_
+#endif  // TENSORFLOW_CORE_KERNELS_CWISE_OPS_GPU_COMMON_CU_H_
diff --git a/tensorflow/core/kernels/cwise_ops_gpu_gradients.cu.h b/tensorflow/core/kernels/cwise_ops_gpu_gradients.cu.h
index e81b840..15e5de0 100644
--- a/tensorflow/core/kernels/cwise_ops_gpu_gradients.cu.h
+++ b/tensorflow/core/kernels/cwise_ops_gpu_gradients.cu.h
@@ -17,8 +17,8 @@
 #error This file must only be included when building with Cuda support
 #endif
 
-#ifndef TENSORFLOW_KERNELS_CWISE_OPS_GPU_GRADIENTS_CU_H_
-#define TENSORFLOW_KERNELS_CWISE_OPS_GPU_GRADIENTS_CU_H_
+#ifndef TENSORFLOW_CORE_KERNELS_CWISE_OPS_GPU_GRADIENTS_CU_H_
+#define TENSORFLOW_CORE_KERNELS_CWISE_OPS_GPU_GRADIENTS_CU_H_
 
 #define EIGEN_USE_GPU
 
@@ -68,4 +68,4 @@
 }  // end namespace functor
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_CWISE_OPS_GPU_GRADIENTS_CU_H_
+#endif  // TENSORFLOW_CORE_KERNELS_CWISE_OPS_GPU_GRADIENTS_CU_H_
diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD
index d2b3c15..607a694 100644
--- a/tensorflow/core/kernels/data/BUILD
+++ b/tensorflow/core/kernels/data/BUILD
@@ -251,6 +251,7 @@
 tf_kernel_library(
     name = "generator_dataset_op",
     srcs = ["generator_dataset_op.cc"],
+    hdrs = ["generator_dataset_op.h"],
     deps = [
         ":captured_function",
         "//tensorflow/core:core_cpu_internal",
@@ -343,6 +344,7 @@
 tf_kernel_library(
     name = "prefetch_dataset_op",
     srcs = ["prefetch_dataset_op.cc"],
+    hdrs = ["prefetch_dataset_op.h"],
     deps = [
         ":dataset",
         ":prefetch_autotuner",
@@ -564,6 +566,7 @@
 tf_kernel_library(
     name = "iterator_ops",
     srcs = ["iterator_ops.cc"],
+    hdrs = ["iterator_ops.h"],
     deps = [
         ":dataset",
         ":dataset_utils",
@@ -659,6 +662,7 @@
         ":iterator_ops",
         ":map_and_batch_dataset_op",
         ":map_dataset_op",
+        ":map_defun_op",
         ":optimize_dataset_op",
         ":optional_ops",
         ":padded_batch_dataset_op",
@@ -701,3 +705,15 @@
         "//tensorflow/core/kernels:ops_util",
     ],
 )
+
+tf_kernel_library(
+    name = "map_defun_op",
+    srcs = ["map_defun_op.cc"],
+    deps = [
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:functional_ops_op_lib",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
diff --git a/tensorflow/core/kernels/data/batch_dataset_op.cc b/tensorflow/core/kernels/data/batch_dataset_op.cc
index 58b86f2..f9b5353 100644
--- a/tensorflow/core/kernels/data/batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/batch_dataset_op.cc
@@ -49,11 +49,11 @@
   }
 
  private:
-  class Dataset : public GraphDatasetBase {
+  class Dataset : public DatasetBase {
    public:
     Dataset(OpKernelContext* ctx, int64 batch_size, bool drop_remainder,
             const DatasetBase* input)
-        : GraphDatasetBase(ctx),
+        : DatasetBase(DatasetContext(ctx)),
           batch_size_(batch_size),
           drop_remainder_(drop_remainder),
           input_(input) {
@@ -96,10 +96,11 @@
     }
 
    protected:
-    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
                               Node** output) const override {
       Node* input_graph_node = nullptr;
-      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
       Node* batch_size = nullptr;
       TF_RETURN_IF_ERROR(b->AddScalar(batch_size_, &batch_size));
       Node* drop_remainder = nullptr;
@@ -203,7 +204,7 @@
           TF_RETURN_IF_ERROR(
               writer->WriteScalar(full_name("input_impl_empty"), ""));
         } else {
-          TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+          TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
         }
         return Status::OK();
       }
@@ -212,7 +213,7 @@
                              IteratorStateReader* reader) override {
         mutex_lock l(mu_);
         if (!reader->Contains(full_name("input_impl_empty"))) {
-          TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+          TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
         } else {
           input_impl_.reset();
         }
diff --git a/tensorflow/core/kernels/data/cache_dataset_ops.cc b/tensorflow/core/kernels/data/cache_dataset_ops.cc
index 86b0840..6ca0bcd 100644
--- a/tensorflow/core/kernels/data/cache_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/cache_dataset_ops.cc
@@ -46,11 +46,11 @@
   }
 
  private:
-  class FileDataset : public GraphDatasetBase {
+  class FileDataset : public DatasetBase {
    public:
     explicit FileDataset(OpKernelContext* ctx, const DatasetBase* input,
                          string filename, Env* env)
-        : GraphDatasetBase(ctx),
+        : DatasetBase(DatasetContext(ctx)),
           input_(input),
           filename_(std::move(filename)),
           env_(env),
@@ -85,10 +85,11 @@
     }
 
    protected:
-    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
                               Node** output) const override {
       Node* input_graph = nullptr;
-      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph));
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph));
       Node* filename = nullptr;
       TF_RETURN_IF_ERROR(b->AddScalar(filename_, &filename));
       TF_RETURN_IF_ERROR(b->AddDataset(this, {input_graph, filename}, output));
@@ -135,7 +136,7 @@
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
         TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("mode"), mode_));
-        return SaveParent(writer, iterator_);
+        return SaveInput(writer, iterator_);
       }
       Status RestoreInternal(IteratorContext* ctx,
                              IteratorStateReader* reader) override {
@@ -162,7 +163,7 @@
         }
         InitializeIterator();
         TF_RETURN_IF_ERROR(iterator_->Initialize(ctx));
-        return RestoreParent(ctx, reader, iterator_);
+        return RestoreInput(ctx, reader, iterator_);
       }
 
      private:
@@ -269,7 +270,7 @@
             lockfile_ = strings::StrCat(filename_, ".lockfile");
             lockfile_created_ = false;
           }
-          TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+          TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
           TF_RETURN_IF_ERROR(
               writer->WriteScalar(full_name("cur_index"), cur_index_));
           TF_RETURN_IF_ERROR(
@@ -285,7 +286,7 @@
             return Status::OK();
           }
 
-          TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+          TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
           int64 temp;
           // TODO(b/78048575): Update this when saving size_t tensors directly
           // is supported.
@@ -538,10 +539,12 @@
     const string tensor_format_string_;
   };  // FileDataset
 
-  class MemoryDataset : public GraphDatasetBase {
+  class MemoryDataset : public DatasetBase {
    public:
     explicit MemoryDataset(OpKernelContext* ctx, const DatasetBase* input)
-        : GraphDatasetBase(ctx), input_(input), cache_(new MemoryCache()) {
+        : DatasetBase(DatasetContext(ctx)),
+          input_(input),
+          cache_(new MemoryCache()) {
       input->Ref();
     }
 
@@ -566,10 +569,11 @@
     }
 
    protected:
-    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
                               Node** output) const override {
       Node* input_node = nullptr;
-      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_node));
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_node));
       Node* filename_node = nullptr;
       TF_RETURN_IF_ERROR(b->AddScalar(string(""), &filename_node));
       TF_RETURN_IF_ERROR(
@@ -702,7 +706,7 @@
                 writer->WriteScalar(full_name("cache_completed"), ""));
           }
         }
-        return SaveParent(writer, iterator_);
+        return SaveInput(writer, iterator_);
       }
 
       Status RestoreInternal(IteratorContext* ctx,
@@ -748,7 +752,7 @@
         }
         InitializeIterator();
         TF_RETURN_IF_ERROR(iterator_->Initialize(ctx));
-        return RestoreParent(ctx, reader, iterator_);
+        return RestoreInput(ctx, reader, iterator_);
       }
 
      private:
@@ -795,13 +799,13 @@
        protected:
         Status SaveInternal(IteratorStateWriter* writer) override {
           mutex_lock l(mu_);
-          return SaveParent(writer, input_impl_);
+          return SaveInput(writer, input_impl_);
         }
 
         Status RestoreInternal(IteratorContext* ctx,
                                IteratorStateReader* reader) override {
           mutex_lock l(mu_);
-          return RestoreParent(ctx, reader, input_impl_);
+          return RestoreInput(ctx, reader, input_impl_);
         }
 
        private:
diff --git a/tensorflow/core/kernels/data/concatenate_dataset_op.cc b/tensorflow/core/kernels/data/concatenate_dataset_op.cc
index 0012a47..c361a9a 100644
--- a/tensorflow/core/kernels/data/concatenate_dataset_op.cc
+++ b/tensorflow/core/kernels/data/concatenate_dataset_op.cc
@@ -39,11 +39,11 @@
   }
 
  private:
-  class Dataset : public GraphDatasetBase {
+  class Dataset : public DatasetBase {
    public:
     explicit Dataset(OpKernelContext* ctx, const DatasetBase* input,
                      const DatasetBase* to_concatenate)
-        : GraphDatasetBase(ctx),
+        : DatasetBase(DatasetContext(ctx)),
           input_(input),
           to_concatenate_(to_concatenate) {
       input_->Ref();
@@ -80,13 +80,14 @@
     }
 
    protected:
-    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
                               Node** output) const override {
       Node* input_graph = nullptr;
-      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph));
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph));
       Node* to_concatenate_graph = nullptr;
       TF_RETURN_IF_ERROR(
-          b->AddParentDataset(ctx, to_concatenate_, &to_concatenate_graph));
+          b->AddInputDataset(ctx, to_concatenate_, &to_concatenate_graph));
       TF_RETURN_IF_ERROR(
           b->AddDataset(this, {input_graph, to_concatenate_graph}, output));
       return Status::OK();
@@ -132,7 +133,7 @@
         mutex_lock l(mu_);
         TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("i"), i_));
         if (input_impl_) {
-          TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+          TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
         } else {
           TF_RETURN_IF_ERROR(
               writer->WriteScalar(full_name("input_impl_uninitialized"), ""));
@@ -157,7 +158,7 @@
           input_impl_.reset();
         }
         if (input_impl_) {
-          TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+          TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
         }
         return Status::OK();
       }
diff --git a/tensorflow/core/kernels/data/dataset_ops.cc b/tensorflow/core/kernels/data/dataset_ops.cc
index 01989a3..c71d027 100644
--- a/tensorflow/core/kernels/data/dataset_ops.cc
+++ b/tensorflow/core/kernels/data/dataset_ops.cc
@@ -32,7 +32,11 @@
     GraphDefBuilder b;
     DatasetBase::DatasetGraphDefBuilder db(&b);
     Node* input_node = nullptr;
-    OP_REQUIRES_OK(ctx, db.AddParentDataset(ctx, dataset, &input_node));
+    SerializationContext::Params params;
+    params.flib_def = ctx->function_library()->GetFunctionLibraryDefinition();
+    SerializationContext serialization_ctx(params);
+    OP_REQUIRES_OK(
+        ctx, db.AddInputDataset(&serialization_ctx, dataset, &input_node));
     GraphDef graph_def;
     OP_REQUIRES_OK(ctx, b.ToGraphDef(&graph_def));
     Tensor* result;
diff --git a/tensorflow/core/kernels/data/dense_to_sparse_batch_dataset_op.cc b/tensorflow/core/kernels/data/dense_to_sparse_batch_dataset_op.cc
index da4b14c..9770bc0 100644
--- a/tensorflow/core/kernels/data/dense_to_sparse_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/dense_to_sparse_batch_dataset_op.cc
@@ -76,11 +76,11 @@
  private:
   // TODO(mrry): Push the templated code down to the raw copying routine.
   template <class T>
-  class Dataset : public GraphDatasetBase {
+  class Dataset : public DatasetBase {
    public:
     Dataset(OpKernelContext* ctx, int64 batch_size,
             const PartialTensorShape& row_shape, const DatasetBase* input)
-        : GraphDatasetBase(ctx),
+        : DatasetBase(DatasetContext(ctx)),
           batch_size_(batch_size),
           row_shape_(row_shape),
           input_(input) {
@@ -115,10 +115,11 @@
     }
 
    protected:
-    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
                               Node** output) const override {
       Node* input_node;
-      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_node));
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_node));
       Node* batch_size_node;
       TF_RETURN_IF_ERROR(b->AddScalar(batch_size_, &batch_size_node));
       Node* row_shape_node;
@@ -273,14 +274,14 @@
      protected:
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
-        TF_RETURN_IF_ERROR(Iterator::SaveParent(writer, input_impl_));
+        TF_RETURN_IF_ERROR(Iterator::SaveInput(writer, input_impl_));
         return Status::OK();
       }
 
       Status RestoreInternal(IteratorContext* ctx,
                              IteratorStateReader* reader) override {
         mutex_lock l(mu_);
-        TF_RETURN_IF_ERROR(Iterator::RestoreParent(ctx, reader, input_impl_));
+        TF_RETURN_IF_ERROR(Iterator::RestoreInput(ctx, reader, input_impl_));
         return Status::OK();
       }
 
diff --git a/tensorflow/core/kernels/data/filter_by_component_dataset_op.cc b/tensorflow/core/kernels/data/filter_by_component_dataset_op.cc
index 8b29456..ce57739 100644
--- a/tensorflow/core/kernels/data/filter_by_component_dataset_op.cc
+++ b/tensorflow/core/kernels/data/filter_by_component_dataset_op.cc
@@ -48,12 +48,12 @@
   DataTypeVector output_types_;
   std::vector<PartialTensorShape> output_shapes_;
 
-  class Dataset : public GraphDatasetBase {
+  class Dataset : public DatasetBase {
    public:
     Dataset(OpKernelContext* ctx, const DatasetBase* input,
             const DataTypeVector& output_types,
             std::vector<PartialTensorShape> output_shapes)
-        : GraphDatasetBase(ctx),
+        : DatasetBase(DatasetContext(ctx)),
           input_(input),
           output_types_(output_types),
           output_shapes_(std::move(output_shapes)) {
@@ -80,10 +80,11 @@
     }
 
    protected:
-    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
                               Node** output) const override {
       Node* input_graph_node = nullptr;
-      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
 
       TF_RETURN_IF_ERROR(b->AddDataset(
           this, {std::make_pair(0, input_graph_node)},  // Single tensor inputs.
@@ -143,14 +144,14 @@
      protected:
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
-        TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
         return Status::OK();
       }
 
       Status RestoreInternal(IteratorContext* ctx,
                              IteratorStateReader* reader) override {
         mutex_lock l(mu_);
-        TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
         return Status::OK();
       }
 
diff --git a/tensorflow/core/kernels/data/filter_dataset_op.cc b/tensorflow/core/kernels/data/filter_dataset_op.cc
index 6d6c445..a80e102 100644
--- a/tensorflow/core/kernels/data/filter_dataset_op.cc
+++ b/tensorflow/core/kernels/data/filter_dataset_op.cc
@@ -79,12 +79,12 @@
  private:
   const int graph_def_version_;
 
-  class FilterDatasetBase : public GraphDatasetBase {
+  class FilterDatasetBase : public DatasetBase {
    public:
     FilterDatasetBase(OpKernelContext* ctx, const DatasetBase* input,
                       const NameAttrList& func,
                       std::unique_ptr<CapturedFunction> captured_func)
-        : GraphDatasetBase(ctx),
+        : DatasetBase(DatasetContext(ctx)),
           input_(input),
           func_(func),
           captured_func_(std::move(captured_func)) {
@@ -109,11 +109,12 @@
     string DebugString() const override { return "FilterDatasetOp::Dataset"; }
 
    protected:
-    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
                               Node** output) const override {
-      TF_RETURN_IF_ERROR(b->AddFunction(ctx, func_.name()));
+      TF_RETURN_IF_ERROR(b->AddFunction(ctx->flib_def(), func_.name()));
       Node* input_graph_node;
-      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
 
       DataTypeVector other_arguments_types;
       other_arguments_types.reserve(captured_func_->captured_inputs().size());
@@ -190,7 +191,7 @@
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
         if (input_impl_)
-          TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+          TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
         else
           TF_RETURN_IF_ERROR(
               writer->WriteScalar(full_name("input_impls_empty"), ""));
@@ -203,7 +204,7 @@
         if (reader->Contains(full_name("input_impls_empty")))
           input_impl_.reset();
         else
-          TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+          TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
         return Status::OK();
       }
 
diff --git a/tensorflow/core/kernels/data/flat_map_dataset_op.cc b/tensorflow/core/kernels/data/flat_map_dataset_op.cc
index baca022..07bcb9d 100644
--- a/tensorflow/core/kernels/data/flat_map_dataset_op.cc
+++ b/tensorflow/core/kernels/data/flat_map_dataset_op.cc
@@ -56,14 +56,14 @@
   }
 
  private:
-  class Dataset : public GraphDatasetBase {
+  class Dataset : public DatasetBase {
    public:
     Dataset(OpKernelContext* ctx, const DatasetBase* input,
             const NameAttrList& func,
             std::unique_ptr<CapturedFunction> captured_func,
             const DataTypeVector& output_types,
             const std::vector<PartialTensorShape>& output_shapes)
-        : GraphDatasetBase(ctx),
+        : DatasetBase(DatasetContext(ctx)),
           input_(input),
           func_(func),
           captured_func_(std::move(captured_func)),
@@ -91,11 +91,12 @@
     string DebugString() const override { return "FlatMapDatasetOp::Dataset"; }
 
    protected:
-    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
                               Node** output) const override {
-      TF_RETURN_IF_ERROR(b->AddFunction(ctx, func_.name()));
+      TF_RETURN_IF_ERROR(b->AddFunction(ctx->flib_def(), func_.name()));
       Node* input_graph_node = nullptr;
-      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
 
       DataTypeVector other_arguments_types;
       other_arguments_types.reserve(captured_func_->captured_inputs().size());
@@ -174,7 +175,7 @@
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
         if (input_impl_) {
-          TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+          TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
           TF_RETURN_IF_ERROR(
               writer->WriteScalar(full_name("element_index"), element_index_));
           if (current_element_iterator_) {
@@ -186,7 +187,7 @@
                   full_name(strings::StrCat("captured_func_inputs[", i, "]")),
                   captured_func_inputs_[i]));
             }
-            TF_RETURN_IF_ERROR(SaveParent(writer, current_element_iterator_));
+            TF_RETURN_IF_ERROR(SaveInput(writer, current_element_iterator_));
           } else {
             TF_RETURN_IF_ERROR(writer->WriteScalar(
                 full_name("current_element_iterator_uninitialized"), ""));
@@ -207,7 +208,7 @@
         if (!reader->Contains(full_name("exhausted"))) {
           TF_RETURN_IF_ERROR(
               dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
-          TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+          TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
           {
             int64 temp;
             TF_RETURN_IF_ERROR(
@@ -233,7 +234,7 @@
             element_index_--;
             TF_RETURN_IF_ERROR(BuildCurrentElementIteratorLocked(ctx));
             TF_RETURN_IF_ERROR(
-                RestoreParent(ctx, reader, current_element_iterator_));
+                RestoreInput(ctx, reader, current_element_iterator_));
           }
         }
         return Status::OK();
diff --git a/tensorflow/core/kernels/data/generator_dataset_op.cc b/tensorflow/core/kernels/data/generator_dataset_op.cc
index 0981e42..3c3d78b 100644
--- a/tensorflow/core/kernels/data/generator_dataset_op.cc
+++ b/tensorflow/core/kernels/data/generator_dataset_op.cc
@@ -15,192 +15,183 @@
 #include <iterator>
 #include <vector>
 
-#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/kernels/data/generator_dataset_op.h"
+
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/data/captured_function.h"
 #include "tensorflow/core/lib/random/random.h"
 
 namespace tensorflow {
 
-namespace {
-
 // See documentation in ../ops/dataset_ops.cc for a high-level
 // description of the following op.
 
-class GeneratorDatasetOp : public DatasetOpKernel {
+class GeneratorDatasetOp::Dataset : public DatasetBase {
  public:
-  explicit GeneratorDatasetOp(OpKernelConstruction* ctx)
-      : DatasetOpKernel(ctx) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("init_func", &init_func_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("next_func", &next_func_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("finalize_func", &finalize_func_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+  Dataset(OpKernelContext* ctx, std::unique_ptr<CapturedFunction> init_func,
+          std::unique_ptr<CapturedFunction> next_func,
+          std::unique_ptr<CapturedFunction> finalize_func,
+          const DataTypeVector& output_types,
+          const std::vector<PartialTensorShape>& output_shapes)
+      : DatasetBase(DatasetContext(ctx)),
+        init_func_(std::move(init_func)),
+        next_func_(std::move(next_func)),
+        finalize_func_(std::move(finalize_func)),
+        output_types_(output_types),
+        output_shapes_(output_shapes) {}
+
+  std::unique_ptr<IteratorBase> MakeIteratorInternal(
+      const string& prefix) const override {
+    return std::unique_ptr<IteratorBase>(
+        new Iterator({this, strings::StrCat(prefix, "::Generator")}));
   }
 
-  void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override {
-    OpInputList init_func_other_args_input;
-    OP_REQUIRES_OK(ctx, ctx->input_list("init_func_other_args",
-                                        &init_func_other_args_input));
-    std::vector<Tensor> init_func_other_args;
-    init_func_other_args.reserve(init_func_other_args_input.size());
-    for (const Tensor& t : init_func_other_args_input) {
-      init_func_other_args.push_back(t);
-    }
-    std::unique_ptr<CapturedFunction> init_func;
-    OP_REQUIRES_OK(
-        ctx, CapturedFunction::Create(
-                 init_func_, std::move(init_func_other_args), &init_func));
+  const DataTypeVector& output_dtypes() const override { return output_types_; }
 
-    OpInputList next_func_other_args_input;
-    OP_REQUIRES_OK(ctx, ctx->input_list("next_func_other_args",
-                                        &next_func_other_args_input));
-    std::vector<Tensor> next_func_other_args;
-    next_func_other_args.reserve(next_func_other_args_input.size());
-    for (const Tensor& t : next_func_other_args_input) {
-      next_func_other_args.push_back(t);
-    }
-    std::unique_ptr<CapturedFunction> next_func;
-    OP_REQUIRES_OK(
-        ctx, CapturedFunction::Create(
-                 next_func_, std::move(next_func_other_args), &next_func));
+  const std::vector<PartialTensorShape>& output_shapes() const override {
+    return output_shapes_;
+  }
 
-    OpInputList finalize_func_other_args_input;
-    OP_REQUIRES_OK(ctx, ctx->input_list("finalize_func_other_args",
-                                        &finalize_func_other_args_input));
-    std::vector<Tensor> finalize_func_other_args;
-    finalize_func_other_args.reserve(finalize_func_other_args_input.size());
-    for (const Tensor& t : finalize_func_other_args_input) {
-      finalize_func_other_args.push_back(t);
-    }
-    std::unique_ptr<CapturedFunction> finalize_func;
-    OP_REQUIRES_OK(ctx, CapturedFunction::Create(
-                            finalize_func_, std::move(finalize_func_other_args),
-                            &finalize_func));
+  string DebugString() const override { return "GeneratorDatasetOp::Dataset"; }
 
-    *output =
-        new Dataset(ctx, std::move(init_func), std::move(next_func),
-                    std::move(finalize_func), output_types_, output_shapes_);
+ protected:
+  Status AsGraphDefInternal(SerializationContext* ctx,
+                            DatasetGraphDefBuilder* b,
+                            Node** output) const override {
+    return errors::Unimplemented("%s does not support serialization",
+                                 DebugString());
   }
 
  private:
-  class Dataset : public GraphDatasetBase {
+  class Iterator : public DatasetIterator<Dataset> {
    public:
-    Dataset(OpKernelContext* ctx, std::unique_ptr<CapturedFunction> init_func,
-            std::unique_ptr<CapturedFunction> next_func,
-            std::unique_ptr<CapturedFunction> finalize_func,
-            const DataTypeVector& output_types,
-            const std::vector<PartialTensorShape>& output_shapes)
-        : GraphDatasetBase(ctx),
-          init_func_(std::move(init_func)),
-          next_func_(std::move(next_func)),
-          finalize_func_(std::move(finalize_func)),
-          output_types_(output_types),
-          output_shapes_(output_shapes) {}
+    explicit Iterator(const Params& params)
+        : DatasetIterator<Dataset>(params) {}
 
-    std::unique_ptr<IteratorBase> MakeIteratorInternal(
-        const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::Generator")}));
+    ~Iterator() override {
+      if (!finalized_) {
+        std::vector<Tensor> ignored;
+        Status s = dataset()->finalize_func_->RunInstantiated(state_, &ignored);
+        if (!s.ok()) {
+          LOG(WARNING)
+              << "Error occurred when finalizing GeneratorDataset iterator: "
+              << s;
+        }
+      }
     }
 
-    const DataTypeVector& output_dtypes() const override {
-      return output_types_;
-    }
-    const std::vector<PartialTensorShape>& output_shapes() const override {
-      return output_shapes_;
-    }
+    Status GetNextInternal(IteratorContext* ctx,
+                           std::vector<Tensor>* out_tensors,
+                           bool* end_of_sequence) override {
+      mutex_lock l(mu_);
 
-    string DebugString() const override {
-      return "GeneratorDatasetOp::Dataset";
+      if (!initialized_) {
+        TF_RETURN_IF_ERROR(
+            dataset()->init_func_->RunWithBorrowedArgs(ctx, {}, &state_));
+        // Explicitly instantiate the finalize function here so that
+        // we can invoke it in the destructor.
+        TF_RETURN_IF_ERROR(dataset()->finalize_func_->Instantiate(ctx));
+        initialized_ = true;
+      }
+
+      if (finalized_) {
+        *end_of_sequence = true;
+        return Status::OK();
+      }
+
+      Status s =
+          dataset()->next_func_->RunWithBorrowedArgs(ctx, state_, out_tensors);
+      if (s.ok()) {
+        *end_of_sequence = false;
+      } else if (errors::IsOutOfRange(s)) {
+        // `next_func` may deliberately raise `errors::OutOfRange`
+        // to indicate that we should terminate the iteration.
+        s = Status::OK();
+        *end_of_sequence = true;
+
+        // NOTE(mrry): We ignore any tensors returned by the
+        // finalize function.
+        std::vector<Tensor> ignored;
+        TF_RETURN_IF_ERROR(
+            dataset()->finalize_func_->RunInstantiated(state_, &ignored));
+        finalized_ = true;
+      }
+      return s;
     }
 
    private:
-    class Iterator : public DatasetIterator<Dataset> {
-     public:
-      explicit Iterator(const Params& params)
-          : DatasetIterator<Dataset>(params) {}
-
-      ~Iterator() override {
-        if (!finalized_) {
-          std::vector<Tensor> ignored;
-          Status s =
-              dataset()->finalize_func_->RunInstantiated(state_, &ignored);
-          if (!s.ok()) {
-            LOG(WARNING)
-                << "Error occurred when finalizing GeneratorDataset iterator: "
-                << s;
-          }
-        }
-      }
-
-      Status GetNextInternal(IteratorContext* ctx,
-                             std::vector<Tensor>* out_tensors,
-                             bool* end_of_sequence) override {
-        mutex_lock l(mu_);
-
-        if (!initialized_) {
-          TF_RETURN_IF_ERROR(
-              dataset()->init_func_->RunWithBorrowedArgs(ctx, {}, &state_));
-          // Explicitly instantiate the finalize function here so that
-          // we can invoke it in the destructor.
-          TF_RETURN_IF_ERROR(dataset()->finalize_func_->Instantiate(ctx));
-          initialized_ = true;
-        }
-
-        if (finalized_) {
-          *end_of_sequence = true;
-          return Status::OK();
-        }
-
-        Status s = dataset()->next_func_->RunWithBorrowedArgs(ctx, state_,
-                                                              out_tensors);
-        if (s.ok()) {
-          *end_of_sequence = false;
-        } else if (errors::IsOutOfRange(s)) {
-          // `next_func` may deliberately raise `errors::OutOfRange`
-          // to indicate that we should terminate the iteration.
-          s = Status::OK();
-          *end_of_sequence = true;
-
-          // NOTE(mrry): We ignore any tensors returned by the
-          // finalize function.
-          std::vector<Tensor> ignored;
-          TF_RETURN_IF_ERROR(
-              dataset()->finalize_func_->RunInstantiated(state_, &ignored));
-          finalized_ = true;
-        }
-        return s;
-      }
-
-     private:
-      mutex mu_;
-      bool initialized_ GUARDED_BY(mu_) = false;
-      bool finalized_ GUARDED_BY(mu_) = false;
-      std::vector<Tensor> state_ GUARDED_BY(mu_);
-    };
-
-    const std::unique_ptr<CapturedFunction> init_func_;
-    const std::unique_ptr<CapturedFunction> next_func_;
-    const std::unique_ptr<CapturedFunction> finalize_func_;
-    const DataTypeVector output_types_;
-    const std::vector<PartialTensorShape> output_shapes_;
+    mutex mu_;
+    bool initialized_ GUARDED_BY(mu_) = false;
+    bool finalized_ GUARDED_BY(mu_) = false;
+    std::vector<Tensor> state_ GUARDED_BY(mu_);
   };
 
-  DataTypeVector output_types_;
-  std::vector<PartialTensorShape> output_shapes_;
-  NameAttrList init_func_;
-  NameAttrList next_func_;
-  NameAttrList finalize_func_;
+  const std::unique_ptr<CapturedFunction> init_func_;
+  const std::unique_ptr<CapturedFunction> next_func_;
+  const std::unique_ptr<CapturedFunction> finalize_func_;
+  const DataTypeVector output_types_;
+  const std::vector<PartialTensorShape> output_shapes_;
 };
 
+GeneratorDatasetOp::GeneratorDatasetOp(OpKernelConstruction* ctx)
+    : DatasetOpKernel(ctx) {
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("init_func", &init_func_));
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("next_func", &next_func_));
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("finalize_func", &finalize_func_));
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+}
+
+void GeneratorDatasetOp::MakeDataset(OpKernelContext* ctx,
+                                     DatasetBase** output) {
+  OpInputList init_func_other_args_input;
+  OP_REQUIRES_OK(ctx, ctx->input_list("init_func_other_args",
+                                      &init_func_other_args_input));
+  std::vector<Tensor> init_func_other_args;
+  init_func_other_args.reserve(init_func_other_args_input.size());
+  for (const Tensor& t : init_func_other_args_input) {
+    init_func_other_args.push_back(t);
+  }
+  std::unique_ptr<CapturedFunction> init_func;
+  OP_REQUIRES_OK(
+      ctx, CapturedFunction::Create(init_func_, std::move(init_func_other_args),
+                                    &init_func));
+
+  OpInputList next_func_other_args_input;
+  OP_REQUIRES_OK(ctx, ctx->input_list("next_func_other_args",
+                                      &next_func_other_args_input));
+  std::vector<Tensor> next_func_other_args;
+  next_func_other_args.reserve(next_func_other_args_input.size());
+  for (const Tensor& t : next_func_other_args_input) {
+    next_func_other_args.push_back(t);
+  }
+  std::unique_ptr<CapturedFunction> next_func;
+  OP_REQUIRES_OK(
+      ctx, CapturedFunction::Create(next_func_, std::move(next_func_other_args),
+                                    &next_func));
+
+  OpInputList finalize_func_other_args_input;
+  OP_REQUIRES_OK(ctx, ctx->input_list("finalize_func_other_args",
+                                      &finalize_func_other_args_input));
+  std::vector<Tensor> finalize_func_other_args;
+  finalize_func_other_args.reserve(finalize_func_other_args_input.size());
+  for (const Tensor& t : finalize_func_other_args_input) {
+    finalize_func_other_args.push_back(t);
+  }
+  std::unique_ptr<CapturedFunction> finalize_func;
+  OP_REQUIRES_OK(ctx, CapturedFunction::Create(
+                          finalize_func_, std::move(finalize_func_other_args),
+                          &finalize_func));
+
+  *output =
+      new Dataset(ctx, std::move(init_func), std::move(next_func),
+                  std::move(finalize_func), output_types_, output_shapes_);
+}
+
 REGISTER_KERNEL_BUILDER(Name("GeneratorDataset").Device(DEVICE_CPU),
                         GeneratorDatasetOp);
 REGISTER_KERNEL_BUILDER(
     Name("GeneratorDataset").Device(DEVICE_GPU).HostMemory("handle"),
     GeneratorDatasetOp);
 
-}  // namespace
-
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/generator_dataset_op.h b/tensorflow/core/kernels/data/generator_dataset_op.h
new file mode 100644
index 0000000..3f84fa9
--- /dev/null
+++ b/tensorflow/core/kernels/data/generator_dataset_op.h
@@ -0,0 +1,41 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_DATA_GENERATOR_DATASET_OP_H_
+#define TENSORFLOW_CORE_KERNELS_DATA_GENERATOR_DATASET_OP_H_
+
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/kernels/data/captured_function.h"
+
+namespace tensorflow {
+
+class GeneratorDatasetOp : public DatasetOpKernel {
+ public:
+  explicit GeneratorDatasetOp(OpKernelConstruction* ctx);
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override;
+
+ private:
+  class Dataset;
+
+  DataTypeVector output_types_;
+  std::vector<PartialTensorShape> output_shapes_;
+  NameAttrList init_func_;
+  NameAttrList next_func_;
+  NameAttrList finalize_func_;
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CORE_KERNELS_DATA_GENERATOR_DATASET_OP_H_
diff --git a/tensorflow/core/kernels/data/group_by_reducer_dataset_op.cc b/tensorflow/core/kernels/data/group_by_reducer_dataset_op.cc
index 7206be8..be4132a 100644
--- a/tensorflow/core/kernels/data/group_by_reducer_dataset_op.cc
+++ b/tensorflow/core/kernels/data/group_by_reducer_dataset_op.cc
@@ -66,7 +66,7 @@
   }
 
  private:
-  class Dataset : public GraphDatasetBase {
+  class Dataset : public DatasetBase {
    public:
     Dataset(OpKernelContext* ctx, const DatasetBase* input,
             std::unique_ptr<CapturedFunction> captured_key_func,
@@ -75,7 +75,7 @@
             std::unique_ptr<CapturedFunction> captured_finalize_func,
             const DataTypeVector& output_types,
             const std::vector<PartialTensorShape>& output_shapes)
-        : GraphDatasetBase(ctx),
+        : DatasetBase(DatasetContext(ctx)),
           input_(input),
           captured_key_func_(std::move(captured_key_func)),
           captured_init_func_(std::move(captured_init_func)),
@@ -106,14 +106,16 @@
     }
 
    protected:
-    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
                               Node** output) const override {
-      TF_RETURN_IF_ERROR(b->AddFunction(ctx, key_func().name()));
-      TF_RETURN_IF_ERROR(b->AddFunction(ctx, init_func().name()));
-      TF_RETURN_IF_ERROR(b->AddFunction(ctx, reduce_func().name()));
-      TF_RETURN_IF_ERROR(b->AddFunction(ctx, finalize_func().name()));
+      TF_RETURN_IF_ERROR(b->AddFunction(ctx->flib_def(), key_func().name()));
+      TF_RETURN_IF_ERROR(b->AddFunction(ctx->flib_def(), init_func().name()));
+      TF_RETURN_IF_ERROR(b->AddFunction(ctx->flib_def(), reduce_func().name()));
+      TF_RETURN_IF_ERROR(
+          b->AddFunction(ctx->flib_def(), finalize_func().name()));
       Node* input_graph_node = nullptr;
-      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
 
       std::vector<Node*> key_func_other_arguments_node;
       DataTypeVector key_func_other_arguments_types;
@@ -261,7 +263,7 @@
      protected:
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
-        TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
 
         if (end_of_input_) {
           TF_RETURN_IF_ERROR(
@@ -311,7 +313,7 @@
       Status RestoreInternal(IteratorContext* ctx,
                              IteratorStateReader* reader) override {
         mutex_lock l(mu_);
-        TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
 
         if (reader->Contains(full_name("end_of_input"))) end_of_input_ = true;
 
diff --git a/tensorflow/core/kernels/data/group_by_window_dataset_op.cc b/tensorflow/core/kernels/data/group_by_window_dataset_op.cc
index 23d769e..288695f 100644
--- a/tensorflow/core/kernels/data/group_by_window_dataset_op.cc
+++ b/tensorflow/core/kernels/data/group_by_window_dataset_op.cc
@@ -93,7 +93,7 @@
   }
 
  private:
-  class Dataset : public GraphDatasetBase {
+  class Dataset : public DatasetBase {
    public:
     Dataset(OpKernelContext* ctx, const DatasetBase* input,
             const NameAttrList& key_func, const NameAttrList& reduce_func,
@@ -103,7 +103,7 @@
             std::unique_ptr<CapturedFunction> captured_window_size_func,
             const DataTypeVector& output_types,
             const std::vector<PartialTensorShape>& output_shapes)
-        : GraphDatasetBase(ctx),
+        : DatasetBase(DatasetContext(ctx)),
           input_(input),
           key_func_(key_func),
           reduce_func_(reduce_func),
@@ -136,13 +136,15 @@
     }
 
    protected:
-    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
                               Node** output) const override {
-      TF_RETURN_IF_ERROR(b->AddFunction(ctx, key_func_.name()));
-      TF_RETURN_IF_ERROR(b->AddFunction(ctx, reduce_func_.name()));
-      TF_RETURN_IF_ERROR(b->AddFunction(ctx, window_size_func_.name()));
+      TF_RETURN_IF_ERROR(b->AddFunction(ctx->flib_def(), key_func_.name()));
+      TF_RETURN_IF_ERROR(b->AddFunction(ctx->flib_def(), reduce_func_.name()));
+      TF_RETURN_IF_ERROR(
+          b->AddFunction(ctx->flib_def(), window_size_func_.name()));
       Node* input_graph_node = nullptr;
-      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
 
       std::vector<Node*> key_func_other_arguments_node;
       DataTypeVector key_func_other_arguments_types;
@@ -307,7 +309,7 @@
      protected:
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
-        TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
 
         if (end_of_input_) {
           TF_RETURN_IF_ERROR(
@@ -348,7 +350,7 @@
         }
 
         if (current_group_iterator_) {
-          TF_RETURN_IF_ERROR(SaveParent(writer, current_group_iterator_));
+          TF_RETURN_IF_ERROR(SaveInput(writer, current_group_iterator_));
 
           // Saving current_key_
           TF_RETURN_IF_ERROR(
@@ -364,7 +366,7 @@
       Status RestoreInternal(IteratorContext* ctx,
                              IteratorStateReader* reader) override {
         mutex_lock l(mu_);
-        TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
 
         if (reader->Contains(full_name("end_of_input"))) end_of_input_ = true;
 
@@ -412,7 +414,7 @@
           TF_RETURN_IF_ERROR(StartFlushingGroup(ctx, current_key_));
           // Restore current_group_iterator_ state
           TF_RETURN_IF_ERROR(
-              RestoreParent(ctx, reader, current_group_iterator_));
+              RestoreInput(ctx, reader, current_group_iterator_));
         }
         return Status::OK();
       }
diff --git a/tensorflow/core/kernels/data/interleave_dataset_op.cc b/tensorflow/core/kernels/data/interleave_dataset_op.cc
index 0765e63..58b79d6 100644
--- a/tensorflow/core/kernels/data/interleave_dataset_op.cc
+++ b/tensorflow/core/kernels/data/interleave_dataset_op.cc
@@ -76,14 +76,14 @@
   }
 
  private:
-  class Dataset : public GraphDatasetBase {
+  class Dataset : public DatasetBase {
    public:
     Dataset(OpKernelContext* ctx, const DatasetBase* input,
             const NameAttrList& func,
             std::unique_ptr<CapturedFunction> captured_func, int64 cycle_length,
             int64 block_length, const DataTypeVector& output_types,
             const std::vector<PartialTensorShape>& output_shapes)
-        : GraphDatasetBase(ctx),
+        : DatasetBase(DatasetContext(ctx)),
           input_(input),
           func_(func),
           captured_func_(std::move(captured_func)),
@@ -114,11 +114,12 @@
     }
 
    protected:
-    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
                               Node** output) const override {
-      TF_RETURN_IF_ERROR(b->AddFunction(ctx, func_.name()));
+      TF_RETURN_IF_ERROR(b->AddFunction(ctx->flib_def(), func_.name()));
       Node* input_node;
-      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_node));
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_node));
       Node* cycle_length_node;
       TF_RETURN_IF_ERROR(b->AddScalar(cycle_length_, &cycle_length_node));
       Node* block_length_node;
@@ -217,7 +218,7 @@
      protected:
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
-        TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
         TF_RETURN_IF_ERROR(
             writer->WriteScalar(full_name("cycle_index"), cycle_index_));
         TF_RETURN_IF_ERROR(
@@ -235,7 +236,7 @@
       Status RestoreInternal(IteratorContext* ctx,
                              IteratorStateReader* reader) override {
         mutex_lock l(mu_);
-        TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
         int64 cycle_index;
         TF_RETURN_IF_ERROR(
             reader->ReadScalar(full_name("cycle_index"), &cycle_index));
@@ -256,7 +257,7 @@
           EXCLUSIVE_LOCKS_REQUIRED(mu_) {
         for (int idx = 0; idx < current_elements_.size(); idx++) {
           if (current_elements_[idx]) {
-            TF_RETURN_IF_ERROR(SaveParent(writer, current_elements_[idx]));
+            TF_RETURN_IF_ERROR(SaveInput(writer, current_elements_[idx]));
             TF_RETURN_IF_ERROR(writer->WriteScalar(
                 full_name(strings::StrCat("args_size[", idx, "]")),
                 args_list_[idx].size()));
@@ -290,7 +291,7 @@
                 ctx, args_list_[idx], idx, dataset()->captured_func_.get(),
                 prefix(), &current_elements_[idx]));
             TF_RETURN_IF_ERROR(
-                RestoreParent(ctx, reader, current_elements_[idx]));
+                RestoreInput(ctx, reader, current_elements_[idx]));
           } else {
             current_elements_[idx].reset();
           }
diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc
index 86adbc4..61a6c06 100644
--- a/tensorflow/core/kernels/data/iterator_ops.cc
+++ b/tensorflow/core/kernels/data/iterator_ops.cc
@@ -12,7 +12,8 @@
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/kernels/data/iterator_ops.h"
+
 #include "tensorflow/core/common_runtime/graph_runner.h"
 #include "tensorflow/core/common_runtime/renamed_device.h"
 #include "tensorflow/core/common_runtime/threadpool_device.h"
@@ -23,7 +24,6 @@
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/variant_op_registry.h"
 #include "tensorflow/core/graph/graph_constructor.h"
-#include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/kernels/data/dataset_utils.h"
 #include "tensorflow/core/kernels/data/optional_ops.h"
 #include "tensorflow/core/kernels/ops_util.h"
@@ -81,6 +81,8 @@
   return Status::OK();
 }
 
+}  // namespace
+
 class IteratorResource : public ResourceBase {
  public:
   IteratorResource(const DataTypeVector& output_dtypes,
@@ -114,7 +116,7 @@
     }
   }
 
-  Status Save(OpKernelContext* ctx, IteratorStateWriter* writer) {
+  Status Save(SerializationContext* ctx, IteratorStateWriter* writer) {
     std::shared_ptr<IteratorBase> captured_iterator(iterator_);
     if (captured_iterator) {
       return captured_iterator->Save(ctx, writer);
@@ -128,7 +130,7 @@
 
   Status Restore(OpKernelContext* ctx, IteratorStateReader* reader) {
     string serialized_graph_def;
-    TF_RETURN_IF_ERROR(reader->ReadScalar(GraphDatasetBase::kDatasetGraphKey,
+    TF_RETURN_IF_ERROR(reader->ReadScalar(DatasetBase::kDatasetGraphKey,
                                           &serialized_graph_def));
     GraphDef graph_def;
     if (!graph_def.ParseFromString(serialized_graph_def)) {
@@ -136,7 +138,7 @@
     }
     string output_node;
     TF_RETURN_IF_ERROR(reader->ReadScalar(
-        GraphDatasetBase::kDatasetGraphOutputNodeKey, &output_node));
+        DatasetBase::kDatasetGraphOutputNodeKey, &output_node));
     DatasetBase* dataset = nullptr;
     Graph graph(OpRegistry::Global());
     TF_RETURN_IF_ERROR(ImportGraphDef({}, graph_def, &graph, nullptr));
@@ -159,9 +161,9 @@
         graph_runner.Run(&graph, lib, {}, {output_node}, &outputs));
     TF_RETURN_IF_ERROR(GetDatasetFromVariantTensor(outputs[0], &dataset));
 
-    IteratorContext iter_ctx = dataset::MakeIteratorContext(ctx);
     std::unique_ptr<IteratorBase> iterator;
-    TF_RETURN_IF_ERROR(dataset->MakeIterator(&iter_ctx, "Iterator", &iterator));
+    TF_RETURN_IF_ERROR(
+        dataset->MakeIterator(IteratorContext(ctx), "Iterator", &iterator));
     TF_RETURN_IF_ERROR(set_iterator(std::move(iterator)));
     std::shared_ptr<IteratorBase> captured_iterator(iterator_);
 
@@ -384,10 +386,13 @@
   // that it can be written on the next call to Encode().
   Status InitializeFromIterator(OpKernelContext* ctx,
                                 IteratorResource* iterator_resource) {
+    SerializationContext::Params params;
+    params.flib_def = ctx->function_library()->GetFunctionLibraryDefinition();
+    SerializationContext serialization_ctx(params);
     data_.reset(new VariantTensorData());
     data_->set_type_name(TypeName());
     VariantTensorDataWriter writer(data_.get());
-    TF_RETURN_IF_ERROR(iterator_resource->Save(ctx, &writer));
+    TF_RETURN_IF_ERROR(iterator_resource->Save(&serialization_ctx, &writer));
     TF_RETURN_IF_ERROR(writer.Flush());
     return Status::OK();
   }
@@ -438,300 +443,179 @@
 // Note that IteratorHandleOp holds a reference to the resource it creates. If
 // cleaning up resources with DestroyResourceOp is important, consider creating
 // resource containers with AnonymousIteratorHandleOp instead.
-class IteratorHandleOp : public OpKernel {
- public:
-  explicit IteratorHandleOp(OpKernelConstruction* ctx)
-      : OpKernel(ctx), graph_def_version_(ctx->graph_def_version()) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_dtypes_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("shared_name", &name_));
-  }
+IteratorHandleOp::IteratorHandleOp(OpKernelConstruction* ctx)
+    : OpKernel(ctx), graph_def_version_(ctx->graph_def_version()) {
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_dtypes_));
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("shared_name", &name_));
+}
 
-  // The resource is deleted from the resource manager only when it is private
-  // to kernel. Ideally the resource should be deleted when it is no longer held
-  // by anyone, but it would break backward compatibility.
-  ~IteratorHandleOp() override {
-    if (resource_ != nullptr) {
-      resource_->Unref();
-      if (cinfo_.resource_is_private_to_kernel()) {
-        if (!cinfo_.resource_manager()
-                 ->template Delete<IteratorResource>(cinfo_.container(),
-                                                     cinfo_.name())
-                 .ok()) {
-          // Do nothing; the resource can have been deleted by session resets.
-        }
+// The resource is deleted from the resource manager only when it is private
+// to kernel. Ideally the resource should be deleted when it is no longer held
+// by anyone, but it would break backward compatibility.
+IteratorHandleOp::~IteratorHandleOp() {
+  if (resource_ != nullptr) {
+    resource_->Unref();
+    if (cinfo_.resource_is_private_to_kernel()) {
+      if (!cinfo_.resource_manager()
+               ->template Delete<IteratorResource>(cinfo_.container(),
+                                                   cinfo_.name())
+               .ok()) {
+        // Do nothing; the resource can have been deleted by session resets.
       }
     }
   }
+}
 
-  void Compute(OpKernelContext* context) override LOCKS_EXCLUDED(mu_) {
-    {
-      mutex_lock l(mu_);
-      if (resource_ == nullptr) {
-        FunctionLibraryRuntime* lib;
-        std::unique_ptr<DeviceMgr> device_mgr(nullptr);
-        std::unique_ptr<FunctionLibraryDefinition> flib_def(nullptr);
-        std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(nullptr);
-        // If the iterator is shared then we construct a new FLR, and pass that
-        // in. NOTE(mrry,rohanj): In this case it is not possible to call remote
-        // functions from the iterator. We may add this functionality if there
-        // is sufficient demand, but it will require a significant refactoring.
-        if (!name_.empty()) {
-          lib = CreatePrivateFLR(context, &device_mgr, &flib_def, &pflr);
-        } else {
-          OP_REQUIRES_OK(context, context->function_library()->Clone(
-                                      &flib_def, &pflr, &lib));
-        }
-
-        ResourceMgr* mgr = context->resource_manager();
-        OP_REQUIRES_OK(context, cinfo_.Init(mgr, def()));
-
-        IteratorResource* resource;
-        OP_REQUIRES_OK(
-            context,
-            mgr->LookupOrCreate<IteratorResource>(
-                cinfo_.container(), cinfo_.name(), &resource,
-                [lib, &device_mgr, &flib_def, &pflr,
-                 this](IteratorResource** ret) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-                  *ret = new IteratorResource(
-                      output_dtypes_, output_shapes_, graph_def_version_,
-                      std::move(device_mgr), std::move(flib_def),
-                      std::move(pflr), lib);
-                  return Status::OK();
-                }));
-
-        Status s = VerifyResource(resource);
-        if (TF_PREDICT_FALSE(!s.ok())) {
-          resource->Unref();
-          context->SetStatus(s);
-          return;
-        }
-
-        resource_ = resource;
+void IteratorHandleOp::Compute(OpKernelContext* context) LOCKS_EXCLUDED(mu_) {
+  {
+    mutex_lock l(mu_);
+    if (resource_ == nullptr) {
+      FunctionLibraryRuntime* lib;
+      std::unique_ptr<DeviceMgr> device_mgr(nullptr);
+      std::unique_ptr<FunctionLibraryDefinition> flib_def(nullptr);
+      std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(nullptr);
+      // If the iterator is shared then we construct a new FLR, and pass that
+      // in. NOTE(mrry,rohanj): In this case it is not possible to call remote
+      // functions from the iterator. We may add this functionality if there
+      // is sufficient demand, but it will require a significant refactoring.
+      if (!name_.empty()) {
+        lib = CreatePrivateFLR(context, &device_mgr, &flib_def, &pflr);
+      } else {
+        OP_REQUIRES_OK(context, context->function_library()->Clone(
+                                    &flib_def, &pflr, &lib));
       }
+
+      ResourceMgr* mgr = context->resource_manager();
+      OP_REQUIRES_OK(context, cinfo_.Init(mgr, def()));
+
+      IteratorResource* resource;
+      OP_REQUIRES_OK(
+          context,
+          mgr->LookupOrCreate<IteratorResource>(
+              cinfo_.container(), cinfo_.name(), &resource,
+              [lib, &device_mgr, &flib_def, &pflr, this](IteratorResource** ret)
+                  EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+                    *ret = new IteratorResource(
+                        output_dtypes_, output_shapes_, graph_def_version_,
+                        std::move(device_mgr), std::move(flib_def),
+                        std::move(pflr), lib);
+                    return Status::OK();
+                  }));
+
+      Status s = VerifyResource(resource);
+      if (TF_PREDICT_FALSE(!s.ok())) {
+        resource->Unref();
+        context->SetStatus(s);
+        return;
+      }
+
+      resource_ = resource;
     }
-    OP_REQUIRES_OK(context, MakeResourceHandleToOutput(
-                                context, 0, cinfo_.container(), cinfo_.name(),
-                                MakeTypeIndex<IteratorResource>()));
   }
+  OP_REQUIRES_OK(context, MakeResourceHandleToOutput(
+                              context, 0, cinfo_.container(), cinfo_.name(),
+                              MakeTypeIndex<IteratorResource>()));
+}
 
- private:
-  // During the first Compute(), resource is either created or looked up using
-  // shared_name. In the latter case, the resource found should be verified if
-  // it is compatible with this op's configuration. The verification may fail in
-  // cases such as two graphs asking queues of the same shared name to have
-  // inconsistent capacities.
-  Status VerifyResource(IteratorResource* resource) {
-    TF_RETURN_IF_ERROR(
-        VerifyTypesMatch(output_dtypes_, resource->output_dtypes()));
-    TF_RETURN_IF_ERROR(
-        VerifyShapesCompatible(output_shapes_, resource->output_shapes()));
-    return Status::OK();
-  }
+Status IteratorHandleOp::VerifyResource(IteratorResource* resource) {
+  TF_RETURN_IF_ERROR(
+      VerifyTypesMatch(output_dtypes_, resource->output_dtypes()));
+  TF_RETURN_IF_ERROR(
+      VerifyShapesCompatible(output_shapes_, resource->output_shapes()));
+  return Status::OK();
+}
 
-  template <typename To, typename From>  // use like this: down_cast<T*>(foo);
-  static inline To down_cast(From* f) {  // so we only accept pointers
-    static_assert(
-        (std::is_base_of<From, typename std::remove_pointer<To>::type>::value),
-        "target type not derived from source type");
+FunctionLibraryRuntime* IteratorHandleOp::CreatePrivateFLR(
+    OpKernelContext* ctx, std::unique_ptr<DeviceMgr>* device_mgr,
+    std::unique_ptr<FunctionLibraryDefinition>* flib_def,
+    std::unique_ptr<ProcessFunctionLibraryRuntime>* pflr) {
+  // Wrap the existing device in order to see any captured resources
+  // in its resource manager. The existing device will outlive the
+  // IteratorResource, because we are storing the IteratorResource
+  // in that device's resource manager.
+  Device* wrapped_device = RenamedDevice::NewRenamedDevice(
+      ctx->device()->name(), down_cast<Device*>(ctx->device()),
+      false /* owns_underlying */, false /* isolate_session_state */);
+  device_mgr->reset(new DeviceMgr({wrapped_device}));
+  flib_def->reset(new FunctionLibraryDefinition(
+      *ctx->function_library()->GetFunctionLibraryDefinition()));
+  pflr->reset(new ProcessFunctionLibraryRuntime(
+      device_mgr->get(), ctx->env(), graph_def_version_, flib_def->get(),
+      {} /* TODO(mrry): OptimizerOptions? */,
+      nullptr /* TODO(mrry): ClusterFLR */));
 
-    // We skip the assert and hence the dynamic_cast if RTTI is disabled.
-#if !defined(__GNUC__) || defined(__GXX_RTTI)
-    // Uses RTTI in dbg and fastbuild. asserts are disabled in opt builds.
-    assert(f == nullptr || dynamic_cast<To>(f) != nullptr);
-#endif  // !defined(__GNUC__) || defined(__GXX_RTTI)
-    return static_cast<To>(f);
-  }
-
-  FunctionLibraryRuntime* CreatePrivateFLR(
-      OpKernelContext* ctx, std::unique_ptr<DeviceMgr>* device_mgr,
-      std::unique_ptr<FunctionLibraryDefinition>* flib_def,
-      std::unique_ptr<ProcessFunctionLibraryRuntime>* pflr) {
-    // Wrap the existing device in order to see any captured resources
-    // in its resource manager. The existing device will outlive the
-    // IteratorResource, because we are storing the IteratorResource
-    // in that device's resource manager.
-    Device* wrapped_device = RenamedDevice::NewRenamedDevice(
-        ctx->device()->name(), down_cast<Device*>(ctx->device()),
-        false /* owns_underlying */, false /* isolate_session_state */);
-    device_mgr->reset(new DeviceMgr({wrapped_device}));
-    flib_def->reset(new FunctionLibraryDefinition(
-        *ctx->function_library()->GetFunctionLibraryDefinition()));
-    pflr->reset(new ProcessFunctionLibraryRuntime(
-        device_mgr->get(), ctx->env(), graph_def_version_, flib_def->get(),
-        {} /* TODO(mrry): OptimizerOptions? */,
-        nullptr /* TODO(mrry): ClusterFLR */));
-
-    return (*pflr)->GetFLR(ctx->device()->name());
-  }
-
-  mutex mu_;
-  ContainerInfo cinfo_;  // Written once under mu_ then constant afterwards.
-  IteratorResource* resource_ GUARDED_BY(mu_) = nullptr;
-  DataTypeVector output_dtypes_;
-  std::vector<PartialTensorShape> output_shapes_;
-  const int graph_def_version_;
-  string name_;
-};
+  return (*pflr)->GetFLR(ctx->device()->name());
+}
 
 // Like IteratorHandleOp, but creates handles which are never shared, and does
 // not hold a reference to these handles. The latter is important for eager
 // execution, since OpKernel instances generally live as long as the program
 // running them.
-class AnonymousIteratorHandleOp : public OpKernel {
- public:
-  explicit AnonymousIteratorHandleOp(OpKernelConstruction* context)
-      : OpKernel(context), graph_def_version_(context->graph_def_version()) {
-    OP_REQUIRES_OK(context, context->GetAttr("output_types", &output_dtypes_));
-    OP_REQUIRES_OK(context, context->GetAttr("output_shapes", &output_shapes_));
-  }
+AnonymousIteratorHandleOp::AnonymousIteratorHandleOp(
+    OpKernelConstruction* context)
+    : OpKernel(context), graph_def_version_(context->graph_def_version()) {
+  OP_REQUIRES_OK(context, context->GetAttr("output_types", &output_dtypes_));
+  OP_REQUIRES_OK(context, context->GetAttr("output_shapes", &output_shapes_));
+}
 
-  void Compute(OpKernelContext* context) override {
-    FunctionLibraryRuntime* lib;
-    std::unique_ptr<DeviceMgr> device_mgr(nullptr);
-    std::unique_ptr<FunctionLibraryDefinition> flib_def(nullptr);
-    std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(nullptr);
-    OP_REQUIRES_OK(context,
-                   context->function_library()->Clone(&flib_def, &pflr, &lib));
+void AnonymousIteratorHandleOp::Compute(OpKernelContext* context) {
+  FunctionLibraryRuntime* lib;
+  std::unique_ptr<DeviceMgr> device_mgr(nullptr);
+  std::unique_ptr<FunctionLibraryDefinition> flib_def(nullptr);
+  std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(nullptr);
+  OP_REQUIRES_OK(context,
+                 context->function_library()->Clone(&flib_def, &pflr, &lib));
 
-    ResourceMgr* mgr = context->resource_manager();
+  ResourceMgr* mgr = context->resource_manager();
 
-    const string container_name = "AnonymousIterator";
-    string unique_name;
-    {
-      mutex_lock l(static_resource_lookup_mutex_);
-      while (true) {  // Find an unused name
-        IteratorResource* existing_resource = nullptr;
-        unique_name = strings::StrCat("AnonymousIterator", current_id_++);
-        Status status = mgr->Lookup<IteratorResource>(
-            container_name, unique_name, &existing_resource);
-        if (status.code() == error::NOT_FOUND) {
-          break;
-        }
-        OP_REQUIRES_OK(context, status);
-        existing_resource->Unref();
+  const string container_name = "AnonymousIterator";
+  string unique_name;
+  {
+    mutex_lock l(static_resource_lookup_mutex_);
+    while (true) {  // Find an unused name
+      IteratorResource* existing_resource = nullptr;
+      unique_name = strings::StrCat("AnonymousIterator", current_id_++);
+      Status status = mgr->Lookup<IteratorResource>(container_name, unique_name,
+                                                    &existing_resource);
+      if (status.code() == error::NOT_FOUND) {
+        break;
       }
-      IteratorResource* new_resource = new IteratorResource(
-          output_dtypes_, output_shapes_, graph_def_version_,
-          std::move(device_mgr), std::move(flib_def), std::move(pflr), lib);
-      // Create the resource with our chosen name under the resource lookup
-      // mutex to avoid another kernel racily creating a resource with this
-      // name.
-      OP_REQUIRES_OK(context, mgr->Create<IteratorResource>(
-                                  container_name, unique_name, new_resource));
+      OP_REQUIRES_OK(context, status);
+      existing_resource->Unref();
     }
-    OP_REQUIRES_OK(context, MakeResourceHandleToOutput(
-                                context, 0, container_name, unique_name,
-                                MakeTypeIndex<IteratorResource>()));
+    IteratorResource* new_resource = new IteratorResource(
+        output_dtypes_, output_shapes_, graph_def_version_,
+        std::move(device_mgr), std::move(flib_def), std::move(pflr), lib);
+    // Create the resource with our chosen name under the resource lookup
+    // mutex to avoid another kernel racily creating a resource with this
+    // name.
+    OP_REQUIRES_OK(context, mgr->Create<IteratorResource>(
+                                container_name, unique_name, new_resource));
   }
-
- private:
-  // Coordinates Iterator unique name creation across AnonymousIteratorHandleOp
-  // instances.
-  static mutex static_resource_lookup_mutex_;
-  // current_id_ is just a hint for creating unique names. If it turns out
-  // there's a collision (e.g. because another AnonymousIteratorHandleOp
-  // instance is generating handles) we'll just skip that id.
-  static int64 current_id_ GUARDED_BY(static_resource_lookup_mutex_);
-  DataTypeVector output_dtypes_;
-  std::vector<PartialTensorShape> output_shapes_;
-  const int graph_def_version_;
-};
+  OP_REQUIRES_OK(context, MakeResourceHandleToOutput(
+                              context, 0, container_name, unique_name,
+                              MakeTypeIndex<IteratorResource>()));
+}
 
 // Static initializers for AnonymousIteratorHandleOp id counting.
 mutex AnonymousIteratorHandleOp::static_resource_lookup_mutex_{
     LINKER_INITIALIZED};
 int64 AnonymousIteratorHandleOp::current_id_(0);
 
-class MakeIteratorOp : public OpKernel {
- public:
-  explicit MakeIteratorOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+void MakeIteratorOp::Compute(OpKernelContext* ctx) {
+  DatasetBase* dataset;
+  OP_REQUIRES_OK(ctx, GetDatasetFromVariantTensor(ctx->input(0), &dataset));
+  IteratorResource* iterator_resource;
+  OP_REQUIRES_OK(
+      ctx, LookupResource(ctx, HandleFromInput(ctx, 1), &iterator_resource));
+  core::ScopedUnref unref(iterator_resource);
 
-  void Compute(OpKernelContext* ctx) override {
-    DatasetBase* dataset;
-    OP_REQUIRES_OK(ctx, GetDatasetFromVariantTensor(ctx->input(0), &dataset));
-    IteratorResource* iterator_resource;
-    OP_REQUIRES_OK(
-        ctx, LookupResource(ctx, HandleFromInput(ctx, 1), &iterator_resource));
-    core::ScopedUnref unref(iterator_resource);
-
-    IteratorContext iter_ctx = dataset::MakeIteratorContext(ctx);
-    std::unique_ptr<IteratorBase> iterator;
-    OP_REQUIRES_OK(ctx,
-                   dataset->MakeIterator(&iter_ctx, "Iterator", &iterator));
-    OP_REQUIRES_OK(ctx, iterator_resource->set_iterator(std::move(iterator)));
-  }
-};
-
-// A simple background worker that executes closures asynchronously and without
-// blocking.
-//
-// A `BackgroundWorker` is used to offload blocking work from an `AsyncOpKernel`
-// to avoid blocking an executor thread that may be required by the blocking
-// work.
-//
-// NOTE(mrry): We do not use a regular `tensorflow::thread::ThreadPool` for this
-// purpose because its current implementation (in Eigen) uses a finite-length
-// queue and will block the caller when full. This can lead to deadlock under
-// heavy load. Since the number of concurrent work items in each user of a
-// `BackgroundWorker` is at most one per op invocation, the dynamic allocation
-// overhead is tolerable.
-class BackgroundWorker {
- public:
-  BackgroundWorker(Env* env, const string& name) {
-    thread_.reset(env->StartThread({} /* thread_options */, name,
-                                   [this]() { WorkerLoop(); }));
-  }
-
-  ~BackgroundWorker() {
-    {
-      mutex_lock l(mu_);
-      cancelled_ = true;
-    }
-    cond_var_.notify_one();
-    // Block until the background thread has terminated.
-    //
-    // NOTE(mrry): We explicitly free and join the thread here because
-    // `WorkerLoop()` uses other members of this object, and so we must join
-    // the thread before destroying them.
-    thread_.reset();
-  }
-
-  void Schedule(std::function<void()> work_item) {
-    {
-      mutex_lock l(mu_);
-      work_queue_.push_back(std::move(work_item));
-    }
-    cond_var_.notify_one();
-  }
-
- private:
-  void WorkerLoop() {
-    while (true) {
-      std::function<void()> work_item = nullptr;
-      {
-        mutex_lock l(mu_);
-        while (!cancelled_ && work_queue_.empty()) {
-          cond_var_.wait(l);
-        }
-        if (cancelled_) {
-          return;
-        }
-        DCHECK(!work_queue_.empty());
-        work_item = std::move(work_queue_.front());
-        work_queue_.pop_front();
-      }
-      DCHECK(work_item != nullptr);
-      work_item();
-    }
-  }
-
-  std::unique_ptr<Thread> thread_;
-  mutex mu_;
-  condition_variable cond_var_;
-  bool cancelled_ GUARDED_BY(mu_) = false;
-  std::deque<std::function<void()>> work_queue_ GUARDED_BY(mu_);
-};
+  std::unique_ptr<IteratorBase> iterator;
+  OP_REQUIRES_OK(
+      ctx, dataset->MakeIterator(IteratorContext(ctx), "Iterator", &iterator));
+  OP_REQUIRES_OK(ctx, iterator_resource->set_iterator(std::move(iterator)));
+}
 
 class ToSingleElementOp : public AsyncOpKernel {
  public:
@@ -749,11 +633,11 @@
       DatasetBase* dataset;
       OP_REQUIRES_OK_ASYNC(
           ctx, GetDatasetFromVariantTensor(ctx->input(0), &dataset), done);
-      IteratorContext iter_ctx = dataset::MakeIteratorContext(ctx);
       std::unique_ptr<IteratorBase> iterator;
       OP_REQUIRES_OK_ASYNC(
           ctx,
-          dataset->MakeIterator(&iter_ctx, "SingleElementIterator", &iterator),
+          dataset->MakeIterator(IteratorContext(ctx), "SingleElementIterator",
+                                &iterator),
           done);
 
       // NOTE(jsimsa): We must destroy the iterator before calling `done()`, to
@@ -767,8 +651,8 @@
       components.reserve(dataset->output_dtypes().size());
       bool end_of_sequence = false;
 
-      Status s =
-          raw_iterator->GetNext(&iter_ctx, &components, &end_of_sequence);
+      Status s = raw_iterator->GetNext(IteratorContext(ctx), &components,
+                                       &end_of_sequence);
       if (!s.ok()) {
         ctx->SetStatus(s);
         return;
@@ -783,8 +667,8 @@
       }
 
       components.clear();
-      Status s2 =
-          raw_iterator->GetNext(&iter_ctx, &components, &end_of_sequence);
+      Status s2 = raw_iterator->GetNext(IteratorContext(ctx), &components,
+                                        &end_of_sequence);
       if (!s2.ok()) {
         ctx->SetStatus(s2);
         return;
@@ -952,9 +836,9 @@
     // factory function.
     DatasetBase* dataset;
     TF_RETURN_IF_ERROR(GetDatasetFromVariantTensor(return_values[0], &dataset));
-    IteratorContext iter_ctx = dataset::MakeIteratorContext(ctx);
     std::unique_ptr<IteratorBase> iter;
-    TF_RETURN_IF_ERROR(dataset->MakeIterator(&iter_ctx, "Iterator", &iter));
+    TF_RETURN_IF_ERROR(
+        dataset->MakeIterator(IteratorContext(ctx), "Iterator", &iter));
     TF_RETURN_IF_ERROR((*iterator)->set_iterator(std::move(iter)));
 
     (*iterator)->Ref();
@@ -996,60 +880,47 @@
   const int graph_def_version_;
 };
 
-class IteratorGetNextOp : public AsyncOpKernel {
- public:
-  explicit IteratorGetNextOp(OpKernelConstruction* ctx)
-      : AsyncOpKernel(ctx),
-        background_worker_(ctx->env(),
-                           strings::StrCat("iterator_get_next_thread_",
-                                           SanitizeThreadSuffix(name()))) {}
+void IteratorGetNextOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) {
+  IteratorResource* iterator;
+  OP_REQUIRES_OK_ASYNC(
+      ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &iterator), done);
+  // The call to `iterator->GetNext()` may block and depend on an
+  // inter-op thread pool thread, so we issue the call from the
+  // owned thread pool.
+  background_worker_.Schedule(std::bind(
+      [ctx, iterator](DoneCallback done) {
+        std::vector<Tensor> components;
+        bool end_of_sequence = false;
 
-  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
-    IteratorResource* iterator;
-    OP_REQUIRES_OK_ASYNC(
-        ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &iterator), done);
-    // The call to `iterator->GetNext()` may block and depend on an
-    // inter-op thread pool thread, so we issue the call from the
-    // owned thread pool.
-    background_worker_.Schedule(std::bind(
-        [ctx, iterator](DoneCallback done) {
-          std::vector<Tensor> components;
-          bool end_of_sequence = false;
+        IteratorContext::Params params;
+        params.env = ctx->env();
+        params.runner = *(ctx->runner());
+        params.function_library = iterator->function_library();
+        DeviceBase* device = ctx->function_library()->device();
+        params.allocator_getter = [device](AllocatorAttributes attrs) {
+          return device->GetAllocator(attrs);
+        };
+        IteratorContext iter_ctx(std::move(params));
 
-          IteratorContext::Params params;
-          params.env = ctx->env();
-          params.runner = *(ctx->runner());
-          params.function_library = iterator->function_library();
-          DeviceBase* device = ctx->function_library()->device();
-          params.allocator_getter = [device](AllocatorAttributes attrs) {
-            return device->GetAllocator(attrs);
-          };
-          IteratorContext iter_ctx(std::move(params));
+        Status s = iterator->GetNext(&iter_ctx, &components, &end_of_sequence);
+        // NOTE(mrry): We must unref the iterator before calling `done()`, to
+        // avoid destruction races.
+        iterator->Unref();
 
-          Status s =
-              iterator->GetNext(&iter_ctx, &components, &end_of_sequence);
-          // NOTE(mrry): We must unref the iterator before calling `done()`, to
-          // avoid destruction races.
-          iterator->Unref();
-
-          if (!s.ok()) {
-            ctx->SetStatus(s);
-          } else if (end_of_sequence) {
-            ctx->SetStatus(errors::OutOfRange("End of sequence"));
-          } else {
-            for (int i = 0; i < components.size(); ++i) {
-              // TODO(mrry): Check that the shapes match the shape attrs.
-              ctx->set_output(i, components[i]);
-            }
+        if (!s.ok()) {
+          ctx->SetStatus(s);
+        } else if (end_of_sequence) {
+          ctx->SetStatus(errors::OutOfRange("End of sequence"));
+        } else {
+          for (int i = 0; i < components.size(); ++i) {
+            // TODO(mrry): Check that the shapes match the shape attrs.
+            ctx->set_output(i, components[i]);
           }
-          done();
-        },
-        std::move(done)));
-  }
-
- private:
-  BackgroundWorker background_worker_;
-};
+        }
+        done();
+      },
+      std::move(done)));
+}
 
 class IteratorGetNextSyncOp : public OpKernel {
  public:
@@ -1165,90 +1036,76 @@
   std::vector<PartialTensorShape> output_shapes_;
 };
 
-class IteratorToStringHandleOp : public OpKernel {
- public:
-  explicit IteratorToStringHandleOp(OpKernelConstruction* ctx)
-      : OpKernel(ctx) {}
+void IteratorToStringHandleOp::Compute(OpKernelContext* ctx) {
+  const Tensor& resource_handle_t = ctx->input(0);
+  OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(resource_handle_t.shape()),
+              errors::InvalidArgument("resource_handle must be a scalar"));
 
-  void Compute(OpKernelContext* ctx) override {
-    const Tensor& resource_handle_t = ctx->input(0);
-    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(resource_handle_t.shape()),
-                errors::InvalidArgument("resource_handle must be a scalar"));
+  // Validate that the handle corresponds to a real resource, and
+  // that it is an IteratorResource.
+  IteratorResource* iterator_resource;
+  OP_REQUIRES_OK(
+      ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &iterator_resource));
+  iterator_resource->Unref();
 
-    // Validate that the handle corresponds to a real resource, and
-    // that it is an IteratorResource.
-    IteratorResource* iterator_resource;
-    OP_REQUIRES_OK(
-        ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &iterator_resource));
-    iterator_resource->Unref();
+  Tensor* string_handle_t;
+  OP_REQUIRES_OK(ctx,
+                 ctx->allocate_output(0, TensorShape({}), &string_handle_t));
+  string_handle_t->scalar<string>()() =
+      resource_handle_t.scalar<ResourceHandle>()().SerializeAsString();
+}
 
-    Tensor* string_handle_t;
+IteratorFromStringHandleOp::IteratorFromStringHandleOp(
+    OpKernelConstruction* ctx)
+    : OpKernel(ctx) {
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_dtypes_));
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+  OP_REQUIRES(
+      ctx,
+      output_dtypes_.empty() || output_shapes_.empty() ||
+          output_dtypes_.size() == output_shapes_.size(),
+      errors::InvalidArgument("If both 'output_types' and 'output_shapes' "
+                              "are set, they must have the same length."));
+}
+
+void IteratorFromStringHandleOp::Compute(OpKernelContext* ctx) {
+  const Tensor& string_handle_t = ctx->input(0);
+  OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(string_handle_t.shape()),
+              errors::InvalidArgument("string_handle must be a scalar"));
+
+  ResourceHandle resource_handle;
+  OP_REQUIRES(
+      ctx, resource_handle.ParseFromString(string_handle_t.scalar<string>()()),
+      errors::InvalidArgument(
+          "Could not parse string_handle as a valid ResourceHandle"));
+
+  OP_REQUIRES(
+      ctx, resource_handle.device() == ctx->device()->attributes().name(),
+      errors::InvalidArgument("Attempted create an iterator on device \"",
+                              ctx->device()->attributes().name(),
+                              "\" from handle defined on device \"",
+                              resource_handle.device(), "\""));
+
+  // Validate that the handle corresponds to a real resource, and
+  // that it is an IteratorResource.
+  IteratorResource* iterator_resource;
+  OP_REQUIRES_OK(ctx, LookupResource(ctx, resource_handle, &iterator_resource));
+  core::ScopedUnref unref_iterator(iterator_resource);
+  if (!output_dtypes_.empty()) {
+    OP_REQUIRES_OK(ctx, VerifyTypesMatch(output_dtypes_,
+                                         iterator_resource->output_dtypes()));
+  }
+  if (!output_shapes_.empty()) {
     OP_REQUIRES_OK(ctx,
-                   ctx->allocate_output(0, TensorShape({}), &string_handle_t));
-    string_handle_t->scalar<string>()() =
-        resource_handle_t.scalar<ResourceHandle>()().SerializeAsString();
-  }
-};
-
-class IteratorFromStringHandleOp : public OpKernel {
- public:
-  explicit IteratorFromStringHandleOp(OpKernelConstruction* ctx)
-      : OpKernel(ctx) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_dtypes_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
-    OP_REQUIRES(
-        ctx,
-        output_dtypes_.empty() || output_shapes_.empty() ||
-            output_dtypes_.size() == output_shapes_.size(),
-        errors::InvalidArgument("If both 'output_types' and 'output_shapes' "
-                                "are set, they must have the same length."));
+                   VerifyShapesCompatible(output_shapes_,
+                                          iterator_resource->output_shapes()));
   }
 
-  void Compute(OpKernelContext* ctx) override {
-    const Tensor& string_handle_t = ctx->input(0);
-    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(string_handle_t.shape()),
-                errors::InvalidArgument("string_handle must be a scalar"));
-
-    ResourceHandle resource_handle;
-    OP_REQUIRES(
-        ctx,
-        resource_handle.ParseFromString(string_handle_t.scalar<string>()()),
-        errors::InvalidArgument(
-            "Could not parse string_handle as a valid ResourceHandle"));
-
-    OP_REQUIRES(
-        ctx, resource_handle.device() == ctx->device()->attributes().name(),
-        errors::InvalidArgument("Attempted create an iterator on device \"",
-                                ctx->device()->attributes().name(),
-                                "\" from handle defined on device \"",
-                                resource_handle.device(), "\""));
-
-    // Validate that the handle corresponds to a real resource, and
-    // that it is an IteratorResource.
-    IteratorResource* iterator_resource;
-    OP_REQUIRES_OK(ctx,
-                   LookupResource(ctx, resource_handle, &iterator_resource));
-    core::ScopedUnref unref_iterator(iterator_resource);
-    if (!output_dtypes_.empty()) {
-      OP_REQUIRES_OK(ctx, VerifyTypesMatch(output_dtypes_,
-                                           iterator_resource->output_dtypes()));
-    }
-    if (!output_shapes_.empty()) {
-      OP_REQUIRES_OK(
-          ctx, VerifyShapesCompatible(output_shapes_,
-                                      iterator_resource->output_shapes()));
-    }
-
-    Tensor* resource_handle_t;
-    OP_REQUIRES_OK(
-        ctx, ctx->allocate_output(0, TensorShape({}), &resource_handle_t));
-    resource_handle_t->scalar<ResourceHandle>()() = resource_handle;
-  }
-
- private:
-  DataTypeVector output_dtypes_;
-  std::vector<PartialTensorShape> output_shapes_;
-};
+  Tensor* resource_handle_t;
+  OP_REQUIRES_OK(ctx,
+                 ctx->allocate_output(0, TensorShape({}), &resource_handle_t));
+  resource_handle_t->scalar<ResourceHandle>()() = resource_handle;
+}
 
 class SerializeIteratorOp : public OpKernel {
  public:
@@ -1344,6 +1201,4 @@
 REGISTER_KERNEL_BUILDER(Name("DeserializeIterator").Device(DEVICE_CPU),
                         DeserializeIteratorOp);
 
-}  // namespace
-
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/iterator_ops.h b/tensorflow/core/kernels/data/iterator_ops.h
new file mode 100644
index 0000000..e426feb
--- /dev/null
+++ b/tensorflow/core/kernels/data/iterator_ops.h
@@ -0,0 +1,140 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_DATA_ITERATOR_OPS_H_
+#define TENSORFLOW_CORE_KERNELS_DATA_ITERATOR_OPS_H_
+
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/kernels/ops_util.h"
+
+namespace tensorflow {
+
+class IteratorResource;
+
+class IteratorHandleOp : public OpKernel {
+ public:
+  explicit IteratorHandleOp(OpKernelConstruction* ctx);
+
+  // The resource is deleted from the resource manager only when it is private
+  // to kernel. Ideally the resource should be deleted when it is no longer held
+  // by anyone, but it would break backward compatibility.
+  ~IteratorHandleOp() override;
+
+  void Compute(OpKernelContext* context) override LOCKS_EXCLUDED(mu_);
+
+ private:
+  // During the first Compute(), resource is either created or looked up using
+  // shared_name. In the latter case, the resource found should be verified if
+  // it is compatible with this op's configuration. The verification may fail in
+  // cases such as two graphs asking queues of the same shared name to have
+  // inconsistent capacities.
+  Status VerifyResource(IteratorResource* resource);
+
+  template <typename To, typename From>  // use like this: down_cast<T*>(foo);
+  static inline To down_cast(From* f) {  // so we only accept pointers
+    static_assert(
+        (std::is_base_of<From, typename std::remove_pointer<To>::type>::value),
+        "target type not derived from source type");
+
+    // We skip the assert and hence the dynamic_cast if RTTI is disabled.
+#if !defined(__GNUC__) || defined(__GXX_RTTI)
+    // Uses RTTI in dbg and fastbuild. asserts are disabled in opt builds.
+    assert(f == nullptr || dynamic_cast<To>(f) != nullptr);
+#endif  // !defined(__GNUC__) || defined(__GXX_RTTI)
+    return static_cast<To>(f);
+  }
+
+  FunctionLibraryRuntime* CreatePrivateFLR(
+      OpKernelContext* ctx, std::unique_ptr<DeviceMgr>* device_mgr,
+      std::unique_ptr<FunctionLibraryDefinition>* flib_def,
+      std::unique_ptr<ProcessFunctionLibraryRuntime>* pflr);
+
+  mutex mu_;
+  ContainerInfo cinfo_;  // Written once under mu_ then constant afterwards.
+  IteratorResource* resource_ GUARDED_BY(mu_) = nullptr;
+  DataTypeVector output_dtypes_;
+  std::vector<PartialTensorShape> output_shapes_;
+  const int graph_def_version_;
+  string name_;
+};
+
+// Like IteratorHandleOp, but creates handles which are never shared, and does
+// not hold a reference to these handles. The latter is important for eager
+// execution, since OpKernel instances generally live as long as the program
+// running them.
+class AnonymousIteratorHandleOp : public OpKernel {
+ public:
+  explicit AnonymousIteratorHandleOp(OpKernelConstruction* context);
+
+  void Compute(OpKernelContext* context) override;
+
+ private:
+  // Coordinates Iterator unique name creation across AnonymousIteratorHandleOp
+  // instances.
+  static mutex static_resource_lookup_mutex_;
+  // current_id_ is just a hint for creating unique names. If it turns out
+  // there's a collision (e.g. because another AnonymousIteratorHandleOp
+  // instance is generating handles) we'll just skip that id.
+  static int64 current_id_ GUARDED_BY(static_resource_lookup_mutex_);
+  DataTypeVector output_dtypes_;
+  std::vector<PartialTensorShape> output_shapes_;
+  const int graph_def_version_;
+};
+
+class MakeIteratorOp : public OpKernel {
+ public:
+  explicit MakeIteratorOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override;
+};
+
+class IteratorGetNextOp : public AsyncOpKernel {
+ public:
+  explicit IteratorGetNextOp(OpKernelConstruction* ctx)
+      : AsyncOpKernel(ctx),
+        background_worker_(ctx->env(),
+                           strings::StrCat("iterator_get_next_thread_",
+                                           SanitizeThreadSuffix(name()))) {}
+
+  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override;
+
+ private:
+  BackgroundWorker background_worker_;
+};
+
+class IteratorToStringHandleOp : public OpKernel {
+ public:
+  explicit IteratorToStringHandleOp(OpKernelConstruction* ctx)
+      : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override;
+};
+
+class IteratorFromStringHandleOp : public OpKernel {
+ public:
+  explicit IteratorFromStringHandleOp(OpKernelConstruction* ctx);
+
+  void Compute(OpKernelContext* ctx) override;
+
+ private:
+  DataTypeVector output_dtypes_;
+  std::vector<PartialTensorShape> output_shapes_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_DATA_ITERATOR_OPS_H_
diff --git a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
index 004f153..0e17011 100644
--- a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
@@ -101,7 +101,7 @@
   }
 
  private:
-  class Dataset : public GraphDatasetBase {
+  class Dataset : public DatasetBase {
    public:
     Dataset(OpKernelContext* ctx, const DatasetBase* input, int64 batch_size,
             int64 num_parallel_calls, bool drop_remainder,
@@ -110,7 +110,7 @@
             const NameAttrList& func,
             std::unique_ptr<CapturedFunction> captured_func,
             const Eigen::ThreadPoolDevice* device)
-        : GraphDatasetBase(ctx),
+        : DatasetBase(DatasetContext(ctx)),
           input_(input),
           batch_size_(batch_size),
           num_parallel_calls_(num_parallel_calls),
@@ -144,11 +144,12 @@
     }
 
    protected:
-    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
                               Node** output) const override {
-      TF_RETURN_IF_ERROR(b->AddFunction(ctx, map_fn_.name()));
+      TF_RETURN_IF_ERROR(b->AddFunction(ctx->flib_def(), map_fn_.name()));
       Node* input_graph_node = nullptr;
-      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
       Node* batch_size_node;
       TF_RETURN_IF_ERROR(b->AddScalar(batch_size_, &batch_size_node));
       Node* num_parallel_calls_node;
@@ -232,7 +233,7 @@
           cond_var_.wait(l);
         }
         CHECK_EQ(num_calls_, 0);
-        TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
         TF_RETURN_IF_ERROR(
             writer->WriteScalar(full_name("call_counter"), call_counter_));
         TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("batch_results_size"),
@@ -246,7 +247,7 @@
       Status RestoreInternal(IteratorContext* ctx,
                              IteratorStateReader* reader) override {
         mutex_lock l(mu_);
-        TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
         TF_RETURN_IF_ERROR(
             reader->ReadScalar(full_name("call_counter"), &call_counter_));
         int64 batch_results_size;
@@ -383,7 +384,7 @@
 #undef HANDLE_TYPE
           default:
             return errors::InvalidArgument("Unsupported data type: ",
-                                           value.dtype());
+                                           DataTypeString(value.dtype()));
         }
         return Status::OK();
       }
diff --git a/tensorflow/core/kernels/data/map_dataset_op.cc b/tensorflow/core/kernels/data/map_dataset_op.cc
index aa530ae..294fb1c 100644
--- a/tensorflow/core/kernels/data/map_dataset_op.cc
+++ b/tensorflow/core/kernels/data/map_dataset_op.cc
@@ -55,14 +55,14 @@
   }
 
  private:
-  class Dataset : public GraphDatasetBase {
+  class Dataset : public DatasetBase {
    public:
     Dataset(OpKernelContext* ctx, const DatasetBase* input,
             const NameAttrList& func,
             std::unique_ptr<CapturedFunction> captured_func,
             const DataTypeVector& output_types,
             const std::vector<PartialTensorShape>& output_shapes)
-        : GraphDatasetBase(ctx),
+        : DatasetBase(DatasetContext(ctx)),
           input_(input),
           func_(func),
           captured_func_(std::move(captured_func)),
@@ -89,11 +89,12 @@
     string DebugString() const override { return "MapDatasetOp::Dataset"; }
 
    protected:
-    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
                               Node** output) const override {
-      TF_RETURN_IF_ERROR(b->AddFunction(ctx, func_.name()));
+      TF_RETURN_IF_ERROR(b->AddFunction(ctx->flib_def(), func_.name()));
       Node* input_graph_node = nullptr;
-      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
 
       DataTypeVector other_arguments_types;
       other_arguments_types.reserve(captured_func_->captured_inputs().size());
@@ -159,13 +160,13 @@
 
      protected:
       Status SaveInternal(IteratorStateWriter* writer) override {
-        TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
         return Status::OK();
       }
 
       Status RestoreInternal(IteratorContext* ctx,
                              IteratorStateReader* reader) override {
-        TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
         return Status::OK();
       }
 
diff --git a/tensorflow/core/kernels/data/map_defun_op.cc b/tensorflow/core/kernels/data/map_defun_op.cc
new file mode 100644
index 0000000..d66716e
--- /dev/null
+++ b/tensorflow/core/kernels/data/map_defun_op.cc
@@ -0,0 +1,192 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_util.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/util/batch_util.h"
+#include "tensorflow/core/util/reffed_status_callback.h"
+
+namespace tensorflow {
+namespace {
+
+void SetRunOptions(OpKernelContext* ctx, FunctionLibraryRuntime::Options* opts,
+                   bool always_collect_stats) {
+  opts->step_id = ctx->step_id();
+  opts->rendezvous = ctx->rendezvous();
+  opts->cancellation_manager = ctx->cancellation_manager();
+  if (always_collect_stats) {
+    opts->stats_collector = ctx->stats_collector();
+  }
+  opts->runner = ctx->runner();
+}
+
+class MapDefunOp : public AsyncOpKernel {
+ public:
+  explicit MapDefunOp(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
+    auto func_lib = ctx->function_library();
+    OP_REQUIRES(ctx, func_lib != nullptr,
+                errors::Internal("No function library."));
+    const NameAttrList* func;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &func));
+    OP_REQUIRES_OK(ctx,
+                   func_lib->Instantiate(func->name(), AttrSlice(&func->attr()),
+                                         &func_handle_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+
+    OP_REQUIRES(ctx, ctx->num_inputs() >= 0,
+                errors::InvalidArgument("Must have at least one input."));
+    OP_REQUIRES(ctx, ctx->num_outputs() >= 0,
+                errors::InvalidArgument("Must have at least one output."));
+    OP_REQUIRES(ctx, ctx->num_outputs() == output_shapes_.size(),
+                errors::InvalidArgument(
+                    "Length of output_shapes and output_types must match."));
+  }
+
+  ~MapDefunOp() override {}
+
+  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
+    int64 batch_size = ctx->input(0).dim_size(0);
+    // Inputs
+    auto* args = new std::vector<Tensor>;
+    auto* arg_shapes = new std::vector<TensorShape>;
+    arg_shapes->reserve(ctx->num_inputs());
+    args->reserve(ctx->num_inputs());
+
+    for (size_t i = 0; i < ctx->num_inputs(); ++i) {
+      args->push_back(ctx->input(i));
+      arg_shapes->push_back(ctx->input(i).shape());
+      arg_shapes->at(i).RemoveDim(0);  // Remove the first batch dimension
+      OP_REQUIRES_ASYNC(
+          ctx, batch_size == ctx->input(i).dim_size(0),
+          errors::InvalidArgument("All inputs must have the same dimension 0."),
+          done);
+    }
+
+    // Outputs
+    auto* output = new OpOutputList;
+    OP_REQUIRES_OK_ASYNC(ctx, ctx->output_list("output", output), done);
+
+    for (size_t i = 0; i < output_types().size(); ++i) {
+      Tensor* out = nullptr;
+      TensorShape output_shape = output_shapes_.at(i);
+      output_shape.InsertDim(0, batch_size);
+      OP_REQUIRES_OK_ASYNC(ctx, output->allocate(i, output_shape, &out), done);
+    }
+
+    SetRunOptions(ctx, &opts_, false);
+
+    // Run loop
+    StatusCallback callback = std::bind(
+        [](OpKernelContext* ctx, std::vector<Tensor>* args,
+           std::vector<TensorShape>* arg_shapes, OpOutputList* output,
+           DoneCallback& done, const Status& status) {
+          delete args;
+          delete arg_shapes;
+          delete output;
+          ctx->SetStatus(status);
+          done();
+        },
+        ctx, args, arg_shapes, output, std::move(done), std::placeholders::_1);
+
+    auto* refcounted = new ReffedStatusCallback(std::move(callback));
+
+    for (size_t i = 1; i < static_cast<size_t>(batch_size); ++i) {
+      // Start from i = 1 because refcounted is initialized with refcount = 1
+      refcounted->Ref();
+    }
+    for (size_t i = 0; i < static_cast<size_t>(batch_size); ++i) {
+      auto* call_frame =
+          new MapFunctionCallFrame(*args, *arg_shapes, output, this, i);
+      ctx->function_library()->Run(
+          opts_, func_handle_, call_frame,
+          [call_frame, refcounted](const Status& func_status) {
+            delete call_frame;
+            refcounted->UpdateStatus(func_status);
+            refcounted->Unref();
+          });
+    }
+  }
+
+ private:
+  FunctionLibraryRuntime::Handle func_handle_;
+  FunctionLibraryRuntime::Options opts_;
+  std::vector<TensorShape> output_shapes_;
+
+  class MapFunctionCallFrame : public CallFrameInterface {
+   public:
+    MapFunctionCallFrame(const std::vector<Tensor>& args,
+                         const std::vector<TensorShape>& arg_shapes,
+                         OpOutputList* output, OpKernel* kernel, size_t iter)
+        : args_(args),
+          arg_shapes_(arg_shapes),
+          output_(output),
+          kernel_(kernel),
+          iter_(iter) {}
+
+    ~MapFunctionCallFrame() override {}
+
+    size_t num_args() const override { return args_.size(); }
+    size_t num_retvals() const override {
+      return static_cast<size_t>(kernel_->num_outputs());
+    }
+
+    Status GetArg(int index, Tensor* val) const override {
+      if (index < 0 || index >= args_.size()) {
+        return errors::InvalidArgument(
+            "Mismatch in number of function inputs.");
+      }
+      bool result = val->CopyFrom(args_.at(index).Slice(iter_, iter_ + 1),
+                                  arg_shapes_.at(index));
+      if (!result) {
+        return errors::Internal("GetArg failed.");
+      } else if (!val->IsAligned()) {
+        // Ensure alignment
+        *val = tensor::DeepCopy(*val);
+      }
+
+      return Status::OK();
+    }
+
+    Status SetRetval(int index, const Tensor& val) override {
+      if (index < 0 || index >= kernel_->num_outputs()) {
+        return errors::InvalidArgument(
+            "Mismatch in number of function outputs.");
+      }
+
+      if (val.dtype() != kernel_->output_type(index)) {
+        return errors::InvalidArgument(
+            "Mismatch in function return type and expected output type for "
+            "output: ",
+            index);
+      }
+      return batch_util::CopyElementToSlice(val, (*output_)[index], iter_);
+    }
+
+   private:
+    const std::vector<Tensor>& args_;
+    const std::vector<TensorShape>& arg_shapes_;
+    OpOutputList* output_;
+    const OpKernel* kernel_;
+    const size_t iter_;
+  };
+};  // namespace
+
+REGISTER_KERNEL_BUILDER(Name("MapDefun").Device(DEVICE_CPU), MapDefunOp);
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/optimize_dataset_op.cc b/tensorflow/core/kernels/data/optimize_dataset_op.cc
index 276f5f8..b097598 100644
--- a/tensorflow/core/kernels/data/optimize_dataset_op.cc
+++ b/tensorflow/core/kernels/data/optimize_dataset_op.cc
@@ -59,13 +59,13 @@
   }
 
  private:
-  class Dataset : public GraphDatasetBase {
+  class Dataset : public DatasetBase {
    public:
     Dataset(OpKernelContext* ctx, const DatasetBase* input,
             const std::vector<string>& optimizations,
             const DataTypeVector& output_types,
             const std::vector<PartialTensorShape>& output_shapes)
-        : GraphDatasetBase(ctx),
+        : DatasetBase(DatasetContext(ctx)),
           input_(input),
           optimizations_(optimizations),
           output_types_(output_types),
@@ -80,15 +80,22 @@
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::Optimize")}));
+      // We do not add a token for the optimization dataset to the prefix. The
+      // prefix is used to identify checkpoint elements and since the
+      // optimization dataset is excluded from the checkpoint, adding a token
+      // here would result in invalid checkpoint identifiers.
+      return std::unique_ptr<IteratorBase>(new Iterator({this, prefix}));
     }
 
     Status Optimize(OpKernelContext* ctx) {
       GraphDefBuilder b;
       DatasetGraphDefBuilder db(&b);
       Node* input_node = nullptr;
-      TF_RETURN_IF_ERROR(db.AddParentDataset(ctx, input_, &input_node));
+      SerializationContext::Params params;
+      params.flib_def = ctx->function_library()->GetFunctionLibraryDefinition();
+      SerializationContext serialization_ctx(params);
+      TF_RETURN_IF_ERROR(
+          db.AddInputDataset(&serialization_ctx, input_, &input_node));
       string output_node = input_node->name();
       GraphDef graph_def;
       TF_RETURN_IF_ERROR(b.ToGraphDef(&graph_def));
@@ -119,14 +126,12 @@
     string DebugString() const override { return "OptimizeDatasetOp::Dataset"; }
 
    protected:
-    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
                               Node** output) const override {
-      Node* input_graph_node = nullptr;
-      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
-      Node* optimizations_node = nullptr;
-      TF_RETURN_IF_ERROR(b->AddVector(optimizations_, &optimizations_node));
-      TF_RETURN_IF_ERROR(
-          b->AddDataset(this, {input_graph_node, optimizations_node}, output));
+      // We only serialize the optimized dataset to avoid re-running
+      // optimizations when the input pipeline is restored from a checkpoint.
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, optimized_input_, output));
       return Status::OK();
     }
 
@@ -157,13 +162,13 @@
 
      protected:
       Status SaveInternal(IteratorStateWriter* writer) override {
-        TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
         return Status::OK();
       }
 
       Status RestoreInternal(IteratorContext* ctx,
                              IteratorStateReader* reader) override {
-        TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
         return Status::OK();
       }
 
diff --git a/tensorflow/core/kernels/data/padded_batch_dataset_op.cc b/tensorflow/core/kernels/data/padded_batch_dataset_op.cc
index 59cbdb6..be45eac 100644
--- a/tensorflow/core/kernels/data/padded_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/padded_batch_dataset_op.cc
@@ -98,12 +98,12 @@
   }
 
  private:
-  class Dataset : public GraphDatasetBase {
+  class Dataset : public DatasetBase {
    public:
     Dataset(OpKernelContext* ctx, int64 batch_size, bool drop_remainder,
             std::vector<PartialTensorShape> padded_shapes,
             std::vector<Tensor> padding_values, const DatasetBase* input)
-        : GraphDatasetBase(ctx),
+        : DatasetBase(DatasetContext(ctx)),
           batch_size_(batch_size),
           drop_remainder_(drop_remainder),
           padded_shapes_(std::move(padded_shapes)),
@@ -153,10 +153,11 @@
     }
 
    protected:
-    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
                               Node** output) const override {
       Node* input_graph_node = nullptr;
-      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
       Node* batch_size = nullptr;
       TF_RETURN_IF_ERROR(b->AddScalar(batch_size_, &batch_size));
 
@@ -339,7 +340,7 @@
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
         if (input_impl_)
-          TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+          TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
         else
           TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("exhausted"), ""));
         return Status::OK();
@@ -353,7 +354,7 @@
         } else {
           TF_RETURN_IF_ERROR(
               dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
-          TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+          TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
         }
         return Status::OK();
       }
diff --git a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
index 6292b45..e492a82 100644
--- a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
+++ b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
@@ -92,7 +92,7 @@
   }
 
  private:
-  class Dataset : public GraphDatasetBase {
+  class Dataset : public DatasetBase {
    public:
     Dataset(OpKernelContext* ctx, const DatasetBase* input,
             const NameAttrList& func,
@@ -100,7 +100,7 @@
             int64 block_length, bool sloppy, int64 buffer_output_elements,
             int64 prefetch_input_elements, const DataTypeVector& output_types,
             const std::vector<PartialTensorShape>& output_shapes)
-        : GraphDatasetBase(ctx),
+        : DatasetBase(DatasetContext(ctx)),
           input_(input),
           interleave_func_(func),
           captured_func_(std::move(captured_func)),
@@ -134,11 +134,13 @@
     }
 
    protected:
-    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
                               Node** output) const override {
-      TF_RETURN_IF_ERROR(b->AddFunction(ctx, interleave_func_.name()));
+      TF_RETURN_IF_ERROR(
+          b->AddFunction(ctx->flib_def(), interleave_func_.name()));
       Node* input_node;
-      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_node));
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_node));
       Node* cycle_length_node;
       TF_RETURN_IF_ERROR(b->AddScalar(cycle_length_, &cycle_length_node));
       Node* block_length_node;
@@ -358,7 +360,7 @@
         mutex_lock l(mu_);
         mutex_lock ckpt_l(ckpt_mu_);
         if (input_impl_) {
-          TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+          TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
         } else {
           TF_RETURN_IF_ERROR(
               writer->WriteScalar(full_name("input_exhausted"), ""));
@@ -402,7 +404,7 @@
         mutex_lock l(mu_);
         mutex_lock ckpt_l(ckpt_mu_);
         if (!reader->Contains(full_name("input_exhausted"))) {
-          TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+          TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
         } else {
           input_impl_.reset();
         }
@@ -858,7 +860,7 @@
         string prefix = strings::StrCat("worker_thread_", index);
         if (worker_thread_states_[index].iterator != nullptr) {
           TF_RETURN_IF_ERROR(
-              SaveParent(writer, worker_thread_states_[index].iterator));
+              SaveInput(writer, worker_thread_states_[index].iterator));
         } else {
           TF_RETURN_IF_ERROR(writer->WriteScalar(
               full_name(strings::StrCat(prefix, "_iterator_exhausted")), ""));
@@ -909,7 +911,7 @@
           Status s = dataset::MakeIteratorFromInputElement(
               ctx, worker_thread_states_[index].input, index,
               dataset()->captured_func_.get(), prefix(), &iterator);
-          TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, iterator));
+          TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, iterator));
           worker_thread_states_[index].iterator.swap(iterator);
         }
         TF_RETURN_IF_ERROR(ReadStatusLocked(
diff --git a/tensorflow/core/kernels/data/parallel_map_dataset_op.cc b/tensorflow/core/kernels/data/parallel_map_dataset_op.cc
index b736b33..a407abf 100644
--- a/tensorflow/core/kernels/data/parallel_map_dataset_op.cc
+++ b/tensorflow/core/kernels/data/parallel_map_dataset_op.cc
@@ -67,14 +67,14 @@
   }
 
  private:
-  class Dataset : public GraphDatasetBase {
+  class Dataset : public DatasetBase {
    public:
     Dataset(OpKernelContext* ctx, const DatasetBase* input,
             const NameAttrList& func, int32 num_parallel_calls,
             const DataTypeVector& output_types,
             const std::vector<PartialTensorShape>& output_shapes,
             std::unique_ptr<CapturedFunction> captured_func)
-        : GraphDatasetBase(ctx),
+        : DatasetBase(DatasetContext(ctx)),
           input_(input),
           func_(func),
           num_parallel_calls_(num_parallel_calls),
@@ -113,11 +113,12 @@
     }
 
    protected:
-    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
                               Node** output) const override {
       // Input: input_dataset
       Node* input_graph_node = nullptr;
-      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
 
       // Input: other_arguments
       DataTypeVector other_arguments_types;
@@ -137,7 +138,7 @@
           b->AddScalar(num_parallel_calls_, &num_parallel_calls));
 
       // Attr: f
-      TF_RETURN_IF_ERROR(b->AddFunction(ctx, func_.name()));
+      TF_RETURN_IF_ERROR(b->AddFunction(ctx->flib_def(), func_.name()));
       AttrValue f;
       b->BuildAttrValue(func_, &f);
 
diff --git a/tensorflow/core/kernels/data/parallel_map_iterator.cc b/tensorflow/core/kernels/data/parallel_map_iterator.cc
index 10549df..4d32b71 100644
--- a/tensorflow/core/kernels/data/parallel_map_iterator.cc
+++ b/tensorflow/core/kernels/data/parallel_map_iterator.cc
@@ -78,7 +78,7 @@
       cond_var_.wait(l);
     }
     CHECK_EQ(num_calls_, 0);
-    TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+    TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
     TF_RETURN_IF_ERROR(
         writer->WriteScalar(full_name("invocation_results.size"),
                             invocation_results_.size()));
@@ -107,7 +107,7 @@
   Status RestoreInternal(IteratorContext* ctx,
                          IteratorStateReader* reader) override {
     mutex_lock l(mu_);
-    TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+    TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
     int64 invocation_results_size;
     TF_RETURN_IF_ERROR(reader->ReadScalar(
         full_name("invocation_results.size"), &invocation_results_size));
diff --git a/tensorflow/core/kernels/data/prefetch_dataset_op.cc b/tensorflow/core/kernels/data/prefetch_dataset_op.cc
index cc16108..50efbcb 100644
--- a/tensorflow/core/kernels/data/prefetch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/prefetch_dataset_op.cc
@@ -14,347 +14,338 @@
 ==============================================================================*/
 #include <deque>
 
+#include "tensorflow/core/kernels/data/prefetch_dataset_op.h"
+
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/data/dataset.h"
-#include "tensorflow/core/kernels/data/prefetch_autotuner.h"
 #include "tensorflow/core/lib/core/error_codes.pb.h"
 
 namespace tensorflow {
 
-namespace {
-
 // See documentation in ../ops/dataset_ops.cc for a high-level
 // description of the following op.
 
-class PrefetchDatasetOp : public UnaryDatasetOpKernel {
+class PrefetchDatasetOp::Dataset : public DatasetBase {
  public:
-  explicit PrefetchDatasetOp(OpKernelConstruction* ctx)
-      : UnaryDatasetOpKernel(ctx) {}
+  Dataset(OpKernelContext* ctx, const DatasetBase* input, int64 buffer_size)
+      : DatasetBase(DatasetContext(ctx)),
+        input_(input),
+        buffer_size_(buffer_size) {
+    input_->Ref();
+  }
+
+  ~Dataset() override { input_->Unref(); }
+
+  std::unique_ptr<IteratorBase> MakeIteratorInternal(
+      const string& prefix) const override {
+    return std::unique_ptr<IteratorBase>(
+        new Iterator({this, strings::StrCat(prefix, "::Prefetch")}));
+  }
+
+  const DataTypeVector& output_dtypes() const override {
+    return input_->output_dtypes();
+  }
+
+  const std::vector<PartialTensorShape>& output_shapes() const override {
+    return input_->output_shapes();
+  }
+
+  string DebugString() const override { return "PrefetchDatasetOp::Dataset"; }
 
  protected:
-  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
-                   DatasetBase** output) override {
-    int64 buffer_size;
-    OP_REQUIRES_OK(
-        ctx, ParseScalarArgument<int64>(ctx, "buffer_size", &buffer_size));
-    OP_REQUIRES(ctx,
-                buffer_size >= 0 || buffer_size == PrefetchAutotuner::kAutoTune,
-                errors::InvalidArgument("buffer_size must be >= 0"));
-
-    *output = new Dataset(ctx, input, buffer_size);
+  Status AsGraphDefInternal(SerializationContext* ctx,
+                            DatasetGraphDefBuilder* b,
+                            Node** output) const override {
+    Node* input_graph_node = nullptr;
+    TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
+    Node* buffer_size = nullptr;
+    TF_RETURN_IF_ERROR(b->AddScalar(buffer_size_, &buffer_size));
+    TF_RETURN_IF_ERROR(
+        b->AddDataset(this, {input_graph_node, buffer_size}, output));
+    return Status::OK();
   }
 
  private:
-  class Dataset : public GraphDatasetBase {
+  class Iterator : public DatasetIterator<Dataset> {
    public:
-    Dataset(OpKernelContext* ctx, const DatasetBase* input, int64 buffer_size)
-        : GraphDatasetBase(ctx), input_(input), buffer_size_(buffer_size) {
-      input_->Ref();
+    explicit Iterator(const Params& params)
+        : DatasetIterator<Dataset>(params),
+          auto_tuner_(params.dataset->buffer_size_) {}
+
+    ~Iterator() override {
+      // Signal the prefetch thread to terminate it. We will then
+      // join that thread when we delete `this->prefetch_thread_`.
+      //
+      // TODO(mrry): Replace this cancellation logic with a
+      // CancellationManager. The syntax would be more heavyweight,
+      // but it would be possible to thread a cancellation manager
+      // through the IteratorContext to upstream,
+      // potentially-blocking iterators, when we add these.
+      {
+        mutex_lock l(mu_);
+        cancelled_ = true;
+        cond_var_.notify_all();
+      }
     }
 
-    ~Dataset() override { input_->Unref(); }
-
-    std::unique_ptr<IteratorBase> MakeIteratorInternal(
-        const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::Prefetch")}));
+    Status Initialize(IteratorContext* ctx) override {
+      return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
     }
 
-    const DataTypeVector& output_dtypes() const override {
-      return input_->output_dtypes();
-    }
-    const std::vector<PartialTensorShape>& output_shapes() const override {
-      return input_->output_shapes();
-    }
+    Status GetNextInternal(IteratorContext* ctx,
+                           std::vector<Tensor>* out_tensors,
+                           bool* end_of_sequence) override {
+      {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(EnsurePrefetchThreadStarted(ctx));
+        // Wait until the next element in the buffer has been
+        // produced, or we are shutting down.
+        while (!cancelled_ && buffer_.empty() && !prefetch_thread_finished_ &&
+               auto_tuner_.buffer_limit() != 0) {
+          auto_tuner_.RecordEmpty();
+          cond_var_.wait(l);
+        }
 
-    string DebugString() const override { return "PrefetchDatasetOp::Dataset"; }
+        if (cancelled_) {
+          return errors::Cancelled(
+              "PrefetchDatasetOp::Dataset::Iterator::GetNext");
+        }
+
+        if (!buffer_.empty()) {
+          return Consume(out_tensors, end_of_sequence);
+        }
+
+        if (prefetch_thread_finished_) {
+          *end_of_sequence = true;
+          return Status::OK();
+        }
+
+        DCHECK_EQ(auto_tuner_.buffer_limit(), 0);
+      }
+
+      mutex_lock parent_l(parent_mu_);
+      mutex_lock l(mu_);
+      return input_impl_->GetNext(ctx, out_tensors, end_of_sequence);
+    }
 
    protected:
-    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
-                              Node** output) const override {
-      Node* input_graph_node = nullptr;
-      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
-      Node* buffer_size = nullptr;
-      TF_RETURN_IF_ERROR(b->AddScalar(buffer_size_, &buffer_size));
+    Status SaveInternal(IteratorStateWriter* writer) override {
+      // Acquire both locks to ensure that the prefetch thread and
+      // all GetNext threads are blocked.
+      mutex_lock parent_l(parent_mu_);
+      mutex_lock l(mu_);
+      TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
       TF_RETURN_IF_ERROR(
-          b->AddDataset(this, {input_graph_node, buffer_size}, output));
+          writer->WriteScalar(full_name("buffer_size"), buffer_.size()));
+      for (size_t i = 0; i < buffer_.size(); i++) {
+        auto& buffer_element = buffer_[i];
+        TF_RETURN_IF_ERROR(WriteStatus(writer, i, buffer_element.status));
+        if (buffer_element.status.ok()) {
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              full_name(strings::StrCat("buffer[", i, "].size")),
+              buffer_element.value.size()));
+          for (size_t j = 0; j < buffer_element.value.size(); j++) {
+            TF_RETURN_IF_ERROR(writer->WriteTensor(
+                full_name(strings::StrCat("buffer[", i, "][", j, "]")),
+                buffer_element.value[j]));
+          }
+        }
+      }
+      return Status::OK();
+    }
+
+    Status RestoreInternal(IteratorContext* ctx,
+                           IteratorStateReader* reader) override {
+      mutex_lock parent_l(parent_mu_);
+      mutex_lock l(mu_);
+      buffer_.clear();
+      TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
+      size_t buffer_size;
+      {
+        int64 temp;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("buffer_size"), &temp));
+        buffer_size = static_cast<size_t>(temp);
+      }
+      for (size_t i = 0; i < buffer_size; i++) {
+        buffer_.emplace_back();
+        auto& buffer_element = buffer_.back();
+        TF_RETURN_IF_ERROR(ReadStatus(reader, i, &buffer_element.status));
+        if (buffer_element.status.ok()) {
+          size_t value_size;
+          {
+            int64 temp;
+            TF_RETURN_IF_ERROR(reader->ReadScalar(
+                full_name(strings::StrCat("buffer[", i, "].size")), &temp));
+            value_size = static_cast<size_t>(temp);
+          }
+          buffer_element.value.reserve(value_size);
+          for (size_t j = 0; j < value_size; j++) {
+            buffer_element.value.emplace_back();
+            TF_RETURN_IF_ERROR(reader->ReadTensor(
+                full_name(strings::StrCat("buffer[", i, "][", j, "]")),
+                &buffer_element.value.back()));
+          }
+        }
+      }
       return Status::OK();
     }
 
    private:
-    class Iterator : public DatasetIterator<Dataset> {
-     public:
-      explicit Iterator(const Params& params)
-          : DatasetIterator<Dataset>(params),
-            auto_tuner_(params.dataset->buffer_size_) {}
+    // A buffer element comprises a status and (if that status is
+    // OK) a vector of tensors, representing an element of the input dataset.
+    struct BufferElement {
+      // The producer sets `status` if getting the input element fails.
+      Status status;
+      // The buffered data element.
+      std::vector<Tensor> value;
+    };
 
-      ~Iterator() override {
-        // Signal the prefetch thread to terminate it. We will then
-        // join that thread when we delete `this->prefetch_thread_`.
-        //
-        // TODO(mrry): Replace this cancellation logic with a
-        // CancellationManager. The syntax would be more heavyweight,
-        // but it would be possible to thread a cancellation manager
-        // through the IteratorContext to upstream,
-        // potentially-blocking iterators, when we add these.
+    Status Consume(std::vector<Tensor>* out_tensors, bool* end_of_sequence)
+        EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+      // A new element is available. Forward the status from computing it, and
+      // (if we successfully got an element) the output values.
+      Status s = buffer_.front().status;
+      if (s.ok()) {
+        *out_tensors = std::move(buffer_.front().value);
+      }
+      buffer_.pop_front();
+      *end_of_sequence = false;
+
+      // Wake the prefetch thread, in case it has been waiting for space
+      // in the buffer. Also wake up threads from other calls to GetNext.
+      //
+      // TODO(mrry): Consider using different condition variables for
+      // GetNext and Prefetch.
+      cond_var_.notify_all();
+      return s;
+    }
+
+    Status EnsurePrefetchThreadStarted(IteratorContext* ctx)
+        EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+      if (!prefetch_thread_) {
+        prefetch_thread_.reset(
+            ctx->env()->StartThread({}, "prefetch_thread",
+                                    std::bind(&Iterator::PrefetchThread, this,
+                                              new IteratorContext(*ctx))));
+      }
+      return Status::OK();
+    }
+
+    // Prefetches elements of the input, storing results in an internal
+    // buffer.
+    //
+    // It owns the iterator context passed to it.
+    void PrefetchThread(IteratorContext* ctx) {
+      std::unique_ptr<IteratorContext> cleanup(ctx);
+      while (true) {
+        std::vector<Tensor> value;
+
+        // 1. Wait for a slot in the buffer.
         {
           mutex_lock l(mu_);
-          cancelled_ = true;
-          cond_var_.notify_all();
-        }
-      }
-
-      Status Initialize(IteratorContext* ctx) override {
-        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
-      }
-
-      Status GetNextInternal(IteratorContext* ctx,
-                             std::vector<Tensor>* out_tensors,
-                             bool* end_of_sequence) override {
-        {
-          mutex_lock l(mu_);
-          TF_RETURN_IF_ERROR(EnsurePrefetchThreadStarted(ctx));
-          // Wait until the next element in the buffer has been
-          // produced, or we are shutting down.
-          while (!cancelled_ && buffer_.empty() && !prefetch_thread_finished_ &&
-                 auto_tuner_.buffer_limit() != 0) {
-            auto_tuner_.RecordEmpty();
+          while (!cancelled_ && buffer_.size() >= auto_tuner_.buffer_limit()) {
             cond_var_.wait(l);
           }
 
           if (cancelled_) {
-            return errors::Cancelled(
-                "PrefetchDatasetOp::Dataset::Iterator::GetNext");
-          }
-
-          if (!buffer_.empty()) {
-            return Consume(out_tensors, end_of_sequence);
-          }
-
-          if (prefetch_thread_finished_) {
-            *end_of_sequence = true;
-            return Status::OK();
-          }
-
-          DCHECK_EQ(auto_tuner_.buffer_limit(), 0);
-        }
-
-        mutex_lock parent_l(parent_mu_);
-        mutex_lock l(mu_);
-        return input_impl_->GetNext(ctx, out_tensors, end_of_sequence);
-      }
-
-     protected:
-      Status SaveInternal(IteratorStateWriter* writer) override {
-        // Acquire both locks to ensure that the prefetch thread and
-        // all GetNext threads are blocked.
-        mutex_lock parent_l(parent_mu_);
-        mutex_lock l(mu_);
-        TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
-        TF_RETURN_IF_ERROR(
-            writer->WriteScalar(full_name("buffer_size"), buffer_.size()));
-        for (size_t i = 0; i < buffer_.size(); i++) {
-          auto& buffer_element = buffer_[i];
-          TF_RETURN_IF_ERROR(WriteStatus(writer, i, buffer_element.status));
-          if (buffer_element.status.ok()) {
-            TF_RETURN_IF_ERROR(writer->WriteScalar(
-                full_name(strings::StrCat("buffer[", i, "].size")),
-                buffer_element.value.size()));
-            for (size_t j = 0; j < buffer_element.value.size(); j++) {
-              TF_RETURN_IF_ERROR(writer->WriteTensor(
-                  full_name(strings::StrCat("buffer[", i, "][", j, "]")),
-                  buffer_element.value[j]));
-            }
-          }
-        }
-        return Status::OK();
-      }
-
-      Status RestoreInternal(IteratorContext* ctx,
-                             IteratorStateReader* reader) override {
-        mutex_lock parent_l(parent_mu_);
-        mutex_lock l(mu_);
-        buffer_.clear();
-        TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
-        size_t buffer_size;
-        {
-          int64 temp;
-          TF_RETURN_IF_ERROR(
-              reader->ReadScalar(full_name("buffer_size"), &temp));
-          buffer_size = static_cast<size_t>(temp);
-        }
-        for (size_t i = 0; i < buffer_size; i++) {
-          buffer_.emplace_back();
-          auto& buffer_element = buffer_.back();
-          TF_RETURN_IF_ERROR(ReadStatus(reader, i, &buffer_element.status));
-          if (buffer_element.status.ok()) {
-            size_t value_size;
-            {
-              int64 temp;
-              TF_RETURN_IF_ERROR(reader->ReadScalar(
-                  full_name(strings::StrCat("buffer[", i, "].size")), &temp));
-              value_size = static_cast<size_t>(temp);
-            }
-            buffer_element.value.reserve(value_size);
-            for (size_t j = 0; j < value_size; j++) {
-              buffer_element.value.emplace_back();
-              TF_RETURN_IF_ERROR(reader->ReadTensor(
-                  full_name(strings::StrCat("buffer[", i, "][", j, "]")),
-                  &buffer_element.value.back()));
-            }
-          }
-        }
-        return Status::OK();
-      }
-
-     private:
-      // A buffer element comprises a status and (if that status is
-      // OK) a vector of tensors, representing an element of the input dataset.
-      struct BufferElement {
-        // The producer sets `status` if getting the input element fails.
-        Status status;
-        // The buffered data element.
-        std::vector<Tensor> value;
-      };
-
-      Status Consume(std::vector<Tensor>* out_tensors, bool* end_of_sequence)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-        // A new element is available. Forward the status from computing it, and
-        // (if we successfully got an element) the output values.
-        Status s = buffer_.front().status;
-        if (s.ok()) {
-          *out_tensors = std::move(buffer_.front().value);
-        }
-        buffer_.pop_front();
-        *end_of_sequence = false;
-
-        // Wake the prefetch thread, in case it has been waiting for space
-        // in the buffer. Also wake up threads from other calls to GetNext.
-        //
-        // TODO(mrry): Consider using different condition variables for
-        // GetNext and Prefetch.
-        cond_var_.notify_all();
-        return s;
-      }
-
-      Status EnsurePrefetchThreadStarted(IteratorContext* ctx)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-        if (!prefetch_thread_) {
-          prefetch_thread_.reset(
-              ctx->env()->StartThread({}, "prefetch_thread",
-                                      std::bind(&Iterator::PrefetchThread, this,
-                                                new IteratorContext(*ctx))));
-        }
-        return Status::OK();
-      }
-
-      // Prefetches elements of the input, storing results in an internal
-      // buffer.
-      //
-      // It owns the iterator context passed to it.
-      void PrefetchThread(IteratorContext* ctx) {
-        std::unique_ptr<IteratorContext> cleanup(ctx);
-        while (true) {
-          std::vector<Tensor> value;
-
-          // 1. Wait for a slot in the buffer.
-          {
-            mutex_lock l(mu_);
-            while (!cancelled_ &&
-                   buffer_.size() >= auto_tuner_.buffer_limit()) {
-              cond_var_.wait(l);
-            }
-
-            if (cancelled_) {
-              return;
-            }
-          }
-
-          // 2. Read the next element.
-          // Acquire the parent lock since we will be reading an element
-          // from the input iterator. Note that we do not wish to release
-          // this lock till we have added the fetched element to the
-          // `buffer_` else there will be local state that may be missed
-          // by SaveInternal.
-          mutex_lock parent_l(parent_mu_);
-          bool end_of_sequence;
-          BufferElement buffer_element;
-          buffer_element.status = input_impl_->GetNext(
-              ctx, &buffer_element.value, &end_of_sequence);
-          if (buffer_element.status.ok() && end_of_sequence) {
-            mutex_lock l(mu_);
-            prefetch_thread_finished_ = true;
-            cond_var_.notify_all();
             return;
           }
+        }
 
-          // 3. Signal that the element has been produced.
-          {
-            mutex_lock l(mu_);
-            buffer_.push_back(std::move(buffer_element));
-            cond_var_.notify_all();
-          }
+        // 2. Read the next element.
+        // Acquire the parent lock since we will be reading an element
+        // from the input iterator. Note that we do not wish to release
+        // this lock till we have added the fetched element to the
+        // `buffer_` else there will be local state that may be missed
+        // by SaveInternal.
+        mutex_lock parent_l(parent_mu_);
+        bool end_of_sequence;
+        BufferElement buffer_element;
+        buffer_element.status =
+            input_impl_->GetNext(ctx, &buffer_element.value, &end_of_sequence);
+        if (buffer_element.status.ok() && end_of_sequence) {
+          mutex_lock l(mu_);
+          prefetch_thread_finished_ = true;
+          cond_var_.notify_all();
+          return;
+        }
+
+        // 3. Signal that the element has been produced.
+        {
+          mutex_lock l(mu_);
+          buffer_.push_back(std::move(buffer_element));
+          cond_var_.notify_all();
         }
       }
+    }
 
-      Status WriteStatus(IteratorStateWriter* writer, size_t index,
-                         const Status& status) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-        TF_RETURN_IF_ERROR(writer->WriteScalar(
-            CodeKey(index), static_cast<int64>(status.code())));
-        if (!status.ok()) {
-          TF_RETURN_IF_ERROR(writer->WriteScalar(ErrorMessageKey(index),
-                                                 status.error_message()));
-        }
-        return Status::OK();
+    Status WriteStatus(IteratorStateWriter* writer, size_t index,
+                       const Status& status) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+      TF_RETURN_IF_ERROR(writer->WriteScalar(
+          CodeKey(index), static_cast<int64>(status.code())));
+      if (!status.ok()) {
+        TF_RETURN_IF_ERROR(writer->WriteScalar(ErrorMessageKey(index),
+                                               status.error_message()));
       }
+      return Status::OK();
+    }
 
-      Status ReadStatus(IteratorStateReader* reader, size_t index,
-                        Status* status) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-        int64 code_int;
-        TF_RETURN_IF_ERROR(reader->ReadScalar(CodeKey(index), &code_int));
-        error::Code code = static_cast<error::Code>(code_int);
+    Status ReadStatus(IteratorStateReader* reader, size_t index, Status* status)
+        EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+      int64 code_int;
+      TF_RETURN_IF_ERROR(reader->ReadScalar(CodeKey(index), &code_int));
+      error::Code code = static_cast<error::Code>(code_int);
 
-        if (code != error::Code::OK) {
-          string error_message;
-          TF_RETURN_IF_ERROR(
-              reader->ReadScalar(ErrorMessageKey(index), &error_message));
-          *status = Status(code, error_message);
-        } else {
-          *status = Status::OK();
-        }
-        return Status::OK();
+      if (code != error::Code::OK) {
+        string error_message;
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(ErrorMessageKey(index), &error_message));
+        *status = Status(code, error_message);
+      } else {
+        *status = Status::OK();
       }
+      return Status::OK();
+    }
 
-      string CodeKey(size_t index) {
-        return full_name(strings::StrCat("status[", index, "].code"));
-      }
+    string CodeKey(size_t index) {
+      return full_name(strings::StrCat("status[", index, "].code"));
+    }
 
-      string ErrorMessageKey(size_t index) {
-        return full_name(strings::StrCat("status[", index, "].error_message"));
-      }
+    string ErrorMessageKey(size_t index) {
+      return full_name(strings::StrCat("status[", index, "].error_message"));
+    }
 
-      // This mutex is used to ensure exclusivity between multiple threads
-      // reading/writing this iterator's local state.
-      mutex mu_;
-      // This mutex is used to ensure exclusivity between multiple threads
-      // accessing the parent iterator. We keep this separate from `mu_` to
-      // allow prefetching to run in parallel with GetNext calls.
-      mutex parent_mu_ ACQUIRED_BEFORE(mu_);
-      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(parent_mu_);
-      condition_variable cond_var_;
-      PrefetchAutotuner auto_tuner_ GUARDED_BY(mu_);
-      std::deque<BufferElement> buffer_ GUARDED_BY(mu_);
-      std::unique_ptr<Thread> prefetch_thread_ GUARDED_BY(mu_);
-      bool cancelled_ GUARDED_BY(mu_) = false;
-      bool prefetch_thread_finished_ GUARDED_BY(mu_) = false;
-    };
-
-    const DatasetBase* const input_;
-    const int64 buffer_size_;
+    // This mutex is used to ensure exclusivity between multiple threads
+    // reading/writing this iterator's local state.
+    mutex mu_;
+    // This mutex is used to ensure exclusivity between multiple threads
+    // accessing the parent iterator. We keep this separate from `mu_` to
+    // allow prefetching to run in parallel with GetNext calls.
+    mutex parent_mu_ ACQUIRED_BEFORE(mu_);
+    std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(parent_mu_);
+    condition_variable cond_var_;
+    PrefetchAutotuner auto_tuner_ GUARDED_BY(mu_);
+    std::deque<BufferElement> buffer_ GUARDED_BY(mu_);
+    std::unique_ptr<Thread> prefetch_thread_ GUARDED_BY(mu_);
+    bool cancelled_ GUARDED_BY(mu_) = false;
+    bool prefetch_thread_finished_ GUARDED_BY(mu_) = false;
   };
+  const DatasetBase* const input_;
+  const int64 buffer_size_;
 };
 
+void PrefetchDatasetOp::MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                                    DatasetBase** output) {
+  int64 buffer_size;
+  OP_REQUIRES_OK(ctx,
+                 ParseScalarArgument<int64>(ctx, "buffer_size", &buffer_size));
+  OP_REQUIRES(ctx,
+              buffer_size >= 0 || buffer_size == PrefetchAutotuner::kAutoTune,
+              errors::InvalidArgument("buffer_size must be >= 0"));
+
+  *output = new Dataset(ctx, input, buffer_size);
+}
+
 REGISTER_KERNEL_BUILDER(Name("PrefetchDataset").Device(DEVICE_CPU),
                         PrefetchDatasetOp);
 REGISTER_KERNEL_BUILDER(Name("PrefetchDataset")
@@ -363,6 +354,4 @@
                             .HostMemory("input_dataset")
                             .HostMemory("handle"),
                         PrefetchDatasetOp);
-}  // namespace
-
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/prefetch_dataset_op.h b/tensorflow/core/kernels/data/prefetch_dataset_op.h
new file mode 100644
index 0000000..c40c4b0
--- /dev/null
+++ b/tensorflow/core/kernels/data/prefetch_dataset_op.h
@@ -0,0 +1,39 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_DATA_PREFETCH_DATASET_OP_H_
+#define TENSORFLOW_CORE_KERNELS_DATA_PREFETCH_DATASET_OP_H_
+
+#include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/kernels/data/prefetch_autotuner.h"
+
+namespace tensorflow {
+
+class PrefetchDatasetOp : public UnaryDatasetOpKernel {
+ public:
+  explicit PrefetchDatasetOp(OpKernelConstruction* ctx)
+      : UnaryDatasetOpKernel(ctx) {}
+
+ protected:
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override;
+
+ private:
+  class Dataset;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_DATA_PREFETCH_DATASET_OP_H_
diff --git a/tensorflow/core/kernels/data/random_dataset_op.cc b/tensorflow/core/kernels/data/random_dataset_op.cc
index ff166c3..7817170 100644
--- a/tensorflow/core/kernels/data/random_dataset_op.cc
+++ b/tensorflow/core/kernels/data/random_dataset_op.cc
@@ -49,10 +49,10 @@
   }
 
  private:
-  class Dataset : public GraphDatasetBase {
+  class Dataset : public DatasetBase {
    public:
     Dataset(OpKernelContext* ctx, int64 seed, int64 seed2)
-        : GraphDatasetBase(ctx), seed_(seed), seed2_(seed2) {}
+        : DatasetBase(DatasetContext(ctx)), seed_(seed), seed2_(seed2) {}
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
@@ -77,7 +77,8 @@
     }
 
    protected:
-    Status AsGraphDefInternal(DatasetGraphDefBuilder* b,
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
                               Node** output) const override {
       Node* seed = nullptr;
       Node* seed2 = nullptr;
diff --git a/tensorflow/core/kernels/data/range_dataset_op.cc b/tensorflow/core/kernels/data/range_dataset_op.cc
index 0b5c814..aa38775 100644
--- a/tensorflow/core/kernels/data/range_dataset_op.cc
+++ b/tensorflow/core/kernels/data/range_dataset_op.cc
@@ -43,10 +43,13 @@
   }
 
  private:
-  class Dataset : public GraphDatasetBase {
+  class Dataset : public DatasetBase {
    public:
     Dataset(OpKernelContext* ctx, int64 start, int64 stop, int64 step)
-        : GraphDatasetBase(ctx), start_(start), stop_(stop), step_(step) {}
+        : DatasetBase(DatasetContext(ctx)),
+          start_(start),
+          stop_(stop),
+          step_(step) {}
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
@@ -71,7 +74,8 @@
     }
 
    protected:
-    Status AsGraphDefInternal(DatasetGraphDefBuilder* b,
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
                               Node** output) const override {
       Node* start = nullptr;
       Node* stop = nullptr;
diff --git a/tensorflow/core/kernels/data/reader_dataset_ops.cc b/tensorflow/core/kernels/data/reader_dataset_ops.cc
index 29654b9..086b552 100644
--- a/tensorflow/core/kernels/data/reader_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/reader_dataset_ops.cc
@@ -78,12 +78,12 @@
   }
 
  private:
-  class Dataset : public GraphDatasetBase {
+  class Dataset : public DatasetBase {
    public:
     Dataset(OpKernelContext* ctx, std::vector<string> filenames,
             const string& compression_type,
             const io::ZlibCompressionOptions& options)
-        : GraphDatasetBase(ctx),
+        : DatasetBase(DatasetContext(ctx)),
           filenames_(std::move(filenames)),
           compression_type_(compression_type),
           use_compression_(!compression_type.empty()),
@@ -109,7 +109,8 @@
     string DebugString() const override { return "TextLineDatasetOp::Dataset"; }
 
    protected:
-    Status AsGraphDefInternal(DatasetGraphDefBuilder* b,
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
                               Node** output) const override {
       Node* filenames = nullptr;
       Node* compression_type = nullptr;
@@ -311,12 +312,12 @@
   }
 
  private:
-  class Dataset : public GraphDatasetBase {
+  class Dataset : public DatasetBase {
    public:
     explicit Dataset(OpKernelContext* ctx, std::vector<string> filenames,
                      int64 header_bytes, int64 record_bytes, int64 footer_bytes,
                      int64 buffer_size)
-        : GraphDatasetBase(ctx),
+        : DatasetBase(DatasetContext(ctx)),
           filenames_(std::move(filenames)),
           header_bytes_(header_bytes),
           record_bytes_(record_bytes),
@@ -345,7 +346,8 @@
     }
 
    protected:
-    Status AsGraphDefInternal(DatasetGraphDefBuilder* b,
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
                               Node** output) const override {
       Node* filenames = nullptr;
       Node* header_bytes = nullptr;
@@ -529,11 +531,11 @@
   }
 
  private:
-  class Dataset : public GraphDatasetBase {
+  class Dataset : public DatasetBase {
    public:
     explicit Dataset(OpKernelContext* ctx, std::vector<string> filenames,
                      const string& compression_type, int64 buffer_size)
-        : GraphDatasetBase(ctx),
+        : DatasetBase(DatasetContext(ctx)),
           filenames_(std::move(filenames)),
           compression_type_(compression_type),
           options_(io::RecordReaderOptions::CreateRecordReaderOptions(
@@ -563,7 +565,8 @@
     string DebugString() const override { return "TFRecordDatasetOp::Dataset"; }
 
    protected:
-    Status AsGraphDefInternal(DatasetGraphDefBuilder* b,
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
                               Node** output) const override {
       Node* filenames = nullptr;
       TF_RETURN_IF_ERROR(b->AddVector(filenames_, &filenames));
diff --git a/tensorflow/core/kernels/data/repeat_dataset_op.cc b/tensorflow/core/kernels/data/repeat_dataset_op.cc
index 6b3f4ed..5e9ace3 100644
--- a/tensorflow/core/kernels/data/repeat_dataset_op.cc
+++ b/tensorflow/core/kernels/data/repeat_dataset_op.cc
@@ -39,10 +39,10 @@
   }
 
  private:
-  class Dataset : public GraphDatasetBase {
+  class Dataset : public DatasetBase {
    public:
     Dataset(OpKernelContext* ctx, int64 count, const DatasetBase* input)
-        : GraphDatasetBase(ctx), count_(count), input_(input) {
+        : DatasetBase(DatasetContext(ctx)), count_(count), input_(input) {
       input_->Ref();
     }
 
@@ -72,10 +72,11 @@
     string DebugString() const override { return "RepeatDatasetOp::Dataset"; }
 
    protected:
-    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
                               Node** output) const override {
       Node* input_graph_node = nullptr;
-      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
       Node* count = nullptr;
       TF_RETURN_IF_ERROR(b->AddScalar(count_, &count));
       TF_RETURN_IF_ERROR(
@@ -145,7 +146,7 @@
           TF_RETURN_IF_ERROR(
               writer->WriteScalar(full_name("input_impl_empty"), ""));
         } else {
-          TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+          TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
         }
         return Status::OK();
       }
@@ -155,7 +156,7 @@
         mutex_lock l(mu_);
         TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("i"), &i_));
         if (!reader->Contains(full_name("input_impl_empty"))) {
-          TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+          TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
         } else {
           input_impl_.reset();
         }
@@ -205,7 +206,7 @@
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
         if (input_impl_)
-          TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+          TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
         else
           TF_RETURN_IF_ERROR(
               writer->WriteScalar(full_name("uninitialized"), ""));
@@ -220,7 +221,7 @@
         } else {
           TF_RETURN_IF_ERROR(
               dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
-          TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+          TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
         }
         return Status::OK();
       }
diff --git a/tensorflow/core/kernels/data/scan_dataset_op.cc b/tensorflow/core/kernels/data/scan_dataset_op.cc
index a3b2001..e4cb31e 100644
--- a/tensorflow/core/kernels/data/scan_dataset_op.cc
+++ b/tensorflow/core/kernels/data/scan_dataset_op.cc
@@ -69,7 +69,7 @@
   }
 
  private:
-  class Dataset : public GraphDatasetBase {
+  class Dataset : public DatasetBase {
    public:
     Dataset(OpKernelContext* ctx, const DatasetBase* input,
             const NameAttrList& func, std::vector<Tensor> initial_state,
@@ -77,7 +77,7 @@
             const DataTypeVector& state_types,
             const DataTypeVector& output_types,
             const std::vector<PartialTensorShape>& output_shapes)
-        : GraphDatasetBase(ctx),
+        : DatasetBase(DatasetContext(ctx)),
           input_(input),
           func_(func),
           initial_state_(std::move(initial_state)),
@@ -106,11 +106,12 @@
     string DebugString() const override { return "ScanDatasetOp::Dataset"; }
 
    protected:
-    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
                               Node** output) const override {
-      TF_RETURN_IF_ERROR(b->AddFunction(ctx, func_.name()));
+      TF_RETURN_IF_ERROR(b->AddFunction(ctx->flib_def(), func_.name()));
       Node* input_node;
-      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_node));
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_node));
       std::vector<Node*> initial_state_nodes;
       initial_state_nodes.reserve(initial_state_.size());
       for (const Tensor& t : initial_state_) {
@@ -222,7 +223,7 @@
      protected:
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
-        TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
         if (!state_.empty()) {
           TF_RETURN_IF_ERROR(
               writer->WriteScalar(full_name("state_size"), state_.size()));
@@ -237,7 +238,7 @@
       Status RestoreInternal(IteratorContext* ctx,
                              IteratorStateReader* reader) override {
         mutex_lock l(mu_);
-        TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
         if (reader->Contains(full_name("state_size"))) {
           int64 size;
           TF_RETURN_IF_ERROR(
diff --git a/tensorflow/core/kernels/data/shuffle_dataset_op.cc b/tensorflow/core/kernels/data/shuffle_dataset_op.cc
index b859295..93a4376 100644
--- a/tensorflow/core/kernels/data/shuffle_dataset_op.cc
+++ b/tensorflow/core/kernels/data/shuffle_dataset_op.cc
@@ -22,6 +22,7 @@
 #include "tensorflow/core/lib/random/philox_random.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/lib/random/random_distributions.h"
+#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 
@@ -39,11 +40,11 @@
 
  protected:
   // Abstract base dataset that implements a shuffling iterator.
-  class ShuffleDatasetBase : public GraphDatasetBase {
+  class ShuffleDatasetBase : public DatasetBase {
    public:
     ShuffleDatasetBase(OpKernelContext* ctx, const DatasetBase* input,
                        int64 buffer_size, int64 count)
-        : GraphDatasetBase(ctx),
+        : DatasetBase(DatasetContext(ctx)),
           input_(input),
           buffer_size_(buffer_size),
           count_(count) {
@@ -75,7 +76,7 @@
             parent_generator_(seed, seed2),
             generator_(&parent_generator_) {
         buffer_.reset(new std::vector<Tensor>[params.dataset->buffer_size_]);
-        slices_.emplace_back(new Slice{0, 0});
+        slices_.push_back(MakeUnique<Slice>(0, 0));
       }
 
       Status GetNextInternal(IteratorContext* ctx,
@@ -118,7 +119,7 @@
             }
             epoch_++;
             int64 n = slices_.back()->end;
-            slices_.emplace_back(new Slice{n, n});
+            slices_.push_back(MakeUnique<Slice>(n, n));
             TF_RETURN_IF_ERROR(this->dataset()->input_->MakeIterator(
                 ctx, this->prefix(), &input_impl_));
           }
@@ -178,7 +179,7 @@
           TF_RETURN_IF_ERROR(writer->WriteScalar(
               this->full_name("end_of_input_sequence"), ""));
         } else {
-          TF_RETURN_IF_ERROR(this->SaveParent(writer, input_impl_));
+          TF_RETURN_IF_ERROR(this->SaveInput(writer, input_impl_));
         }
 
         // Save the epoch counter, buffer, and buffer slices.
@@ -226,7 +227,7 @@
         if (!reader->Contains(this->full_name("end_of_input_sequence"))) {
           TF_RETURN_IF_ERROR(this->dataset()->input_->MakeIterator(
               ctx, this->prefix(), &input_impl_));
-          TF_RETURN_IF_ERROR(this->RestoreParent(ctx, reader, input_impl_));
+          TF_RETURN_IF_ERROR(this->RestoreInput(ctx, reader, input_impl_));
         } else {
           input_impl_.reset();
         }
@@ -251,7 +252,7 @@
           int64 end;
           TF_RETURN_IF_ERROR(reader->ReadScalar(
               this->full_name(strings::StrCat("slices_end_", i)), &end));
-          slices_.emplace_back(new Slice{start, end});
+          slices_.push_back(MakeUnique<Slice>(start, end));
           for (size_t j = start; j < end; ++j) {
             size_t index = j % this->dataset()->buffer_size_;
             int64 list_size;
@@ -428,11 +429,12 @@
       }
     };
 
-    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
                               Node** output) const override {
       mutex_lock l(mu_);
       Node* input_graph_node = nullptr;
-      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
       Node* buffer_size = nullptr;
       Node* seed = nullptr;
       Node* seed2 = nullptr;
@@ -498,10 +500,11 @@
     }
 
    protected:
-    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
                               Node** output) const override {
       Node* input_graph_node = nullptr;
-      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
       Node* buffer_size = nullptr;
       Node* seed = nullptr;
       Node* seed2 = nullptr;
@@ -583,10 +586,11 @@
     }
 
    protected:
-    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
                               Node** output) const override {
       Node* input_graph_node = nullptr;
-      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
       Node* buffer_size = nullptr;
       Node* seed = nullptr;
       Node* seed2 = nullptr;
diff --git a/tensorflow/core/kernels/data/skip_dataset_op.cc b/tensorflow/core/kernels/data/skip_dataset_op.cc
index b84afa3..fe7ef38 100644
--- a/tensorflow/core/kernels/data/skip_dataset_op.cc
+++ b/tensorflow/core/kernels/data/skip_dataset_op.cc
@@ -38,10 +38,10 @@
   }
 
  private:
-  class Dataset : public GraphDatasetBase {
+  class Dataset : public DatasetBase {
    public:
     Dataset(OpKernelContext* ctx, int64 count, const DatasetBase* input)
-        : GraphDatasetBase(ctx), count_(count), input_(input) {
+        : DatasetBase(DatasetContext(ctx)), count_(count), input_(input) {
       input_->Ref();
     }
 
@@ -68,10 +68,11 @@
     string DebugString() const override { return "SkipDatasetOp::Dataset"; }
 
    protected:
-    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
                               Node** output) const override {
       Node* input_graph_node = nullptr;
-      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
       Node* count = nullptr;
       TF_RETURN_IF_ERROR(b->AddScalar(count_, &count));
       TF_RETURN_IF_ERROR(
@@ -152,7 +153,7 @@
         mutex_lock l(mu_);
         TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("i"), i_));
         if (input_impl_) {
-          TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+          TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
         } else {
           TF_RETURN_IF_ERROR(
               writer->WriteScalar(full_name("input_impl_empty"), ""));
@@ -165,7 +166,7 @@
         mutex_lock l(mu_);
         TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("i"), &i_));
         if (!reader->Contains(full_name("input_impl_empty"))) {
-          TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+          TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
         } else {
           input_impl_.reset();
         }
diff --git a/tensorflow/core/kernels/data/slide_dataset_op.cc b/tensorflow/core/kernels/data/slide_dataset_op.cc
index 5765c61..14df3a6 100644
--- a/tensorflow/core/kernels/data/slide_dataset_op.cc
+++ b/tensorflow/core/kernels/data/slide_dataset_op.cc
@@ -63,11 +63,11 @@
   }
 
  private:
-  class Dataset : public GraphDatasetBase {
+  class Dataset : public DatasetBase {
    public:
     Dataset(OpKernelContext* ctx, int64 window_size, int64 window_shift,
             int64 window_stride, const DatasetBase* input)
-        : GraphDatasetBase(ctx),
+        : DatasetBase(DatasetContext(ctx)),
           window_size_(window_size),
           window_shift_(window_shift),
           window_stride_(window_stride),
@@ -104,10 +104,11 @@
     }
 
    protected:
-    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
                               Node** output) const override {
       Node* input_graph_node = nullptr;
-      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
       Node* window_size = nullptr;
       Node* window_shift = nullptr;
       Node* window_stride = nullptr;
@@ -228,7 +229,7 @@
           TF_RETURN_IF_ERROR(
               writer->WriteScalar(full_name("input_impl_empty"), ""));
         } else {
-          TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+          TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
         }
         // Save buffer.
         TF_RETURN_IF_ERROR(writer->WriteScalar(strings::StrCat("buffer_size"),
@@ -248,7 +249,7 @@
                              IteratorStateReader* reader) override {
         mutex_lock l(mu_);
         if (!reader->Contains(full_name("input_impl_empty"))) {
-          TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+          TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
         } else {
           input_impl_.reset();
         }
diff --git a/tensorflow/core/kernels/data/sparse_tensor_slice_dataset_op.cc b/tensorflow/core/kernels/data/sparse_tensor_slice_dataset_op.cc
index b5dff48..e526578 100644
--- a/tensorflow/core/kernels/data/sparse_tensor_slice_dataset_op.cc
+++ b/tensorflow/core/kernels/data/sparse_tensor_slice_dataset_op.cc
@@ -28,11 +28,11 @@
 // description of the following op.
 
 template <typename T>
-class Dataset : public GraphDatasetBase {
+class Dataset : public DatasetBase {
  public:
   explicit Dataset(OpKernelContext* ctx,
                    const sparse::SparseTensor& sparse_tensor)
-      : GraphDatasetBase(ctx),
+      : DatasetBase(DatasetContext(ctx)),
         sparse_tensor_(sparse_tensor),
         dtypes_({DT_INT64, sparse_tensor.dtype(), DT_INT64}),
         shapes_({{-1, sparse_tensor.dims() - 1},
@@ -55,7 +55,8 @@
   }
 
  protected:
-  Status AsGraphDefInternal(DatasetGraphDefBuilder* b,
+  Status AsGraphDefInternal(SerializationContext* ctx,
+                            DatasetGraphDefBuilder* b,
                             Node** output) const override {
     Node* indices_node;
     TF_RETURN_IF_ERROR(b->AddTensor(sparse_tensor_.indices(), &indices_node));
diff --git a/tensorflow/core/kernels/data/sql_dataset_ops.cc b/tensorflow/core/kernels/data/sql_dataset_ops.cc
index 16652e7..2aa153f 100644
--- a/tensorflow/core/kernels/data/sql_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/sql_dataset_ops.cc
@@ -75,13 +75,13 @@
   }
 
  private:
-  class Dataset : public GraphDatasetBase {
+  class Dataset : public DatasetBase {
    public:
     Dataset(OpKernelContext* ctx, const string& driver_name,
             const string& data_source_name, const string& query,
             const DataTypeVector& output_types,
             const std::vector<PartialTensorShape>& output_shapes)
-        : GraphDatasetBase(ctx),
+        : DatasetBase(DatasetContext(ctx)),
           driver_name_(driver_name),
           data_source_name_(data_source_name),
           query_(query),
@@ -105,7 +105,8 @@
     string DebugString() const override { return "SqlDatasetOp::Dataset"; }
 
    protected:
-    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
                               Node** output) const override {
       Node* driver_name_node;
       TF_RETURN_IF_ERROR(b->AddScalar(driver_name_, &driver_name_node));
diff --git a/tensorflow/core/kernels/data/stats_aggregator_dataset_op.cc b/tensorflow/core/kernels/data/stats_aggregator_dataset_op.cc
index 2ff90d7..75af73d 100644
--- a/tensorflow/core/kernels/data/stats_aggregator_dataset_op.cc
+++ b/tensorflow/core/kernels/data/stats_aggregator_dataset_op.cc
@@ -37,11 +37,11 @@
   }
 
  private:
-  class Dataset : public GraphDatasetBase {
+  class Dataset : public DatasetBase {
    public:
     explicit Dataset(OpKernelContext* ctx, const DatasetBase* input,
                      StatsAggregatorResource* stats_aggregator_resource)
-        : GraphDatasetBase(ctx),
+        : DatasetBase(DatasetContext(ctx)),
           input_(input),
           stats_aggregator_resource_(stats_aggregator_resource) {
       input_->Ref();
@@ -71,11 +71,11 @@
     }
 
    protected:
-    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
                               Node** output) const override {
-      return errors::Unimplemented(
-          "Cannot currently serialize the `stats_aggregator` for a "
-          "SetStatsAggregatorDataset.");
+      return errors::Unimplemented(DebugString(),
+                                   " does not support serialization");
     }
 
    private:
@@ -111,14 +111,14 @@
      protected:
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
-        TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
         return Status::OK();
       }
 
       Status RestoreInternal(IteratorContext* ctx,
                              IteratorStateReader* reader) override {
         mutex_lock l(mu_);
-        TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
         return Status::OK();
       }
 
diff --git a/tensorflow/core/kernels/data/stats_dataset_ops.cc b/tensorflow/core/kernels/data/stats_dataset_ops.cc
index 58ec3d4..52753a3 100644
--- a/tensorflow/core/kernels/data/stats_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/stats_dataset_ops.cc
@@ -49,10 +49,12 @@
   }
 
  private:
-  class Dataset : public GraphDatasetBase {
+  class Dataset : public DatasetBase {
    public:
     explicit Dataset(OpKernelContext* ctx, const DatasetBase* input, string tag)
-        : GraphDatasetBase(ctx), input_(input), tag_(std::move(tag)) {
+        : DatasetBase(DatasetContext(ctx)),
+          input_(input),
+          tag_(std::move(tag)) {
       input_->Ref();
     }
 
@@ -76,10 +78,11 @@
     }
 
    protected:
-    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
                               Node** output) const override {
       Node* input_node;
-      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_node));
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_node));
       Node* tag_node;
       TF_RETURN_IF_ERROR(b->AddScalar(tag_, &tag_node));
       TF_RETURN_IF_ERROR(b->AddDataset(this, {input_node, tag_node}, output));
@@ -114,14 +117,14 @@
      protected:
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
-        TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
         return Status::OK();
       }
 
       Status RestoreInternal(IteratorContext* ctx,
                              IteratorStateReader* reader) override {
         mutex_lock l(mu_);
-        TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
         return Status::OK();
       }
 
@@ -148,10 +151,12 @@
   }
 
  private:
-  class Dataset : public GraphDatasetBase {
+  class Dataset : public DatasetBase {
    public:
     explicit Dataset(OpKernelContext* ctx, const DatasetBase* input, string tag)
-        : GraphDatasetBase(ctx), input_(input), tag_(std::move(tag)) {
+        : DatasetBase(DatasetContext(ctx)),
+          input_(input),
+          tag_(std::move(tag)) {
       input_->Ref();
     }
 
@@ -175,10 +180,11 @@
     }
 
    protected:
-    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
                               Node** output) const override {
       Node* input_node;
-      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_node));
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_node));
       Node* tag_node;
       TF_RETURN_IF_ERROR(b->AddScalar(tag_, &tag_node));
       TF_RETURN_IF_ERROR(b->AddDataset(this, {input_node, tag_node}, output));
@@ -215,14 +221,14 @@
      protected:
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
-        TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
         return Status::OK();
       }
 
       Status RestoreInternal(IteratorContext* ctx,
                              IteratorStateReader* reader) override {
         mutex_lock l(mu_);
-        TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
         return Status::OK();
       }
 
@@ -253,10 +259,12 @@
   }
 
  private:
-  class Dataset : public GraphDatasetBase {
+  class Dataset : public DatasetBase {
    public:
     explicit Dataset(OpKernelContext* ctx, const DatasetBase* input, string tag)
-        : GraphDatasetBase(ctx), input_(input), tag_(std::move(tag)) {
+        : DatasetBase(DatasetContext(ctx)),
+          input_(input),
+          tag_(std::move(tag)) {
       input_->Ref();
     }
 
@@ -280,10 +288,11 @@
     }
 
    protected:
-    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
                               Node** output) const override {
       Node* input_node;
-      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_node));
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_node));
       Node* tag_node;
       TF_RETURN_IF_ERROR(b->AddScalar(tag_, &tag_node));
       TF_RETURN_IF_ERROR(b->AddDataset(this, {input_node, tag_node}, output));
@@ -406,14 +415,14 @@
      protected:
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
-        TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
         return Status::OK();
       }
 
       Status RestoreInternal(IteratorContext* ctx,
                              IteratorStateReader* reader) override {
         mutex_lock l(mu_);
-        TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
         return Status::OK();
       }
 
diff --git a/tensorflow/core/kernels/data/take_dataset_op.cc b/tensorflow/core/kernels/data/take_dataset_op.cc
index 3d29221..e5c237d 100644
--- a/tensorflow/core/kernels/data/take_dataset_op.cc
+++ b/tensorflow/core/kernels/data/take_dataset_op.cc
@@ -38,10 +38,10 @@
   }
 
  private:
-  class Dataset : public GraphDatasetBase {
+  class Dataset : public DatasetBase {
    public:
     Dataset(OpKernelContext* ctx, int64 count, const DatasetBase* input)
-        : GraphDatasetBase(ctx), count_(count), input_(input) {
+        : DatasetBase(DatasetContext(ctx)), count_(count), input_(input) {
       input_->Ref();
     }
 
@@ -69,10 +69,11 @@
     string DebugString() const override { return "TakeDatasetOp::Dataset"; }
 
    protected:
-    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
                               Node** output) const override {
       Node* input_graph_node = nullptr;
-      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
       Node* count = nullptr;
       TF_RETURN_IF_ERROR(b->AddScalar(count_, &count));
       TF_RETURN_IF_ERROR(
@@ -139,7 +140,7 @@
         mutex_lock l(mu_);
         TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("i"), i_));
         if (input_impl_) {
-          TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+          TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
         } else {
           TF_RETURN_IF_ERROR(
               writer->WriteScalar(full_name("input_impl_empty"), ""));
@@ -152,7 +153,7 @@
         mutex_lock l(mu_);
         TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("i"), &i_));
         if (!reader->Contains(full_name("input_impl_empty"))) {
-          TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+          TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
         } else {
           input_impl_.reset();
         }
diff --git a/tensorflow/core/kernels/data/tensor_dataset_op.cc b/tensorflow/core/kernels/data/tensor_dataset_op.cc
index 36fc434..fc21c32 100644
--- a/tensorflow/core/kernels/data/tensor_dataset_op.cc
+++ b/tensorflow/core/kernels/data/tensor_dataset_op.cc
@@ -43,10 +43,10 @@
   }
 
  private:
-  class Dataset : public GraphDatasetBase {
+  class Dataset : public DatasetBase {
    public:
     Dataset(OpKernelContext* ctx, std::vector<Tensor> tensors)
-        : GraphDatasetBase(ctx), tensors_(std::move(tensors)) {
+        : DatasetBase(DatasetContext(ctx)), tensors_(std::move(tensors)) {
       for (const Tensor& t : tensors_) {
         dtypes_.push_back(t.dtype());
         shapes_.emplace_back(t.shape().dim_sizes());
@@ -67,7 +67,8 @@
     string DebugString() const override { return "TensorDatasetOp::Dataset"; }
 
    protected:
-    Status AsGraphDefInternal(DatasetGraphDefBuilder* b,
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
                               Node** output) const override {
       std::vector<Node*> components;
       components.reserve(tensors_.size());
diff --git a/tensorflow/core/kernels/data/tensor_queue_dataset_op.cc b/tensorflow/core/kernels/data/tensor_queue_dataset_op.cc
index 29b4c90..ccd5e60 100644
--- a/tensorflow/core/kernels/data/tensor_queue_dataset_op.cc
+++ b/tensorflow/core/kernels/data/tensor_queue_dataset_op.cc
@@ -61,14 +61,14 @@
 
 class EnqueueInQueueDatasetOp;
 
-class PrependFromQueueAndPaddedBatchDataset : public GraphDatasetBase {
+class PrependFromQueueAndPaddedBatchDataset : public DatasetBase {
  public:
   PrependFromQueueAndPaddedBatchDataset(
       OpKernelContext* ctx, const int64 batch_size, const DatasetBase* input,
       const DataTypeVector& dtypes,
       const std::vector<PartialTensorShape>& shapes,
       std::vector<Tensor> padding_values)
-      : GraphDatasetBase(ctx),
+      : DatasetBase(DatasetContext(ctx)),
         batch_size_(batch_size),
         input_(input),
         dtypes_(dtypes),
@@ -99,10 +99,11 @@
   }
 
  protected:
-  Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+  Status AsGraphDefInternal(SerializationContext* ctx,
+                            DatasetGraphDefBuilder* b,
                             Node** output) const override {
     Node* input_graph = nullptr;
-    TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph));
+    TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph));
     Node* batch_size = nullptr;
     TF_RETURN_IF_ERROR(b->AddScalar(batch_size_, &batch_size));
 
@@ -352,7 +353,7 @@
       Status Save(Iterator* iter, IteratorStateWriter* writer) {
         mutex_lock lock(mu_);
         if (input_impl_) {
-          TF_RETURN_IF_ERROR(iter->SaveParent(writer, input_impl_));
+          TF_RETURN_IF_ERROR(iter->SaveInput(writer, input_impl_));
         } else {
           TF_RETURN_IF_ERROR(
               writer->WriteScalar(iter->full_name("input_exhausted"), ""));
@@ -378,7 +379,7 @@
         } else {
           TF_RETURN_IF_ERROR(iter->dataset_input()->MakeIterator(
               ctx, iter->prefix(), &input_impl_));
-          TF_RETURN_IF_ERROR(iter->RestoreParent(ctx, reader, input_impl_));
+          TF_RETURN_IF_ERROR(iter->RestoreInput(ctx, reader, input_impl_));
         }
         entries_.clear();
         int64 entries_size = -1;
diff --git a/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc b/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
index 68ce324..5b051e0 100644
--- a/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
+++ b/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
@@ -54,10 +54,10 @@
   }
 
  private:
-  class Dataset : public GraphDatasetBase {
+  class Dataset : public DatasetBase {
    public:
     explicit Dataset(OpKernelContext* ctx, std::vector<Tensor> tensors)
-        : GraphDatasetBase(ctx), tensors_(std::move(tensors)) {
+        : DatasetBase(DatasetContext(ctx)), tensors_(std::move(tensors)) {
       for (const Tensor& t : tensors_) {
         dtypes_.push_back(t.dtype());
         gtl::InlinedVector<int64, 4> partial_dim_sizes;
@@ -86,7 +86,8 @@
     }
 
    protected:
-    Status AsGraphDefInternal(DatasetGraphDefBuilder* b,
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
                               Node** output) const override {
       std::vector<Node*> components;
       components.reserve(tensors_.size());
diff --git a/tensorflow/core/kernels/data/unbatch_dataset_op.cc b/tensorflow/core/kernels/data/unbatch_dataset_op.cc
index 2aec9fb..1a79f72 100644
--- a/tensorflow/core/kernels/data/unbatch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/unbatch_dataset_op.cc
@@ -35,10 +35,10 @@
   }
 
  private:
-  class Dataset : public GraphDatasetBase {
+  class Dataset : public DatasetBase {
    public:
     explicit Dataset(OpKernelContext* ctx, DatasetBase* input)
-        : GraphDatasetBase(ctx), input_(input) {
+        : DatasetBase(DatasetContext(ctx)), input_(input) {
       input_->Ref();
       for (const PartialTensorShape& shape : input->output_shapes()) {
         gtl::InlinedVector<int64, 4> partial_dim_sizes;
@@ -65,10 +65,11 @@
     string DebugString() const override { return "UnbatchDatasetOp::Dataset"; }
 
    protected:
-    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
                               Node** output) const override {
       Node* input_graph_node = nullptr;
-      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
       TF_RETURN_IF_ERROR(b->AddDataset(this, {input_graph_node}, output));
       return Status::OK();
     }
@@ -142,7 +143,7 @@
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
         if (input_impl_) {
-          TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+          TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
         } else {
           TF_RETURN_IF_ERROR(
               writer->WriteScalar(full_name("input_impl_empty"), ""));
@@ -164,7 +165,7 @@
                              IteratorStateReader* reader) override {
         mutex_lock l(mu_);
         if (!reader->Contains(full_name("input_impl_empty"))) {
-          TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+          TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
         } else {
           input_impl_.reset();
         }
diff --git a/tensorflow/core/kernels/data/window_dataset.cc b/tensorflow/core/kernels/data/window_dataset.cc
index 17551bc..0ab6bea 100644
--- a/tensorflow/core/kernels/data/window_dataset.cc
+++ b/tensorflow/core/kernels/data/window_dataset.cc
@@ -13,17 +13,18 @@
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/kernels/data/window_dataset.h"
+#include "tensorflow/core/lib/core/errors.h"
 
 namespace tensorflow {
 namespace {
 
-// TODO(b/110981596): Support checkpointing.
 class WindowDataset : public DatasetBase {
  public:
   WindowDataset(std::vector<std::vector<Tensor>> elements,
                 DataTypeVector output_types,
                 std::vector<PartialTensorShape> output_shapes)
-      : elements_(std::move(elements)),
+      : DatasetBase(DatasetContext({"Window"})),
+        elements_(std::move(elements)),
         output_types_(std::move(output_types)),
         output_shapes_(std::move(output_shapes)) {}
 
@@ -41,6 +42,15 @@
 
   string DebugString() const override { return "WindowDataset"; }
 
+ protected:
+  // TODO(b/110981596): Support checkpointing.
+  Status AsGraphDefInternal(SerializationContext* ctx,
+                            DatasetGraphDefBuilder* b,
+                            Node** output) const override {
+    return errors::Unimplemented("%s does not support serialization",
+                                 DebugString());
+  }
+
  private:
   class Iterator : public DatasetIterator<WindowDataset> {
    public:
diff --git a/tensorflow/core/kernels/data/window_dataset_op.cc b/tensorflow/core/kernels/data/window_dataset_op.cc
index 0283e56..41bf9d4 100644
--- a/tensorflow/core/kernels/data/window_dataset_op.cc
+++ b/tensorflow/core/kernels/data/window_dataset_op.cc
@@ -43,10 +43,12 @@
   }
 
  private:
-  class Dataset : public GraphDatasetBase {
+  class Dataset : public DatasetBase {
    public:
     Dataset(OpKernelContext* ctx, int64 window_size, const DatasetBase* input)
-        : GraphDatasetBase(ctx), window_size_(window_size), input_(input) {
+        : DatasetBase(DatasetContext(ctx)),
+          window_size_(window_size),
+          input_(input) {
       input_->Ref();
     }
 
@@ -74,10 +76,11 @@
     }
 
    protected:
-    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
                               Node** output) const override {
       Node* input_graph_node = nullptr;
-      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
       Node* window_size = nullptr;
       TF_RETURN_IF_ERROR(b->AddScalar(window_size_, &window_size));
       TF_RETURN_IF_ERROR(
@@ -162,7 +165,7 @@
           TF_RETURN_IF_ERROR(
               writer->WriteScalar(full_name("input_impl_empty"), ""));
         } else {
-          TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+          TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
         }
         return Status::OK();
       }
@@ -171,7 +174,7 @@
                              IteratorStateReader* reader) override {
         mutex_lock l(mu_);
         if (!reader->Contains(full_name("input_impl_empty"))) {
-          TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+          TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
         } else {
           input_impl_.reset();
         }
diff --git a/tensorflow/core/kernels/data/writer_ops.cc b/tensorflow/core/kernels/data/writer_ops.cc
index 80d9a5b..1c49874 100644
--- a/tensorflow/core/kernels/data/writer_ops.cc
+++ b/tensorflow/core/kernels/data/writer_ops.cc
@@ -70,20 +70,21 @@
       DatasetBase* dataset;
       OP_REQUIRES_OK_ASYNC(
           ctx, GetDatasetFromVariantTensor(ctx->input(0), &dataset), done);
-      IteratorContext iter_ctx = dataset::MakeIteratorContext(ctx);
       std::unique_ptr<IteratorBase> iterator;
       OP_REQUIRES_OK_ASYNC(
           ctx,
-          dataset->MakeIterator(&iter_ctx, "ToTFRecordOpIterator", &iterator),
+          dataset->MakeIterator(IteratorContext(ctx), "ToTFRecordOpIterator",
+                                &iterator),
           done);
 
       std::vector<Tensor> components;
       components.reserve(dataset->output_dtypes().size());
       bool end_of_sequence;
       do {
-        OP_REQUIRES_OK_ASYNC(
-            ctx, iterator->GetNext(&iter_ctx, &components, &end_of_sequence),
-            done);
+        OP_REQUIRES_OK_ASYNC(ctx,
+                             iterator->GetNext(IteratorContext(ctx),
+                                               &components, &end_of_sequence),
+                             done);
 
         if (!end_of_sequence) {
           OP_REQUIRES_OK_ASYNC(
diff --git a/tensorflow/core/kernels/data/zip_dataset_op.cc b/tensorflow/core/kernels/data/zip_dataset_op.cc
index 0070523..e430657 100644
--- a/tensorflow/core/kernels/data/zip_dataset_op.cc
+++ b/tensorflow/core/kernels/data/zip_dataset_op.cc
@@ -38,11 +38,11 @@
   }
 
  private:
-  class Dataset : public GraphDatasetBase {
+  class Dataset : public DatasetBase {
    public:
     explicit Dataset(OpKernelContext* ctx,
                      const std::vector<DatasetBase*>& inputs)
-        : GraphDatasetBase(ctx), inputs_(inputs) {
+        : DatasetBase(DatasetContext(ctx)), inputs_(inputs) {
       for (const auto& input : inputs_) {
         input->Ref();
         for (DataType dt : input->output_dtypes()) {
@@ -77,13 +77,14 @@
     string DebugString() const override { return "ZipDatasetOp::Dataset"; }
 
    protected:
-    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
                               Node** output) const override {
       std::vector<Node*> input_graph_nodes;
       input_graph_nodes.reserve(inputs_.size());
       for (const auto& input : inputs_) {
         Node* input_node;
-        TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input, &input_node));
+        TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input, &input_node));
         input_graph_nodes.emplace_back(input_node);
       }
       TF_RETURN_IF_ERROR(b->AddDataset(
@@ -142,7 +143,7 @@
               writer->WriteScalar(full_name("input_impls_empty"), ""));
         } else {
           for (auto& input_impl : input_impls_)
-            TF_RETURN_IF_ERROR(SaveParent(writer, input_impl));
+            TF_RETURN_IF_ERROR(SaveInput(writer, input_impl));
         }
         return Status::OK();
       }
@@ -155,7 +156,7 @@
         } else {
           DCHECK_EQ(input_impls_.size(), dataset()->inputs_.size());
           for (auto& input_impl : input_impls_)
-            TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl));
+            TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl));
         }
         return Status::OK();
       }
diff --git a/tensorflow/core/kernels/function_ops.cc b/tensorflow/core/kernels/function_ops.cc
index d5c33c0..bfdabc3 100644
--- a/tensorflow/core/kernels/function_ops.cc
+++ b/tensorflow/core/kernels/function_ops.cc
@@ -16,13 +16,13 @@
 #include <deque>
 #include <vector>
 
+#include "tensorflow/core/kernels/function_ops.h"
+
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/executor.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/memory_types.h"
-#include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/gradients.h"
@@ -33,64 +33,40 @@
 
 namespace tensorflow {
 
-static const char* const kArgOp = FunctionLibraryDefinition::kArgOp;
-static const char* const kRetOp = FunctionLibraryDefinition::kRetOp;
 static const char* const kGradientOp = FunctionLibraryDefinition::kGradientOp;
 
-class ArgOp : public OpKernel {
- public:
-  explicit ArgOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &dtype_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("index", &index_));
-  }
+ArgOp::ArgOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &dtype_));
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("index", &index_));
+}
 
-  void Compute(OpKernelContext* ctx) override {
-    auto frame = ctx->call_frame();
-    OP_REQUIRES(ctx, frame != nullptr, errors::Internal("no call frame"));
-    Tensor val;
-    OP_REQUIRES_OK(ctx, frame->GetArg(index_, &val));
-    OP_REQUIRES(ctx, val.dtype() == dtype_,
-                errors::InvalidArgument(
-                    "Type mismatch: actual ", DataTypeString(val.dtype()),
-                    " vs. expect ", DataTypeString(dtype_)));
-    ctx->set_output(0, val);
-  }
+void ArgOp::Compute(OpKernelContext* ctx) {
+  auto frame = ctx->call_frame();
+  OP_REQUIRES(ctx, frame != nullptr, errors::Internal("no call frame"));
+  Tensor val;
+  OP_REQUIRES_OK(ctx, frame->GetArg(index_, &val));
+  OP_REQUIRES(ctx, val.dtype() == dtype_,
+              errors::InvalidArgument("Type mismatch: actual ",
+                                      DataTypeString(val.dtype()),
+                                      " vs. expect ", DataTypeString(dtype_)));
+  ctx->set_output(0, val);
+}
 
-  bool IsExpensive() override { return false; }
+RetvalOp::RetvalOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &dtype_));
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("index", &index_));
+}
 
- private:
-  int index_;
-  DataType dtype_;
-
-  TF_DISALLOW_COPY_AND_ASSIGN(ArgOp);
-};
-
-class RetvalOp : public OpKernel {
- public:
-  explicit RetvalOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &dtype_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("index", &index_));
-  }
-
-  void Compute(OpKernelContext* ctx) override {
-    const Tensor& val = ctx->input(0);
-    OP_REQUIRES(ctx, val.dtype() == dtype_,
-                errors::InvalidArgument(
-                    "Type mismatch: actual ", DataTypeString(val.dtype()),
-                    " vs. expect ", DataTypeString(dtype_)));
-    auto frame = ctx->call_frame();
-    OP_REQUIRES(ctx, frame != nullptr, errors::Internal("no call frame"));
-    OP_REQUIRES_OK(ctx, frame->SetRetval(index_, val));
-  }
-
-  bool IsExpensive() override { return false; }
-
- private:
-  int index_;
-  DataType dtype_;
-
-  TF_DISALLOW_COPY_AND_ASSIGN(RetvalOp);
-};
+void RetvalOp::Compute(OpKernelContext* ctx) {
+  const Tensor& val = ctx->input(0);
+  OP_REQUIRES(ctx, val.dtype() == dtype_,
+              errors::InvalidArgument("Type mismatch: actual ",
+                                      DataTypeString(val.dtype()),
+                                      " vs. expect ", DataTypeString(dtype_)));
+  auto frame = ctx->call_frame();
+  OP_REQUIRES(ctx, frame != nullptr, errors::Internal("no call frame"));
+  OP_REQUIRES_OK(ctx, frame->SetRetval(index_, val));
+}
 
 REGISTER_SYSTEM_KERNEL_BUILDER(Name(kArgOp).Device(DEVICE_CPU), ArgOp);
 REGISTER_SYSTEM_KERNEL_BUILDER(Name(kRetOp).Device(DEVICE_CPU), RetvalOp);
@@ -304,123 +280,105 @@
 
 #endif  // TENSORFLOW_USE_SYCL
 
-class RemoteCallOp : public AsyncOpKernel {
- public:
-  explicit RemoteCallOp(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
-    OP_REQUIRES_OK(ctx,
-                   ctx->GetAttr(FunctionLibraryDefinition::kFuncAttr, &func_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("Tin", &input_dtypes_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("Tout", &output_dtypes_));
+RemoteCallOp::RemoteCallOp(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
+  OP_REQUIRES_OK(ctx,
+                 ctx->GetAttr(FunctionLibraryDefinition::kFuncAttr, &func_));
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("Tin", &input_dtypes_));
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("Tout", &output_dtypes_));
+}
+
+void RemoteCallOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) {
+  FunctionLibraryRuntime* lib = ctx->function_library();
+  OP_REQUIRES_ASYNC(ctx, lib != nullptr,
+                    errors::Internal("No function library is provided."), done);
+
+  const string& source_device = lib->device()->name();
+  const Tensor* target;
+  OP_REQUIRES_OK_ASYNC(ctx, ctx->input("target", &target), done);
+  string target_device;
+  OP_REQUIRES_OK_ASYNC(
+      ctx,
+      DeviceNameUtils::CanonicalizeDeviceName(target->scalar<string>()(),
+                                              source_device, &target_device),
+      done);
+
+  AttrValueMap attr_values = func_.attr();
+  FunctionLibraryRuntime::InstantiateOptions instantiate_opts;
+  instantiate_opts.target = target_device;
+
+  FunctionTarget function_target = {target_device, lib};
+
+  FunctionLibraryRuntime::Handle handle;
+  {
+    mutex_lock l(mu_);
+    auto cached_entry = handle_cache_.find(function_target);
+    if (cached_entry != handle_cache_.end()) {
+      handle = cached_entry->second;
+    } else {
+      VLOG(1) << "Instantiating " << func_.name() << " on " << target_device;
+      tracing::ScopedActivity activity(strings::StrCat(
+          "RemoteCall: Instantiate: ", func_.name(), " on ", target_device));
+      OP_REQUIRES_OK_ASYNC(
+          ctx,
+          lib->Instantiate(func_.name(), AttrSlice(&attr_values),
+                           instantiate_opts, &handle),
+          done);
+      auto insert_result = handle_cache_.insert({function_target, handle});
+      CHECK(insert_result.second) << "Insert unsuccessful.";
+      VLOG(1) << "Instantiated " << func_.name() << " on " << target_device
+              << ", resulting in handle: " << handle << " flr: " << lib;
+    }
   }
 
-  ~RemoteCallOp() override {}
+  OpInputList arguments;
+  OP_REQUIRES_OK_ASYNC(ctx, ctx->input_list("args", &arguments), done);
 
-  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
-    FunctionLibraryRuntime* lib = ctx->function_library();
-    OP_REQUIRES_ASYNC(ctx, lib != nullptr,
-                      errors::Internal("No function library is provided."),
-                      done);
-
-    const string& source_device = lib->device()->name();
-    const Tensor* target;
-    OP_REQUIRES_OK_ASYNC(ctx, ctx->input("target", &target), done);
-    string target_device;
-    OP_REQUIRES_OK_ASYNC(
-        ctx,
-        DeviceNameUtils::CanonicalizeDeviceName(target->scalar<string>()(),
-                                                source_device, &target_device),
-        done);
-
-    AttrValueMap attr_values = func_.attr();
-    FunctionLibraryRuntime::InstantiateOptions instantiate_opts;
-    instantiate_opts.target = target_device;
-
-    FunctionTarget function_target = {target_device, lib};
-
-    FunctionLibraryRuntime::Handle handle;
-    {
-      mutex_lock l(mu_);
-      auto cached_entry = handle_cache_.find(function_target);
-      if (cached_entry != handle_cache_.end()) {
-        handle = cached_entry->second;
-      } else {
-        VLOG(1) << "Instantiating " << func_.name() << " on " << target_device;
-        tracing::ScopedActivity activity(strings::StrCat(
-            "RemoteCall: Instantiate: ", func_.name(), " on ", target_device));
-        OP_REQUIRES_OK_ASYNC(
-            ctx,
-            lib->Instantiate(func_.name(), AttrSlice(&attr_values),
-                             instantiate_opts, &handle),
-            done);
-        auto insert_result = handle_cache_.insert({function_target, handle});
-        CHECK(insert_result.second) << "Insert unsuccessful.";
-        VLOG(1) << "Instantiated " << func_.name() << " on " << target_device
-                << ", resulting in handle: " << handle << " flr: " << lib;
-      }
+  FunctionLibraryRuntime::Options opts;
+  opts.step_id = ctx->step_id();
+  opts.runner = ctx->runner();
+  opts.source_device = source_device;
+  if (opts.source_device != target_device) {
+    opts.remote_execution = true;
+  }
+  opts.create_rendezvous = true;
+  std::vector<Tensor> args;
+  args.reserve(arguments.size());
+  for (const Tensor& argument : arguments) {
+    args.push_back(argument);
+  }
+  for (const auto& dtype : input_dtypes_) {
+    AllocatorAttributes arg_alloc_attrs;
+    if (DataTypeAlwaysOnHost(dtype)) {
+      arg_alloc_attrs.set_on_host(true);
     }
-
-    OpInputList arguments;
-    OP_REQUIRES_OK_ASYNC(ctx, ctx->input_list("args", &arguments), done);
-
-    FunctionLibraryRuntime::Options opts;
-    opts.step_id = ctx->step_id();
-    opts.runner = ctx->runner();
-    opts.source_device = source_device;
-    if (opts.source_device != target_device) {
-      opts.remote_execution = true;
+    opts.args_alloc_attrs.push_back(arg_alloc_attrs);
+  }
+  for (const auto& dtype : output_dtypes_) {
+    AllocatorAttributes ret_alloc_attrs;
+    if (DataTypeAlwaysOnHost(dtype)) {
+      ret_alloc_attrs.set_on_host(true);
     }
-    opts.create_rendezvous = true;
-    std::vector<Tensor> args;
-    args.reserve(arguments.size());
-    for (const Tensor& argument : arguments) {
-      args.push_back(argument);
-    }
-    for (const auto& dtype : input_dtypes_) {
-      AllocatorAttributes arg_alloc_attrs;
-      if (DataTypeAlwaysOnHost(dtype)) {
-        arg_alloc_attrs.set_on_host(true);
-      }
-      opts.args_alloc_attrs.push_back(arg_alloc_attrs);
-    }
-    for (const auto& dtype : output_dtypes_) {
-      AllocatorAttributes ret_alloc_attrs;
-      if (DataTypeAlwaysOnHost(dtype)) {
-        ret_alloc_attrs.set_on_host(true);
-      }
-      opts.rets_alloc_attrs.push_back(ret_alloc_attrs);
-    }
-    auto* rets = new std::vector<Tensor>;
-    auto* activity = new tracing::ScopedActivity(strings::StrCat(
-        "RemoteCall: Run: ", func_.name(), " on ", target_device));
-    VLOG(1) << "Running " << func_.name() << " on " << target_device
-            << " with handle: " << handle;
-    lib->Run(opts, handle, args, rets,
-             [rets, activity, done, ctx](const Status& status) {
-               if (!status.ok()) {
-                 ctx->SetStatus(status);
-               } else {
-                 for (size_t i = 0; i < rets->size(); ++i) {
-                   ctx->set_output(i, (*rets)[i]);
-                 }
+    opts.rets_alloc_attrs.push_back(ret_alloc_attrs);
+  }
+  auto* rets = new std::vector<Tensor>;
+  auto* activity = new tracing::ScopedActivity(strings::StrCat(
+      "RemoteCall: Run: ", func_.name(), " on ", target_device));
+  VLOG(1) << "Running " << func_.name() << " on " << target_device
+          << " with handle: " << handle;
+  lib->Run(opts, handle, args, rets,
+           [rets, activity, done, ctx](const Status& status) {
+             if (!status.ok()) {
+               ctx->SetStatus(status);
+             } else {
+               for (size_t i = 0; i < rets->size(); ++i) {
+                 ctx->set_output(i, (*rets)[i]);
                }
-               delete rets;
-               delete activity;
-               done();
-             });
-  }
-
- private:
-  NameAttrList func_;
-  DataTypeVector input_dtypes_;
-  DataTypeVector output_dtypes_;
-
-  mutex mu_;
-  typedef std::pair<string, FunctionLibraryRuntime*> FunctionTarget;
-  std::map<FunctionTarget, FunctionLibraryRuntime::Handle> handle_cache_
-      GUARDED_BY(mu_);
-
-  TF_DISALLOW_COPY_AND_ASSIGN(RemoteCallOp);
-};
+             }
+             delete rets;
+             delete activity;
+             done();
+           });
+}
 
 REGISTER_KERNEL_BUILDER(
     Name("RemoteCall").Device(DEVICE_CPU).HostMemory("target"), RemoteCallOp);
diff --git a/tensorflow/core/kernels/function_ops.h b/tensorflow/core/kernels/function_ops.h
new file mode 100644
index 0000000..9e88cc6
--- /dev/null
+++ b/tensorflow/core/kernels/function_ops.h
@@ -0,0 +1,79 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_FUNCTION_OPS_H_
+#define TENSORFLOW_CORE_KERNELS_FUNCTION_OPS_H_
+
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/op_kernel.h"
+
+namespace tensorflow {
+
+static const char* const kArgOp = FunctionLibraryDefinition::kArgOp;
+static const char* const kRetOp = FunctionLibraryDefinition::kRetOp;
+
+class ArgOp : public OpKernel {
+ public:
+  explicit ArgOp(OpKernelConstruction* ctx);
+
+  void Compute(OpKernelContext* ctx) override;
+
+  bool IsExpensive() override { return false; }
+
+ private:
+  int index_;
+  DataType dtype_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(ArgOp);
+};
+
+class RetvalOp : public OpKernel {
+ public:
+  explicit RetvalOp(OpKernelConstruction* ctx);
+
+  void Compute(OpKernelContext* ctx) override;
+
+  bool IsExpensive() override { return false; }
+
+ private:
+  int index_;
+  DataType dtype_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(RetvalOp);
+};
+
+class RemoteCallOp : public AsyncOpKernel {
+ public:
+  explicit RemoteCallOp(OpKernelConstruction* ctx);
+
+  ~RemoteCallOp() override {}
+
+  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override;
+
+ private:
+  NameAttrList func_;
+  DataTypeVector input_dtypes_;
+  DataTypeVector output_dtypes_;
+
+  mutex mu_;
+  typedef std::pair<string, FunctionLibraryRuntime*> FunctionTarget;
+  std::map<FunctionTarget, FunctionLibraryRuntime::Handle> handle_cache_
+      GUARDED_BY(mu_);
+
+  TF_DISALLOW_COPY_AND_ASSIGN(RemoteCallOp);
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CORE_KERNELS_FUNCTION_OPS_H_
diff --git a/tensorflow/core/kernels/functional_ops.cc b/tensorflow/core/kernels/functional_ops.cc
index 1c0abf2..1529d2e 100644
--- a/tensorflow/core/kernels/functional_ops.cc
+++ b/tensorflow/core/kernels/functional_ops.cc
@@ -218,6 +218,10 @@
 REGISTER_KERNEL_BUILDER(Name("If").Device(DEVICE_CPU), IfOp);
 REGISTER_KERNEL_BUILDER(Name("If").Device(DEVICE_GPU).HostMemory("cond"), IfOp);
 
+REGISTER_KERNEL_BUILDER(Name("StatelessIf").Device(DEVICE_CPU), IfOp);
+REGISTER_KERNEL_BUILDER(
+    Name("StatelessIf").Device(DEVICE_GPU).HostMemory("cond"), IfOp);
+
 class WhileOp : public AsyncOpKernel {
  public:
   explicit WhileOp(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
@@ -379,6 +383,9 @@
 REGISTER_KERNEL_BUILDER(Name("While").Device(DEVICE_CPU), WhileOp);
 REGISTER_KERNEL_BUILDER(Name("While").Device(DEVICE_GPU), WhileOp);
 
+REGISTER_KERNEL_BUILDER(Name("StatelessWhile").Device(DEVICE_CPU), WhileOp);
+REGISTER_KERNEL_BUILDER(Name("StatelessWhile").Device(DEVICE_GPU), WhileOp);
+
 Status GetScalar(OpKernelContext* ctx, int index, int32* value,
                  const char* label) {
   Tensor t = ctx->input(index);
diff --git a/tensorflow/core/kernels/gemm_functors.h b/tensorflow/core/kernels/gemm_functors.h
index 4b30c1f..1c80844 100644
--- a/tensorflow/core/kernels/gemm_functors.h
+++ b/tensorflow/core/kernels/gemm_functors.h
@@ -24,6 +24,9 @@
 #error "EIGEN_USE_THREADS must be enabled by all .cc files including this."
 #endif  // EIGEN_USE_THREADS
 
+#ifndef TENSORFLOW_CORE_KERNELS_GEMM_FUNCTORS_H_
+#define TENSORFLOW_CORE_KERNELS_GEMM_FUNCTORS_H_
+
 #include <string.h>
 #include <map>
 #include <vector>
@@ -116,3 +119,5 @@
   }
 };
 #endif  // USE_CBLAS_GEMM
+
+#endif  // TENSORFLOW_CORE_KERNELS_GEMM_FUNCTORS_H_
diff --git a/tensorflow/core/kernels/host_constant_op.cc b/tensorflow/core/kernels/host_constant_op.cc
new file mode 100644
index 0000000..d08a7c9
--- /dev/null
+++ b/tensorflow/core/kernels/host_constant_op.cc
@@ -0,0 +1,78 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/host_constant_op.h"
+
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/macros.h"
+
+namespace tensorflow {
+
+_HostConstantOp::_HostConstantOp(OpKernelConstruction* ctx)
+    : OpKernel(ctx), tensor_(ctx->output_type(0)) {
+  const TensorProto* proto = nullptr;
+  AllocatorAttributes alloc_attr;
+  alloc_attr.set_on_host(true);
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("value", &proto));
+  OP_REQUIRES_OK(
+      ctx, ctx->device()->MakeTensorFromProto(*proto, alloc_attr, &tensor_));
+  OP_REQUIRES(
+      ctx, ctx->output_type(0) == tensor_.dtype(),
+      errors::InvalidArgument("Type mismatch between value (",
+                              DataTypeString(tensor_.dtype()), ") and dtype (",
+                              DataTypeString(ctx->output_type(0)), ")"));
+}
+
+void _HostConstantOp::Compute(OpKernelContext* ctx) {
+  ctx->set_output(0, tensor_);
+}
+
+#if GOOGLE_CUDA
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Const")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("output")
+                            .TypeConstraint<int32>("dtype"),
+                        _HostConstantOp);
+#endif
+
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(Name("Const")
+                            .Device(DEVICE_SYCL)
+                            .HostMemory("output")
+                            .TypeConstraint<int32>("dtype"),
+                        _HostConstantOp);
+#endif  // TENSORFLOW_USE_SYCL
+
+// HostConst: forced to generate output on the host.
+// Only used in tests; no op for this kernel is registered
+// externally (i.e., in array_ops.cc).
+REGISTER_KERNEL_BUILDER(Name("HostConst").Device(DEVICE_CPU), _HostConstantOp);
+REGISTER_KERNEL_BUILDER(
+    Name("HostConst").Device(DEVICE_GPU).HostMemory("output"), _HostConstantOp);
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(
+    Name("HostConst").Device(DEVICE_SYCL).HostMemory("output"),
+    _HostConstantOp);
+#endif  // TENSORFLOW_USE_SYCL
+
+}  // end namespace tensorflow
+
diff --git a/tensorflow/core/kernels/host_constant_op.h b/tensorflow/core/kernels/host_constant_op.h
new file mode 100644
index 0000000..1b887ea
--- /dev/null
+++ b/tensorflow/core/kernels/host_constant_op.h
@@ -0,0 +1,42 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_HOST_CONSTANT_OP_H_
+#define TENSORFLOW_CORE_KERNELS_HOST_CONSTANT_OP_H_
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/platform/macros.h"
+
+namespace tensorflow {
+
+// HostConstantOp differs from ConstantOp in that its output is always
+// in host memory.
+class _HostConstantOp : public OpKernel {
+ public:
+  explicit _HostConstantOp(OpKernelConstruction* ctx);
+  void Compute(OpKernelContext* ctx) override;
+  bool IsExpensive() override { return false; }
+  ~_HostConstantOp() override {}
+
+ private:
+  Tensor tensor_;
+  TF_DISALLOW_COPY_AND_ASSIGN(_HostConstantOp);
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_HOST_CONSTANT_OP_H_
diff --git a/tensorflow/core/kernels/image_resizer_state.h b/tensorflow/core/kernels/image_resizer_state.h
index faf997b..8dcb597 100644
--- a/tensorflow/core/kernels/image_resizer_state.h
+++ b/tensorflow/core/kernels/image_resizer_state.h
@@ -142,7 +142,7 @@
     // always be a float.
     OP_REQUIRES(context, input.dtype() == DT_FLOAT,
                 errors::InvalidArgument("input_grad must be of type float",
-                                        input.dtype()));
+                                        DataTypeString(input.dtype())));
 
     OP_REQUIRES(context, original_image.dims() == 4,
                 errors::InvalidArgument("original_image must be 4-dimensional",
diff --git a/tensorflow/core/kernels/inplace_ops.cc b/tensorflow/core/kernels/inplace_ops.cc
index 8ddf3c3..2363fbc 100644
--- a/tensorflow/core/kernels/inplace_ops.cc
+++ b/tensorflow/core/kernels/inplace_ops.cc
@@ -55,7 +55,8 @@
     TF_CALL_variant(CASE);
 #undef CASE
     default:
-      return errors::InvalidArgument("Unsupported data type: ", value.dtype());
+      return errors::InvalidArgument("Unsupported data type: ",
+                                     DataTypeString(value.dtype()));
   }
 }
 
@@ -71,7 +72,8 @@
     TF_CALL_GPU_NUMBER_TYPES_NO_HALF(CASE);
 #undef CASE
     default:
-      return errors::InvalidArgument("Unsupported data type: ", value.dtype());
+      return errors::InvalidArgument("Unsupported data type: ",
+                                     DataTypeString(value.dtype()));
   }
 }
 #endif  // TENSORFLOW_USE_SYCL
@@ -347,7 +349,8 @@
     TF_CALL_NUMBER_TYPES(CASE);
 #undef CASE
     default:
-      return errors::InvalidArgument("Unsupported data type: ", v.dtype());
+      return errors::InvalidArgument("Unsupported data type: ",
+                                     DataTypeString(v.dtype()));
   }
   return Status::OK();
 }
@@ -415,7 +418,8 @@
     TF_CALL_bool(CASE);
 #undef CASE
     default:
-      return errors::InvalidArgument("Unsupported data type: ", x.dtype());
+      return errors::InvalidArgument("Unsupported data type: ",
+                                     DataTypeString(x.dtype()));
   }
   return Status::OK();
 }
diff --git a/tensorflow/core/kernels/inplace_ops_functor_gpu.cu.cc b/tensorflow/core/kernels/inplace_ops_functor_gpu.cu.cc
index f1616b1..9d20239 100644
--- a/tensorflow/core/kernels/inplace_ops_functor_gpu.cu.cc
+++ b/tensorflow/core/kernels/inplace_ops_functor_gpu.cu.cc
@@ -72,7 +72,8 @@
 // that CASE is not defined...hence the above construction
 #undef CASE
     default:
-      return errors::InvalidArgument("Unsupported data type: ", value.dtype());
+      return errors::InvalidArgument("Unsupported data type: ",
+                                     DataTypeString(value.dtype()));
   }
   return Status::OK();
 }
@@ -149,7 +150,8 @@
     CASE(int64)
 #undef CASE
     default:
-      return errors::InvalidArgument("Unsupported data type: ", v.dtype());
+      return errors::InvalidArgument("Unsupported data type: ",
+                                     DataTypeString(v.dtype()));
   }
   return Status::OK();
 }
@@ -169,7 +171,8 @@
     CASE(int64)
 #undef CASE
     default:
-      return errors::InvalidArgument("Unsupported dtype: ", x.dtype());
+      return errors::InvalidArgument("Unsupported dtype: ",
+                                     DataTypeString(x.dtype()));
   }
   return Status::OK();
 }
diff --git a/tensorflow/core/kernels/list_kernels.h b/tensorflow/core/kernels/list_kernels.h
index 42871c6..b3f74c0 100644
--- a/tensorflow/core/kernels/list_kernels.h
+++ b/tensorflow/core/kernels/list_kernels.h
@@ -261,14 +261,15 @@
         out_tensor.flat<dtype>().constant(dtype(0));             \
     break;
 
-      TF_CALL_NUMBER_TYPES(DTYPE_CASE)
+      TF_CALL_POD_TYPES(DTYPE_CASE)
 
 #undef DTYPE_CASE
       default:
         return errors::InvalidArgument(
-            "Trying to compute zeros_like for unsupported dtype",
-            out_tensor.dtype());
+            "Trying to compute zeros_like for unsupported dtype ",
+            DataTypeString(out_tensor.dtype()));
     }
+    y->tensors.emplace_back(out_tensor);
   }
   return Status::OK();
 }
diff --git a/tensorflow/core/kernels/lookup_table_op.cc b/tensorflow/core/kernels/lookup_table_op.cc
index 07e754a..2e8d9c6 100644
--- a/tensorflow/core/kernels/lookup_table_op.cc
+++ b/tensorflow/core/kernels/lookup_table_op.cc
@@ -341,7 +341,7 @@
 
   Status Find(OpKernelContext* ctx, const Tensor& key, Tensor* value,
               const Tensor& default_value) override LOCKS_EXCLUDED(mu_) {
-    const int64 num_elements = key.dim_size(0);
+    const int64 num_elements = (key.dims() == 0) ? 1 : key.dim_size(0);
     const int64 key_size = key_shape_.num_elements();
     const int64 value_size = value_shape_.num_elements();
     if (key.NumElements() != num_elements * key_size) {
@@ -403,8 +403,9 @@
 
   Status Insert(OpKernelContext* ctx, const Tensor& key,
                 const Tensor& value) override LOCKS_EXCLUDED(mu_) {
-    if (key.NumElements() != key.dim_size(0) * key_shape_.num_elements()) {
-      TensorShape expected_shape({key.dim_size(0)});
+    const int64 batch_size = (key.dims() == 0) ? 1 : key.dim_size(0);
+    if (key.NumElements() != batch_size * key_shape_.num_elements()) {
+      TensorShape expected_shape({batch_size});
       expected_shape.AppendShape(key_shape_);
       return errors::InvalidArgument("Expected key shape ",
                                      expected_shape.DebugString(), " got ",
@@ -415,7 +416,7 @@
     // rather than updates. That means we may grow the table even though we
     // don't need to. As long as the number of keys inserted in one call is
     // small compared to the size of the map, the impact of this is minimal.
-    const int64 pending_num_entries = num_entries_ + key.dim_size(0);
+    const int64 pending_num_entries = num_entries_ + batch_size;
     if (pending_num_entries > num_buckets_ * max_load_factor_) {
       int64 new_num_buckets = num_buckets_;
       do {
@@ -500,7 +501,7 @@
  private:
   Status DoInsert(OpKernelContext* ctx, const Tensor& key, const Tensor& value,
                   bool ignore_empty_key) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-    const int64 num_elements = key.dim_size(0);
+    const int64 num_elements = (key.dims() == 0) ? 1 : key.dim_size(0);
     const int64 value_size = value_shape_.num_elements();
     const int64 key_size = key_shape_.num_elements();
     const auto key_matrix = key.shaped<K, 2>({num_elements, key_size});
@@ -812,17 +813,21 @@
       LookupTableOp<lookup::HashTable<key_dtype, value_dtype>, key_dtype, \
                     value_dtype>)
 
+REGISTER_KERNEL(int32, double);
+REGISTER_KERNEL(int32, float);
+REGISTER_KERNEL(int32, int32);
+REGISTER_KERNEL(int32, string);
+REGISTER_KERNEL(int64, double);
+REGISTER_KERNEL(int64, float);
+REGISTER_KERNEL(int64, int32);
+REGISTER_KERNEL(int64, int64);
+REGISTER_KERNEL(int64, string);
+REGISTER_KERNEL(string, bool);
 REGISTER_KERNEL(string, double);
 REGISTER_KERNEL(string, float);
 REGISTER_KERNEL(string, int32);
 REGISTER_KERNEL(string, int64);
-REGISTER_KERNEL(int64, string);
-REGISTER_KERNEL(int64, int64);
-REGISTER_KERNEL(int64, float);
 REGISTER_KERNEL(string, string);
-REGISTER_KERNEL(string, bool);
-REGISTER_KERNEL(int32, int32);
-REGISTER_KERNEL(int32, string);
 
 #undef REGISTER_KERNEL
 
@@ -843,12 +848,20 @@
       LookupTableOp<lookup::MutableHashTableOfScalars<key_dtype, value_dtype>, \
                     key_dtype, value_dtype>)
 
-REGISTER_KERNEL(string, float);
-REGISTER_KERNEL(string, int64);
-REGISTER_KERNEL(int64, string);
-REGISTER_KERNEL(string, bool);
+REGISTER_KERNEL(int32, double);
+REGISTER_KERNEL(int32, float);
+REGISTER_KERNEL(int32, int32);
+REGISTER_KERNEL(int64, double);
 REGISTER_KERNEL(int64, float);
+REGISTER_KERNEL(int64, int32);
+REGISTER_KERNEL(int64, int64);
+REGISTER_KERNEL(int64, string);
 REGISTER_KERNEL(int64, Variant);
+REGISTER_KERNEL(string, bool);
+REGISTER_KERNEL(string, double);
+REGISTER_KERNEL(string, float);
+REGISTER_KERNEL(string, int32);
+REGISTER_KERNEL(string, int64);
 
 #undef REGISTER_KERNEL
 
@@ -869,10 +882,19 @@
       LookupTableOp<lookup::MutableHashTableOfTensors<key_dtype, value_dtype>, \
                     key_dtype, value_dtype>)
 
-REGISTER_KERNEL(string, float);
-REGISTER_KERNEL(string, int64);
+REGISTER_KERNEL(int32, double);
+REGISTER_KERNEL(int32, float);
+REGISTER_KERNEL(int32, int32);
+REGISTER_KERNEL(int64, double);
+REGISTER_KERNEL(int64, float);
+REGISTER_KERNEL(int64, int32);
+REGISTER_KERNEL(int64, int64);
 REGISTER_KERNEL(int64, string);
 REGISTER_KERNEL(string, bool);
+REGISTER_KERNEL(string, double);
+REGISTER_KERNEL(string, float);
+REGISTER_KERNEL(string, int32);
+REGISTER_KERNEL(string, int64);
 
 #undef REGISTER_KERNEL
 
@@ -893,13 +915,20 @@
       LookupTableOp<lookup::MutableDenseHashTable<key_dtype, value_dtype>, \
                     key_dtype, value_dtype>)
 
-REGISTER_KERNEL(int64, int64);
-REGISTER_KERNEL(int64, float);
-REGISTER_KERNEL(int64, double);
-REGISTER_KERNEL(string, float);
-REGISTER_KERNEL(string, bool);
+REGISTER_KERNEL(int32, double);
+REGISTER_KERNEL(int32, float);
+REGISTER_KERNEL(int32, int32);
 REGISTER_KERNEL(int64, bool);
+REGISTER_KERNEL(int64, double);
+REGISTER_KERNEL(int64, float);
+REGISTER_KERNEL(int64, int32);
+REGISTER_KERNEL(int64, int64);
 REGISTER_KERNEL(int64, Variant);
+REGISTER_KERNEL(string, bool);
+REGISTER_KERNEL(string, double);
+REGISTER_KERNEL(string, float);
+REGISTER_KERNEL(string, int32);
+REGISTER_KERNEL(string, int64);
 
 #undef REGISTER_KERNEL
 
diff --git a/tensorflow/core/kernels/lookup_util.cc b/tensorflow/core/kernels/lookup_util.cc
index 77386a1..30fe4b0 100644
--- a/tensorflow/core/kernels/lookup_util.cc
+++ b/tensorflow/core/kernels/lookup_util.cc
@@ -242,7 +242,8 @@
         break;
       default:
         valid_ = false;
-        return errors::InvalidArgument("Data type ", dtype, " not supported.");
+        return errors::InvalidArgument("Data type ", DataTypeString(dtype),
+                                       " not supported.");
     }
     return Status::OK();
   }
@@ -326,8 +327,10 @@
                            DataType value_dtype, const string& table_name) {
   if (table.key_dtype() != key_dtype || table.value_dtype() != value_dtype) {
     return errors::InvalidArgument(
-        "Conflicting key/value dtypes ", key_dtype, "->", value_dtype, " with ",
-        table.key_dtype(), "-", table.value_dtype(), " for table ", table_name);
+        "Conflicting key/value dtypes ", DataTypeString(key_dtype), "->",
+        DataTypeString(value_dtype), " with ",
+        DataTypeString(table.key_dtype()), "-",
+        DataTypeString(table.value_dtype()), " for table ", table_name);
   }
   return Status::OK();
 }
@@ -340,7 +343,7 @@
   if (key_index == kLineNumber && table->key_dtype() != DT_INT64) {
     return errors::InvalidArgument(
         "Key index for line number requires table key dtype of int64, got ",
-        table->key_dtype());
+        DataTypeString(table->key_dtype()));
   }
   const DataType& key_dtype = table->key_dtype();
   const DataType& value_dtype = table->value_dtype();
@@ -348,17 +351,17 @@
       key_dtype != DT_STRING) {
     return errors::InvalidArgument(
         "Key index for whole line requires string or integer table key, got ",
-        table->key_dtype());
+        DataTypeString(table->key_dtype()));
   }
   if (value_index == kLineNumber && value_dtype != DT_INT64) {
     return errors::InvalidArgument(
         "Value index for line number requires table value dtype of int64, got ",
-        table->value_dtype());
+        DataTypeString(table->value_dtype()));
   }
   if (value_index == kWholeLine && value_dtype != DT_STRING) {
     return errors::InvalidArgument(
         "Value index for whole line requires table value dtype of string, got ",
-        table->value_dtype());
+        DataTypeString(table->value_dtype()));
   }
 
   TextFileLineIterator iter;
diff --git a/tensorflow/core/kernels/matmul_op.cc b/tensorflow/core/kernels/matmul_op.cc
index 80376c6..79967aa 100644
--- a/tensorflow/core/kernels/matmul_op.cc
+++ b/tensorflow/core/kernels/matmul_op.cc
@@ -578,25 +578,41 @@
                               .Label("cublas"),                    \
                           MatMulOp<GPUDevice, T, true /* cublas */>)
 
-#if defined(INTEL_MKL) && !defined(DO_NOT_USE_ML)
+#if defined(INTEL_MKL)
 
-// MKL does not support half and int32 types for matrix-multiplication, so
-// register the kernel to use default Eigen based implementations for these
-// types. Registration for NO-LABEL version is in mkl_matmul_op.cc
-TF_CALL_float(REGISTER_CPU_EIGEN);
-TF_CALL_double(REGISTER_CPU_EIGEN);
+// MKL does not support the half, bfloat16 and int32 types for
+// matrix multiplication, so register the kernel to use the default Eigen-based
+// implementations for these types. REGISTER_CPU defines two versions: the
+// Eigen label and NO-LABEL.
 TF_CALL_half(REGISTER_CPU);
 TF_CALL_bfloat16(REGISTER_CPU);
-
 TF_CALL_int32(REGISTER_CPU);
+
+// Float is supported in both MKL DNN and MKL ML.
+// Registration for the NO-LABEL version is in mkl_matmul_op.cc for types
+// supported by MKL. However, we define the Eigen-label version here just to
+// pass a few unit tests.
+TF_CALL_float(REGISTER_CPU_EIGEN);
+
+// MKL DNN does not support complex64/complex128/double. If the user specifies
+// using only the open-source MKL DNN, then use the default implementation for
+// these types; otherwise use GEMM from the MKL ML binary.
+
+#if defined(INTEL_MKL_DNN_ONLY)
+TF_CALL_complex64(REGISTER_CPU);
+TF_CALL_complex128(REGISTER_CPU);
+TF_CALL_double(REGISTER_CPU);
+#else  // INTEL_MKL_DNN_ONLY
 TF_CALL_complex64(REGISTER_CPU_EIGEN);
 TF_CALL_complex128(REGISTER_CPU_EIGEN);
-#else
+TF_CALL_double(REGISTER_CPU_EIGEN);
+#endif
+
+#else  // INTEL MKL
 TF_CALL_float(REGISTER_CPU);
 TF_CALL_double(REGISTER_CPU);
 TF_CALL_half(REGISTER_CPU);
 TF_CALL_bfloat16(REGISTER_CPU);
-
 TF_CALL_int32(REGISTER_CPU);
 TF_CALL_complex64(REGISTER_CPU);
 TF_CALL_complex128(REGISTER_CPU);
diff --git a/tensorflow/core/kernels/matrix_solve_ls_op_impl.h b/tensorflow/core/kernels/matrix_solve_ls_op_impl.h
index 0e09078..00a05a8 100644
--- a/tensorflow/core/kernels/matrix_solve_ls_op_impl.h
+++ b/tensorflow/core/kernels/matrix_solve_ls_op_impl.h
@@ -13,6 +13,9 @@
 limitations under the License.
 ==============================================================================*/
 
+#ifndef TENSORFLOW_CORE_KERNELS_MATRIX_SOLVE_LS_OP_IMPL_H_
+#define TENSORFLOW_CORE_KERNELS_MATRIX_SOLVE_LS_OP_IMPL_H_
+
 // See docs in ../ops/linalg_ops.cc.
 
 #include "third_party/eigen3/Eigen/Cholesky"
@@ -159,3 +162,5 @@
 };
 
 }  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_MATRIX_SOLVE_LS_OP_IMPL_H_
diff --git a/tensorflow/core/kernels/mkl_aggregate_ops.cc b/tensorflow/core/kernels/mkl_aggregate_ops.cc
index 3d04aee..28edf51 100644
--- a/tensorflow/core/kernels/mkl_aggregate_ops.cc
+++ b/tensorflow/core/kernels/mkl_aggregate_ops.cc
@@ -24,8 +24,7 @@
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/platform/logging.h"
 
-
-#ifndef INTEL_MKL_ML
+#ifndef INTEL_MKL_ML_ONLY
 #include "mkldnn.hpp"
 using mkldnn::stream;
 using mkldnn::sum;
@@ -38,7 +37,7 @@
 namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
-#ifdef INTEL_MKL_ML
+#ifdef INTEL_MKL_ML_ONLY
 
 template <typename Device, typename T>
 class MklAddNOp : public OpKernel {
@@ -286,7 +285,7 @@
   } MklAddNOpContext;
 };
 
-#else  // INTEL_MKL_ML
+#else  // INTEL_MKL_ML_ONLY
 template <typename Device, typename T>
 class MklAddNOp : public OpKernel {
  public:
diff --git a/tensorflow/core/kernels/mkl_avgpooling_op.cc b/tensorflow/core/kernels/mkl_avgpooling_op.cc
index d3566c2..969baec 100644
--- a/tensorflow/core/kernels/mkl_avgpooling_op.cc
+++ b/tensorflow/core/kernels/mkl_avgpooling_op.cc
@@ -24,7 +24,7 @@
 
 #include "tensorflow/core/kernels/mkl_pooling_ops_common.h"
 
-#ifndef INTEL_MKL_ML
+#ifndef INTEL_MKL_ML_ONLY
 #include "mkldnn.hpp"
 using mkldnn::algorithm;
 using mkldnn::engine;
@@ -40,7 +40,7 @@
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
-#ifdef INTEL_MKL_ML
+#ifdef INTEL_MKL_ML_ONLY
 
 template <typename Device, typename T>
 class MklAvgPoolingOp : public OpKernel {
@@ -664,7 +664,7 @@
   }
 };  // MklAvgPoolingGradOp
 
-#endif  // INTEL_MKL_ML
+#endif  // INTEL_MKL_ML_ONLY
 
 REGISTER_KERNEL_BUILDER(Name("_MklAvgPool")
                             .Device(DEVICE_CPU)
diff --git a/tensorflow/core/kernels/mkl_batch_matmul_op.cc b/tensorflow/core/kernels/mkl_batch_matmul_op.cc
index 45328b0..0841395 100644
--- a/tensorflow/core/kernels/mkl_batch_matmul_op.cc
+++ b/tensorflow/core/kernels/mkl_batch_matmul_op.cc
@@ -25,7 +25,7 @@
 
 #define EIGEN_USE_THREADS
 
-#if defined(INTEL_MKL) && !defined(DO_NOT_USE_ML)
+#if defined(INTEL_MKL) && !defined(INTEL_MKL_DNN_ONLY)
 #include <vector>
 #include "mkl_cblas.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
diff --git a/tensorflow/core/kernels/mkl_concat_op.cc b/tensorflow/core/kernels/mkl_concat_op.cc
index d8efb1b..8ad7ebb 100644
--- a/tensorflow/core/kernels/mkl_concat_op.cc
+++ b/tensorflow/core/kernels/mkl_concat_op.cc
@@ -27,8 +27,7 @@
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
 
-
-#ifndef INTEL_MKL_ML
+#ifndef INTEL_MKL_ML_ONLY
 #include "mkldnn.hpp"
 
 using mkldnn::concat;
@@ -64,7 +63,7 @@
   // we need to have empty Compute because Compute is pure virtual function.
   void Compute(OpKernelContext* c) {}
 
-#ifdef INTEL_MKL_ML
+#ifdef INTEL_MKL_ML_ONLY
 
   void Compute(OpKernelContext* c, const std::vector<Tensor>& values) {
     const Tensor* concat_dim_tensor;
@@ -232,7 +231,7 @@
 #endif
 };
 
-#ifdef INTEL_MKL_ML
+#ifdef INTEL_MKL_ML_ONLY
 
 // --------------------------------------------------------------------------
 //                      Mkl Concat Op
diff --git a/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc
index f857be6..7c687f6 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc
@@ -18,7 +18,7 @@
 // bias.
 
 #ifdef INTEL_MKL
-#ifdef INTEL_MKL_ML
+#ifdef INTEL_MKL_ML_ONLY
 
 #define USE_EIGEN_TENSOR
 #define EIGEN_USE_THREADS
@@ -39,7 +39,7 @@
 #include "tensorflow/core/util/use_cudnn.h"
 #include "tensorflow/core/util/work_sharder.h"
 
-#ifdef INTEL_MKL_ML
+#ifdef INTEL_MKL_ML_ONLY
 #include "mkl_dnn.h"
 #include "mkl_dnn_types.h"
 #endif
@@ -265,5 +265,5 @@
 TF_CALL_float(REGISTER_CPU_KERNELS);
 #undef REGISTER_CPU_KERNELS
 } /* namespace tensorflow */
-#endif /* INTEL_MKL_ML */
+#endif /* INTEL_MKL_ML_ONLY */
 #endif /* INTEL_MKL */
diff --git a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
index b73a119..afbfaa8 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
@@ -38,8 +38,7 @@
 #include "tensorflow/core/util/use_cudnn.h"
 #include "tensorflow/core/util/work_sharder.h"
 
-
-#ifndef INTEL_MKL_ML
+#ifndef INTEL_MKL_ML_ONLY
 #include "mkldnn.hpp"
 
 using mkldnn::convolution_backward_weights;
@@ -56,7 +55,7 @@
 namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
-#ifndef INTEL_MKL_ML
+#ifndef INTEL_MKL_ML_ONLY
 
 struct MklConvBwdFilterParams {
   memory::dims src_dims;
@@ -83,11 +82,11 @@
 };
 
 template <typename T>
-class MklConv2DBwdFilterPrimitive : public MklPrimitive {
+class MklConvBwdFilterPrimitive : public MklPrimitive {
  public:
-  explicit MklConv2DBwdFilterPrimitive(
-      const MklConvBwdFilterParams& convBwdFilterDims) :
-            cpu_engine_(engine::cpu, 0) {
+  explicit MklConvBwdFilterPrimitive(
+      const MklConvBwdFilterParams& convBwdFilterDims)
+      : cpu_engine_(engine::cpu, 0) {
     context_.bwd_filter_stream.reset(new stream(stream::kind::eager));
     // create conv primitive
     if (context_.conv_bwd_filter == nullptr) {
@@ -95,7 +94,7 @@
     }
   }
 
-  ~MklConv2DBwdFilterPrimitive() {}
+  ~MklConvBwdFilterPrimitive() {}
 
   // Convolution backward weights with bias
   //   src_data:         input data buffer of src
@@ -298,38 +297,36 @@
 };
 
 template <typename T>
-class MklConv2DBwdFilterPrimitiveFactory : public MklPrimitiveFactory<T> {
+class MklConvBwdFilterPrimitiveFactory : public MklPrimitiveFactory<T> {
  public:
-  static MklConv2DBwdFilterPrimitive<T>* Get(
+  static MklConvBwdFilterPrimitive<T>* Get(
       const MklConvBwdFilterParams& convBwdFilterDims) {
-    MklConv2DBwdFilterPrimitive<T>* conv2d_bwd_filter = nullptr;
+    MklConvBwdFilterPrimitive<T>* conv_bwd_filter = nullptr;
 
     // look into the pool for reusable primitive
-    conv2d_bwd_filter = dynamic_cast<MklConv2DBwdFilterPrimitive<T>*> (
-      MklConv2DBwdFilterPrimitiveFactory<T>::GetInstance().GetConv2dBwdFilter(
-      convBwdFilterDims));
+    conv_bwd_filter = dynamic_cast<MklConvBwdFilterPrimitive<T>*>(
+        MklConvBwdFilterPrimitiveFactory<T>::GetInstance().GetConvBwdFilter(
+            convBwdFilterDims));
 
-    if (conv2d_bwd_filter == nullptr) {
-        conv2d_bwd_filter = new MklConv2DBwdFilterPrimitive<T>(
-            convBwdFilterDims);
-        MklConv2DBwdFilterPrimitiveFactory<T>::GetInstance().SetConv2dBwdFilter(
-            convBwdFilterDims, conv2d_bwd_filter);
+    if (conv_bwd_filter == nullptr) {
+      conv_bwd_filter = new MklConvBwdFilterPrimitive<T>(convBwdFilterDims);
+      MklConvBwdFilterPrimitiveFactory<T>::GetInstance().SetConvBwdFilter(
+          convBwdFilterDims, conv_bwd_filter);
     }
-    return conv2d_bwd_filter;
+    return conv_bwd_filter;
   }
 
-
  private:
-  MklConv2DBwdFilterPrimitiveFactory() {}
-  ~MklConv2DBwdFilterPrimitiveFactory() {}
+  MklConvBwdFilterPrimitiveFactory() {}
+  ~MklConvBwdFilterPrimitiveFactory() {}
 
-  static MklConv2DBwdFilterPrimitiveFactory& GetInstance() {
-    static MklConv2DBwdFilterPrimitiveFactory instance_;
+  static MklConvBwdFilterPrimitiveFactory& GetInstance() {
+    static MklConvBwdFilterPrimitiveFactory instance_;
     return instance_;
   }
 
   static string CreateKey(const MklConvBwdFilterParams& convBwdFilterDims) {
-    string prefix = "conv2d_bwd_filter";
+    string prefix = "conv_bwd_filter";
     FactoryKeyCreator key_creator;
     key_creator.AddAsKey(prefix);
     key_creator.AddAsKey(convBwdFilterDims.src_dims);
@@ -343,14 +340,14 @@
     return key_creator.GetKey();
   }
 
-  MklPrimitive* GetConv2dBwdFilter(
+  MklPrimitive* GetConvBwdFilter(
       const MklConvBwdFilterParams& convBwdFilterDims) {
     string key = CreateKey(convBwdFilterDims);
     return this->GetOp(key);
   }
 
-  void SetConv2dBwdFilter(
-      const MklConvBwdFilterParams& convBwdFilterDims, MklPrimitive* op) {
+  void SetConvBwdFilter(const MklConvBwdFilterParams& convBwdFilterDims,
+                        MklPrimitive* op) {
     string key = CreateKey(convBwdFilterDims);
     this->SetOp(key, op);
   }
@@ -358,7 +355,7 @@
 
 #endif
 
-#ifdef INTEL_MKL_ML
+#ifdef INTEL_MKL_ML_ONLY
 
 template <typename Device, class T>
 class MklConv2DCustomBackpropFilterOp : public OpKernel {
@@ -739,14 +736,13 @@
 #else
 
 template <typename Device, class T, bool biasEnabled>
-class MklConv2DCustomBackpropFilterOp
-    : public MklConv2DBackpropCommonOp<Device, T> {
+class MklConvCustomBackpropFilterOp
+    : public MklConvBackpropCommonOp<Device, T> {
  public:
-  explicit MklConv2DCustomBackpropFilterOp(OpKernelConstruction* context)
-      : MklConv2DBackpropCommonOp<Device, T>(context) {
-  }
+  explicit MklConvCustomBackpropFilterOp(OpKernelConstruction* context)
+      : MklConvBackpropCommonOp<Device, T>(context) {}
 
-  ~MklConv2DCustomBackpropFilterOp() {}
+  ~MklConvCustomBackpropFilterOp() {}
 
   void Compute(OpKernelContext* context) {
     try {
@@ -754,6 +750,9 @@
       MklDnnData<T> diff_dst(&cpu_engine_);
       MklDnnData<T> diff_filter(&cpu_engine_);  // output
 
+      // This flag indicates Conv2D or Conv3D
+      bool isConv2D = (this->strides_.size() == 4);
+
       // Input tensors
       const int kInputIdx = 0, kFilterIdx = 1, kOutbpropIdx = 2;
       const Tensor& src_tensor = MklGetInput(context, kInputIdx);
@@ -814,7 +813,10 @@
           &fwd_dst_dims, &padding_left, &padding_right);
       if (!context->status().ok()) return;
 
-      auto tf_fmt = TFDataFormatToMklDnnDataFormat(this->data_format_);
+      auto tf_fmt = isConv2D
+                        ? TFDataFormatToMklDnnDataFormat(this->data_format_)
+                        : TFDataFormatToMklDnn3DDataFormat(this->data_format_);
+
       auto fwd_src_md =
           src_mkl_shape.IsMklTensor()
               ? src_mkl_shape.GetMklLayout()
@@ -833,21 +835,19 @@
       if (biasEnabled) {
         TensorShape obp_tf_shape = GetTfShape(context, 2);
         depth = (this->data_format_ == FORMAT_NCHW)
-              ? obp_tf_shape.dim_size(1)
-              : obp_tf_shape.dim_size(3);
+                    ? obp_tf_shape.dim_size(1)
+                    : obp_tf_shape.dim_size(isConv2D ? 3 : 4);
         diff_bias_dims = {static_cast<int>(depth)};
       }
+      for (int i = 0; i < dilations.size(); i++) dilations[i] -= 1;
 
-      dilations[kDilationH] -= 1;
-      dilations[kDilationW] -= 1;
-
-      MklConv2DBwdFilterPrimitive<T> *conv2d_bwd_filter = nullptr;
+      MklConvBwdFilterPrimitive<T>* conv_bwd_filter = nullptr;
       MklConvBwdFilterParams convBwdFilterDims(fwd_src_dims, fwd_filter_dims,
           diff_bias_dims, diff_dst_dims, strides, dilations, padding_left,
           padding_right, TFPaddingToMklDnnPadding(this->padding_));
-      conv2d_bwd_filter = MklConv2DBwdFilterPrimitiveFactory<T>::Get(
-          convBwdFilterDims);
-      auto bwd_filter_pd = conv2d_bwd_filter->GetPrimitiveDesc();
+      conv_bwd_filter =
+          MklConvBwdFilterPrimitiveFactory<T>::Get(convBwdFilterDims);
+      auto bwd_filter_pd = conv_bwd_filter->GetPrimitiveDesc();
 
       // allocate output tensors: diff_fitler and diff_bias (w bias)
       auto bwd_output_dims = GetOutputDims(fwd_src_dims, fwd_filter_dims);
@@ -855,14 +855,26 @@
       // diff_filter
       MklDnnShape diff_filter_mkl_shape;
       diff_filter_mkl_shape.SetMklTensor(false);
-      // output_dims_mkl_order is in OIHW format.
-      TensorShape diff_filter_tf_shape(
-                    {bwd_output_dims[MklDnnDims::Dim_H],
-                     bwd_output_dims[MklDnnDims::Dim_W],
-                     bwd_output_dims[MklDnnDims::Dim_I],
-                     bwd_output_dims[MklDnnDims::Dim_O]});
-      AllocateOutputSetMklShape(context, 0, &diff_filter_tensor,
-                diff_filter_tf_shape, diff_filter_mkl_shape);
+
+      if (isConv2D) {
+        // Conv2D: output_dims_mkl_order is in OIHW format.
+        TensorShape diff_filter_tf_shape({bwd_output_dims[MklDnnDims::Dim_H],
+                                          bwd_output_dims[MklDnnDims::Dim_W],
+                                          bwd_output_dims[MklDnnDims::Dim_I],
+                                          bwd_output_dims[MklDnnDims::Dim_O]});
+        AllocateOutputSetMklShape(context, 0, &diff_filter_tensor,
+                                  diff_filter_tf_shape, diff_filter_mkl_shape);
+      } else {
+        // Conv3D: output_dims_mkl_order is in OIDHW format.
+        TensorShape diff_filter_tf_shape(
+            {bwd_output_dims[MklDnnDims3D::Dim3d_D],
+             bwd_output_dims[MklDnnDims3D::Dim3d_H],
+             bwd_output_dims[MklDnnDims3D::Dim3d_W],
+             bwd_output_dims[MklDnnDims3D::Dim3d_I],
+             bwd_output_dims[MklDnnDims3D::Dim3d_O]});
+        AllocateOutputSetMklShape(context, 0, &diff_filter_tensor,
+                                  diff_filter_tf_shape, diff_filter_mkl_shape);
+      }
 
       Tensor* diff_bias_tensor = nullptr;
       if (biasEnabled) {
@@ -872,7 +884,7 @@
 
       // check if src and diff_dst need reorder
       T *src_data = nullptr;
-      if (fwd_src_md.data.format != conv2d_bwd_filter->GetSrcMemoryFormat()) {
+      if (fwd_src_md.data.format != conv_bwd_filter->GetSrcMemoryFormat()) {
         src.SetUsrMem(fwd_src_md, &src_tensor);
         src.CheckReorderToOpMem(bwd_filter_pd->src_primitive_desc());
         src_data = static_cast<T*>(src.GetOpMem().get_data_handle());
@@ -883,7 +895,7 @@
 
       T *diff_dst_data = nullptr;
       if (diff_dst_md.data.format !=
-          conv2d_bwd_filter->GetDiffDstMemoryFormat()) {
+          conv_bwd_filter->GetDiffDstMemoryFormat()) {
         diff_dst.SetUsrMem(diff_dst_md, &diff_dst_tensor);
         diff_dst.CheckReorderToOpMem(bwd_filter_pd->diff_dst_primitive_desc());
         diff_dst_data = static_cast<T*>(
@@ -898,7 +910,7 @@
       bool diff_filter_reorder_required = false;
       T *diff_filter_data = nullptr;
       if (GetOutputFormat(tf_fmt) !=
-          conv2d_bwd_filter->GetDiffFilterMemoryFormat()) {
+          conv_bwd_filter->GetDiffFilterMemoryFormat()) {
         // Allocate diff filter tensor as Tensorflow layout
         diff_filter.SetUsrMem(bwd_output_dims, GetOutputFormat(tf_fmt),
                               diff_filter_tensor);
@@ -916,10 +928,10 @@
       if (biasEnabled) {
         T* diff_bias_data = static_cast<T*>(const_cast<T*>(
                          diff_bias_tensor->flat<T>().data()));
-        conv2d_bwd_filter->Execute(src_data, diff_filter_data,
-                               diff_bias_data, diff_dst_data);
+        conv_bwd_filter->Execute(src_data, diff_filter_data, diff_bias_data,
+                                 diff_dst_data);
       } else {
-        conv2d_bwd_filter->Execute(src_data, diff_filter_data, diff_dst_data);
+        conv_bwd_filter->Execute(src_data, diff_filter_data, diff_dst_data);
       }
 
       // Reorder diff_filter back to Tensorflow layout if necessary
@@ -948,7 +960,7 @@
                          const MklDnnShape& filter_mkl_shape,
                          const MklDnnShape& obp_mkl_shape) {
     CHECK(!filter_mkl_shape.IsMklTensor())
-        << "Conv2DBackpropFilter: filter should not be in MKL Layout";
+        << "ConvBackpropFilter: filter should not be in MKL Layout";
   }
 
   // Get TensorFlow shape of input tensor.
@@ -984,9 +996,11 @@
     return fwd_filter_dims;
   }
 
-  // Output layout is Tensorflow's filter layout (HWIO).
+  // Output layout is Tensorflow's filter layout
+  //   Conv2D: HWIO;  Conv3D: DHWIO
   memory::format GetOutputFormat(const memory::format data_format) {
-    return memory::format::hwio;
+    return (this->strides_.size() == 4) ? memory::format::hwio
+                                        : memory::format::dhwio;
   }
 
   // Allocate output tensor.
@@ -1028,29 +1042,32 @@
   }
 };
 
-#define REGISTER_MKL_FILTER_KERNELS(T)                                   \
-  REGISTER_KERNEL_BUILDER(                                               \
-      Name("_MklConv2DBackpropFilter")                                   \
-          .Device(DEVICE_CPU)                                            \
-          .TypeConstraint<T>("T")                                        \
-          .Label(mkl_op_registry::kMklOpLabel),                          \
-      MklConv2DCustomBackpropFilterOp<CPUDevice, T, false>);             \
-  REGISTER_KERNEL_BUILDER(                                               \
-      Name("_MklConv2DBackpropFilterWithBias")                           \
-          .Device(DEVICE_CPU)                                            \
-          .TypeConstraint<T>("T")                                        \
-          .Label(mkl_op_registry::kMklOpLabel),                          \
-      MklConv2DCustomBackpropFilterOp<CPUDevice, T, true>);              \
-  REGISTER_KERNEL_BUILDER(Name("__MklDummyConv2DBackpropFilterWithBias") \
-                              .Device(DEVICE_CPU)                        \
-                              .TypeConstraint<T>("T")                    \
-                              .Label(mkl_op_registry::kMklOpLabel),      \
-                          MklDummyOp<CPUDevice, T>);
+#define REGISTER_MKL_FILTER_KERNELS(T)                                         \
+  REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropFilter")                     \
+                              .Device(DEVICE_CPU)                              \
+                              .TypeConstraint<T>("T")                          \
+                              .Label(mkl_op_registry::kMklOpLabel),            \
+                          MklConvCustomBackpropFilterOp<CPUDevice, T, false>); \
+  REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropFilterWithBias")             \
+                              .Device(DEVICE_CPU)                              \
+                              .TypeConstraint<T>("T")                          \
+                              .Label(mkl_op_registry::kMklOpLabel),            \
+                          MklConvCustomBackpropFilterOp<CPUDevice, T, true>);  \
+  REGISTER_KERNEL_BUILDER(Name("__MklDummyConv2DBackpropFilterWithBias")       \
+                              .Device(DEVICE_CPU)                              \
+                              .TypeConstraint<T>("T")                          \
+                              .Label(mkl_op_registry::kMklOpLabel),            \
+                          MklDummyOp<CPUDevice, T>);                           \
+  REGISTER_KERNEL_BUILDER(Name("_MklConv3DBackpropFilterV2")                   \
+                              .Device(DEVICE_CPU)                              \
+                              .TypeConstraint<T>("T")                          \
+                              .Label(mkl_op_registry::kMklOpLabel),            \
+                          MklConvCustomBackpropFilterOp<CPUDevice, T, false>);
 
 TF_CALL_float(REGISTER_MKL_FILTER_KERNELS);
 #undef REGISTER_MKL_FILTER_KERNELS
 
-#endif  // INTEL_MKL_ML
+#endif  // INTEL_MKL_ML_ONLY
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
index 39498f1..b5a9830 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
@@ -23,7 +23,7 @@
 #define EIGEN_USE_THREADS
 #include <algorithm>
 #include <vector>
-#ifdef INTEL_MKL_ML
+#ifdef INTEL_MKL_ML_ONLY
 #include "mkl_dnn.h"
 #include "mkl_dnn_types.h"
 #endif
@@ -46,7 +46,7 @@
 #include "tensorflow/core/util/use_cudnn.h"
 #include "tensorflow/core/util/work_sharder.h"
 
-#ifndef INTEL_MKL_ML
+#ifndef INTEL_MKL_ML_ONLY
 #include "mkldnn.hpp"
 
 using mkldnn::convolution_backward_data;
@@ -57,9 +57,9 @@
 namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
-#ifndef INTEL_MKL_ML
+#ifndef INTEL_MKL_ML_ONLY
 
-/// utility classes enabling primitive reuse for backward conv2d ops.
+/// utility classes enabling primitive reuse for backward conv ops.
 struct MklConvBwdInputParams {
   memory::dims diff_src_dims;
   memory::dims filter_dims;
@@ -83,11 +83,11 @@
 };
 
 template <typename T>
-class MklConv2DBwdInputPrimitive : public MklPrimitive {
+class MklConvBwdInputPrimitive : public MklPrimitive {
  public:
-  explicit MklConv2DBwdInputPrimitive(
-      const MklConvBwdInputParams& convBwdInputDims) :
-           cpu_engine_(engine::cpu, 0) {
+  explicit MklConvBwdInputPrimitive(
+      const MklConvBwdInputParams& convBwdInputDims)
+      : cpu_engine_(engine::cpu, 0) {
     context_.bwd_input_stream.reset(new stream(stream::kind::eager));
 
     // create conv primitive
@@ -95,7 +95,7 @@
       Setup(convBwdInputDims);
     }
   }
-  ~MklConv2DBwdInputPrimitive() {}
+  ~MklConvBwdInputPrimitive() {}
 
   // Convolution backward filter (weights)
   //   diff_src_data: output data buffer of diff_src
@@ -134,7 +134,7 @@
   }
 
  private:
-  // Primitive reuse context for Conv2D Bwd Input op
+  // Primitive reuse context for Conv Bwd Input op
   struct ConvBwdInputContext {
     // expected memory format for this primitive instance
     memory::format filter_fmt;
@@ -235,38 +235,37 @@
 };
 
 template <typename T>
-class MklConv2DBwdInputPrimitiveFactory : public MklPrimitiveFactory<T> {
+class MklConvBwdInputPrimitiveFactory : public MklPrimitiveFactory<T> {
  private:
-  MklConv2DBwdInputPrimitiveFactory() {}
-  ~MklConv2DBwdInputPrimitiveFactory() {}
+  MklConvBwdInputPrimitiveFactory() {}
+  ~MklConvBwdInputPrimitiveFactory() {}
 
  public:
-  static MklConv2DBwdInputPrimitive<T>* Get(
+  static MklConvBwdInputPrimitive<T>* Get(
       const MklConvBwdInputParams& convBwdInputDims) {
-    MklConv2DBwdInputPrimitive<T>* conv2d_bwd_input = nullptr;
+    MklConvBwdInputPrimitive<T>* conv_bwd_input = nullptr;
 
     // look into the pool for reusable primitive
-    conv2d_bwd_input = dynamic_cast<MklConv2DBwdInputPrimitive<T>*> (
-        MklConv2DBwdInputPrimitiveFactory<T>::GetInstance().GetConv2dBwdInput(
+    conv_bwd_input = dynamic_cast<MklConvBwdInputPrimitive<T>*>(
+        MklConvBwdInputPrimitiveFactory<T>::GetInstance().GetConvBwdInput(
             convBwdInputDims));
 
-    if (conv2d_bwd_input == nullptr) {
-      conv2d_bwd_input = new MklConv2DBwdInputPrimitive<T>(
-          convBwdInputDims);
-      MklConv2DBwdInputPrimitiveFactory<T>::GetInstance().SetConv2dBwdInput(
-          convBwdInputDims, conv2d_bwd_input);
+    if (conv_bwd_input == nullptr) {
+      conv_bwd_input = new MklConvBwdInputPrimitive<T>(convBwdInputDims);
+      MklConvBwdInputPrimitiveFactory<T>::GetInstance().SetConvBwdInput(
+          convBwdInputDims, conv_bwd_input);
     }
-    return conv2d_bwd_input;
+    return conv_bwd_input;
   }
 
  private:
-  static MklConv2DBwdInputPrimitiveFactory& GetInstance() {
-    static MklConv2DBwdInputPrimitiveFactory instance_;
+  static MklConvBwdInputPrimitiveFactory& GetInstance() {
+    static MklConvBwdInputPrimitiveFactory instance_;
     return instance_;
   }
 
   static string CreateKey(const MklConvBwdInputParams& convBwdInputDims) {
-    string prefix = "conv2d_bwd_input";
+    string prefix = "conv_bwd_input";
     FactoryKeyCreator key_creator;
     key_creator.AddAsKey(prefix);
     key_creator.AddAsKey(convBwdInputDims.diff_src_dims);
@@ -279,14 +278,13 @@
     return key_creator.GetKey();
   }
 
-  MklPrimitive* GetConv2dBwdInput(
-      const MklConvBwdInputParams& convBwdInputDims) {
+  MklPrimitive* GetConvBwdInput(const MklConvBwdInputParams& convBwdInputDims) {
     string key = CreateKey(convBwdInputDims);
     return this->GetOp(key);
   }
 
-  void SetConv2dBwdInput(
-      const MklConvBwdInputParams& convBwdInputDims, MklPrimitive *op) {
+  void SetConvBwdInput(const MklConvBwdInputParams& convBwdInputDims,
+                       MklPrimitive* op) {
     string key = CreateKey(convBwdInputDims);
     this->SetOp(key, op);
   }
@@ -294,7 +292,7 @@
 
 #endif
 
-#ifdef INTEL_MKL_ML
+#ifdef INTEL_MKL_ML_ONLY
 
 template <typename Device, class T>
 class MklConv2DCustomBackpropInputOp : public OpKernel {
@@ -594,23 +592,34 @@
   TensorFormat data_format;
 };
 
+#define REGISTER_MKL_CPU_KERNELS(T)                                 \
+  REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropInput")           \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<T>("T")               \
+                              .Label(mkl_op_registry::kMklOpLabel), \
+                          MklConv2DCustomBackpropInputOp<CPUDevice, T>);
+
+TF_CALL_float(REGISTER_MKL_CPU_KERNELS);
+#undef REGISTER_MKL_CPU_KERNELS
+
 #else
 
 template <typename Device, class T>
-class MklConv2DCustomBackpropInputOp
-    : public MklConv2DBackpropCommonOp<Device, T> {
+class MklConvCustomBackpropInputOp : public MklConvBackpropCommonOp<Device, T> {
  public:
-  explicit MklConv2DCustomBackpropInputOp(OpKernelConstruction* context)
-      : MklConv2DBackpropCommonOp<Device, T>(context) {
-  }
+  explicit MklConvCustomBackpropInputOp(OpKernelConstruction* context)
+      : MklConvBackpropCommonOp<Device, T>(context) {}
 
-  ~MklConv2DCustomBackpropInputOp() {}
+  ~MklConvCustomBackpropInputOp() {}
 
   void Compute(OpKernelContext* context) {
     try {
       MklDnnData<T> filter(&cpu_engine);
       MklDnnData<T> diff_dst(&cpu_engine);
 
+      // This flag indicates whether the op is Conv2D (true) or Conv3D (false).
+      bool isConv2D = (this->strides_.size() == 4);
+
       // Input tensors
       const int kInputIdx = 0, kFilterIdx = 1, kOutbpropIdx = 2;
       const Tensor& src_tensor = MklGetInput(context, kInputIdx);
@@ -626,7 +635,7 @@
                         diff_dst_mkl_shape);
 
       // Allow operator-specific generation of shapes.
-      // E.g., Conv2DBackpropFilter gets filter as filter_sizes. It is a
+      // E.g., ConvBackpropFilter gets filter as filter_sizes. It is a
       // tensor containing shape of filter. So filter.shape() is not
       // a correct way to get filter shape. These operator-specific calls
       // allow this class to handle this case.
@@ -655,6 +664,7 @@
         }
         return;
       }
+
       // By default, all dims are in MKL order. Only dims in TF order
       // are those with postfix tf_order.
       memory::dims diff_dst_dims, fwd_src_dims, fwd_filter_dims;
@@ -673,15 +683,18 @@
       // Create Convolution forward descriptor since Convolution backward
       // API needs it. For that, we first need to create input, filter
       // and output memory descriptors.
-      auto tf_fmt = TFDataFormatToMklDnnDataFormat(this->data_format_);
+      auto tf_fmt = isConv2D
+                        ? TFDataFormatToMklDnnDataFormat(this->data_format_)
+                        : TFDataFormatToMklDnn3DDataFormat(this->data_format_);
 
       // If filter is in MKL layout, then simply grab filter layout;
       // otherwise, construct filter in TF layout.
       // For TF layout, filter is in HWIO format.
       auto fwd_filter_md = filter_mkl_shape.IsMklTensor()
-                         ? filter_mkl_shape.GetMklLayout()
-                         : memory::desc(fwd_filter_dims, MklDnnType<T>(),
-                                        memory::format::hwio);
+                               ? filter_mkl_shape.GetMklLayout()
+                               : memory::desc(fwd_filter_dims, MklDnnType<T>(),
+                                              isConv2D ? memory::format::hwio
+                                                       : memory::format::dhwio);
 
       conv_utl.GetInputSizeInMklOrder(diff_dst_tf_shape, &diff_dst_dims);
       if (!context->status().ok()) return;
@@ -689,18 +702,15 @@
                        ? diff_dst_mkl_shape.GetMklLayout()
                        : memory::desc(diff_dst_dims,
                            MklDnnType<T>(), tf_fmt);
+      for (int i = 0; i < dilations.size(); i++) dilations[i] -= 1;
 
-      dilations[kDilationH] -= 1;
-      dilations[kDilationW] -= 1;
-
-      MklConv2DBwdInputPrimitive<T> *conv2d_bwd_input = nullptr;
-      conv_utl.GetInputSizeInMklOrder(diff_dst_tf_shape, &diff_dst_dims);
+      MklConvBwdInputPrimitive<T>* conv_bwd_input = nullptr;
       MklConvBwdInputParams convBwdInputDims(fwd_src_dims, fwd_filter_dims,
           diff_dst_dims, strides, dilations, padding_left, padding_right,
           TFPaddingToMklDnnPadding(this->padding_));
-      conv2d_bwd_input = MklConv2DBwdInputPrimitiveFactory<T>::Get(
-          convBwdInputDims);
-      auto bwd_input_pd = conv2d_bwd_input->GetPrimitiveDesc();
+      conv_bwd_input =
+          MklConvBwdInputPrimitiveFactory<T>::Get(convBwdInputDims);
+      auto bwd_input_pd = conv_bwd_input->GetPrimitiveDesc();
 
       // allocate output tensor
       auto diff_src_pd = bwd_input_pd->diff_src_primitive_desc();
@@ -723,7 +733,7 @@
       // check if filter and diff_dst need reorder
       T* filter_data = nullptr;
       if (fwd_filter_md.data.format !=
-          conv2d_bwd_input->GetFilterMemoryFormat()) {
+          conv_bwd_input->GetFilterMemoryFormat()) {
         filter.SetUsrMem(fwd_filter_md, &filter_tensor);
         filter.CheckReorderToOpMem(bwd_input_pd->weights_primitive_desc());
         filter_data = static_cast<T*>(filter.GetOpMem().get_data_handle());
@@ -733,8 +743,7 @@
       }
 
       T* diff_dst_data = nullptr;
-      if (diff_dst_md.data.format !=
-          conv2d_bwd_input->GetDiffDstMemoryFormat()) {
+      if (diff_dst_md.data.format != conv_bwd_input->GetDiffDstMemoryFormat()) {
         diff_dst.SetUsrMem(diff_dst_md, &diff_dst_tensor);
         diff_dst.CheckReorderToOpMem(bwd_input_pd->diff_dst_primitive_desc());
         diff_dst_data = static_cast<T*>(
@@ -745,7 +754,7 @@
       }
 
       // execute convolution input bwd
-      conv2d_bwd_input->Execute(diff_src_data, filter_data, diff_dst_data);
+      conv_bwd_input->Execute(diff_src_data, filter_data, diff_dst_data);
     } catch (mkldnn::error& e) {
       string error_msg = "Status: " + std::to_string(e.status) +
                          ", message: " + string(e.message) + ", in file " +
@@ -770,7 +779,7 @@
     // of the Tensor and never an actual tensor. So it will never be in MKL
     // layout.
     CHECK(!input_mkl_shape.IsMklTensor())
-        << "Conv2DBackpropInput: input should not be in MKL Layout";
+        << "ConvBackpropInput: input should not be in MKL Layout";
   }
 
   // Get TensorFlow shape of input tensor.
@@ -778,10 +787,10 @@
                                const Tensor& input_tensor) {
     TensorShape input_tf_shape;
     CHECK_EQ(TensorShapeUtils::IsVector(input_tensor.shape()), true);
-    CHECK_EQ(
-        TensorShapeUtils::MakeShape(input_tensor.vec<int32>(), &input_tf_shape)
-            .ok(),
-        true);
+    // Conv[2D|3D]BackpropInputV2 supports both DT_INT32 and DT_INT64 for
+    // output_shape; MakeShape is able to handle both DT_INT32 and DT_INT64
+    // for input_tensor.
+    CHECK_EQ(this->MakeShape(input_tensor, &input_tf_shape).ok(), true);
     return input_tf_shape;
   }
 
@@ -792,7 +801,7 @@
   }
 
   // Get the Tensorflow shape of Output (diff_src),
-  // which is same as shape of Conv2D 'input'.
+  // which is same as shape of Conv 'input'.
   TensorShape GetOutputTfShape(const TensorShape& input_shape,
                                const TensorShape& filter_shape,
                                const TensorShape& outbprop_shape) {
@@ -800,7 +809,7 @@
   }
 
   // Get the Tensorflow shape of Output (diff_src),
-  // which is same as shape of Conv2D 'input'.
+  // which is same as shape of Conv 'input'.
   const memory::dims& GetOutputDims(const memory::dims& fwd_input_dims,
                                     const memory::dims& fwd_filter_dims) {
     return fwd_input_dims;
@@ -839,17 +848,22 @@
   }
 };
 
-#endif  // INTEL_MKL_ML
-
-#define REGISTER_MKL_CPU_KERNELS(T)                                 \
-  REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropInput")           \
-                              .Device(DEVICE_CPU)                   \
-                              .TypeConstraint<T>("T")               \
-                              .Label(mkl_op_registry::kMklOpLabel), \
-                          MklConv2DCustomBackpropInputOp<CPUDevice, T>);
+#define REGISTER_MKL_CPU_KERNELS(T)                                    \
+  REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropInput")              \
+                              .Device(DEVICE_CPU)                      \
+                              .TypeConstraint<T>("T")                  \
+                              .Label(mkl_op_registry::kMklOpLabel),    \
+                          MklConvCustomBackpropInputOp<CPUDevice, T>); \
+  REGISTER_KERNEL_BUILDER(Name("_MklConv3DBackpropInputV2")            \
+                              .Device(DEVICE_CPU)                      \
+                              .TypeConstraint<T>("T")                  \
+                              .Label(mkl_op_registry::kMklOpLabel),    \
+                          MklConvCustomBackpropInputOp<CPUDevice, T>);
 
 TF_CALL_float(REGISTER_MKL_CPU_KERNELS);
 #undef REGISTER_MKL_CPU_KERNELS
 
+#endif  // INTEL_MKL_ML_ONLY
+
 }  // namespace tensorflow
 #endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc
index 62396ee..c6295c7 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_ops.cc
@@ -42,7 +42,7 @@
 
 #include "tensorflow/core/util/mkl_util.h"
 
-#ifndef INTEL_MKL_ML
+#ifndef INTEL_MKL_ML_ONLY
 #include "mkldnn.hpp"
 
 using mkldnn::prop_kind;
@@ -57,7 +57,7 @@
 
 namespace tensorflow {
 
-#ifndef INTEL_MKL_ML
+#ifndef INTEL_MKL_ML_ONLY
 
 // This structure aggregates multiple inputs to Conv2DFwd* methods.
 struct MklConvFwdParams {
@@ -85,9 +85,9 @@
 };
 
 template <typename T>
-class MklConv2DFwdPrimitive : public MklPrimitive {
+class MklConvFwdPrimitive : public MklPrimitive {
  public:
-  explicit MklConv2DFwdPrimitive(const MklConvFwdParams& convFwdDims)
+  explicit MklConvFwdPrimitive(const MklConvFwdParams& convFwdDims)
       : cpu_engine_(engine::cpu, 0) {
     context_.fwd_stream.reset(new stream(stream::kind::eager));
     // create conv primitive
@@ -96,7 +96,7 @@
     }
   }
 
-  ~MklConv2DFwdPrimitive() {}
+  ~MklConvFwdPrimitive() {}
 
   // Convolution forward execute with bias
   //   src_data:    input data buffer of src
@@ -269,37 +269,36 @@
 };
 
 template <typename T>
-class MklConv2DFwdPrimitiveFactory : public MklPrimitiveFactory<T> {
+class MklConvFwdPrimitiveFactory : public MklPrimitiveFactory<T> {
  public:
-  static MklConv2DFwdPrimitive<T>* Get(const MklConvFwdParams& convFwdDims) {
-    MklConv2DFwdPrimitive<T>* conv2d_fwd = nullptr;
+  static MklConvFwdPrimitive<T>* Get(const MklConvFwdParams& convFwdDims) {
+    MklConvFwdPrimitive<T>* conv_fwd = nullptr;
 
     // try to find a suitable one in pool
-    conv2d_fwd = dynamic_cast<MklConv2DFwdPrimitive<T>*>(
-        MklConv2DFwdPrimitiveFactory<T>::GetInstance().GetConv2DFwd(
-            convFwdDims));
+    conv_fwd = dynamic_cast<MklConvFwdPrimitive<T>*>(
+        MklConvFwdPrimitiveFactory<T>::GetInstance().GetConvFwd(convFwdDims));
 
-    if (conv2d_fwd == nullptr) {
-      conv2d_fwd = new MklConv2DFwdPrimitive<T>(convFwdDims);
-      MklConv2DFwdPrimitiveFactory<T>::GetInstance().SetConv2DFwd(convFwdDims,
-                                                                  conv2d_fwd);
+    if (conv_fwd == nullptr) {
+      conv_fwd = new MklConvFwdPrimitive<T>(convFwdDims);
+      MklConvFwdPrimitiveFactory<T>::GetInstance().SetConvFwd(convFwdDims,
+                                                              conv_fwd);
     }
-    return conv2d_fwd;
+    return conv_fwd;
   }
 
  private:
-  MklConv2DFwdPrimitiveFactory() {}
-  ~MklConv2DFwdPrimitiveFactory() {}
+  MklConvFwdPrimitiveFactory() {}
+  ~MklConvFwdPrimitiveFactory() {}
 
   static const int kDilationH = 0, kDilationW = 1;
 
-  static MklConv2DFwdPrimitiveFactory& GetInstance() {
-    static MklConv2DFwdPrimitiveFactory instance_;
+  static MklConvFwdPrimitiveFactory& GetInstance() {
+    static MklConvFwdPrimitiveFactory instance_;
     return instance_;
   }
 
   static string CreateKey(const MklConvFwdParams& convFwdDims) {
-    string prefix = "conv2d_fwd_";
+    string prefix = "conv_fwd_";
     FactoryKeyCreator key_creator;
     key_creator.AddAsKey(prefix);
     key_creator.AddAsKey(convFwdDims.src_dims);
@@ -313,12 +312,12 @@
     return key_creator.GetKey();
   }
 
-  MklPrimitive* GetConv2DFwd(const MklConvFwdParams& convFwdDims) {
+  MklPrimitive* GetConvFwd(const MklConvFwdParams& convFwdDims) {
     string key = CreateKey(convFwdDims);
     return this->GetOp(key);
   }
 
-  void SetConv2DFwd(const MklConvFwdParams& convFwdDims, MklPrimitive* op) {
+  void SetConvFwd(const MklConvFwdParams& convFwdDims, MklPrimitive* op) {
     string key = CreateKey(convFwdDims);
     this->SetOp(key, op);
   }
@@ -329,13 +328,13 @@
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
 // For now, MKL-ML is default. So making MKL-DNN not a default choice.
-#ifdef INTEL_MKL_ML
+#ifdef INTEL_MKL_ML_ONLY
 template <typename Device, typename T, bool biasEnabled>
-class MklConv2DOp : public OpKernel {
+class MklConvOp : public OpKernel {
  public:
-  ~MklConv2DOp() {}
+  ~MklConvOp() {}
 
-  explicit MklConv2DOp(OpKernelConstruction* context) : OpKernel(context) {
+  explicit MklConvOp(OpKernelConstruction* context) : OpKernel(context) {
     OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
     string data_format;
     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
@@ -755,21 +754,22 @@
 
 #else
 
+// Base class for convolution forward operations
 template <typename Device, typename T, bool biasEnabled>
-class MklConv2DOp : public OpKernel {
+class MklConvOp : public OpKernel {
  public:
-  ~MklConv2DOp() {}
+  ~MklConvOp() {}
 
-  explicit MklConv2DOp(OpKernelConstruction* context) : OpKernel(context) {
+  explicit MklConvOp(OpKernelConstruction* context) : OpKernel(context) {
     OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations_));
     OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
     string data_format;
     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
     OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                 errors::InvalidArgument("Invalid data format"));
-    OP_REQUIRES(context, strides_.size() == 4,
+    OP_REQUIRES(context, (strides_.size() == 4 || strides_.size() == 5),
                 errors::InvalidArgument("Sliding window strides field must "
-                                        "specify 4 dimensions"));
+                                        "specify 4 or 5 dimensions"));
 
     const int64 stride_n = GetTensorDim(strides_, data_format_, 'N');
     const int64 stride_c = GetTensorDim(strides_, data_format_, 'C');
@@ -778,20 +778,39 @@
         errors::InvalidArgument("Current implementation does not yet support "
                                 "strides in the batch and depth dimensions."));
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
-    OP_REQUIRES(context, dilations_.size() == 4,
-                errors::InvalidArgument("Sliding window dilations field must "
-                                        "specify 4 dimensions"));
-    const int64 dilation_n = GetTensorDim(dilations_, data_format_, 'N');
-    const int64 dilation_c = GetTensorDim(dilations_, data_format_, 'C');
-    const int64 dilation_h = GetTensorDim(dilations_, data_format_, 'H');
-    const int64 dilation_w = GetTensorDim(dilations_, data_format_, 'W');
-    OP_REQUIRES(context, dilation_n == 1 && dilation_c == 1,
-                errors::InvalidArgument(
-                    "Current implementation does not yet support "
-                    "dilations in the batch and depth dimensions."));
-    OP_REQUIRES(
-        context, dilation_h > 0 && dilation_w > 0,
-        errors::InvalidArgument("Dilated rates should be larger than 0."));
+
+    if (strides_.size() == 4) {
+      OP_REQUIRES(context, dilations_.size() == 4,
+                  errors::InvalidArgument("Sliding window dilations field must "
+                                          "specify 4 dimensions"));
+      const int64 dilation_n = GetTensorDim(dilations_, data_format_, 'N');
+      const int64 dilation_c = GetTensorDim(dilations_, data_format_, 'C');
+      const int64 dilation_h = GetTensorDim(dilations_, data_format_, 'H');
+      const int64 dilation_w = GetTensorDim(dilations_, data_format_, 'W');
+      OP_REQUIRES(context, dilation_n == 1 && dilation_c == 1,
+                  errors::InvalidArgument(
+                      "Current implementation does not yet support "
+                      "dilations in the batch and depth dimensions."));
+      OP_REQUIRES(
+          context, dilation_h > 0 && dilation_w > 0,
+          errors::InvalidArgument("Dilated rates should be larger than 0."));
+    } else if (strides_.size() == 5) {
+      OP_REQUIRES(context, dilations_.size() == 5,
+                  errors::InvalidArgument("Dilation rates field must "
+                                          "specify 5 dimensions"));
+      OP_REQUIRES(context,
+                  (GetTensorDim(dilations_, data_format_, 'N') == 1 &&
+                   GetTensorDim(dilations_, data_format_, 'C') == 1),
+                  errors::InvalidArgument(
+                      "Current implementation does not yet support "
+                      "dilations rates in the batch and depth dimensions."));
+      OP_REQUIRES(
+          context,
+          (GetTensorDim(dilations_, data_format_, '0') > 0 &&
+           GetTensorDim(dilations_, data_format_, '1') > 0 &&
+           GetTensorDim(dilations_, data_format_, '2') > 0),
+          errors::InvalidArgument("Dilated rates should be larger than 0."));
+    }
   }
 
   void Compute(OpKernelContext* context) override {
@@ -837,7 +856,8 @@
         AllocateOutputSetMklShape(context, kOutputIndex_Dst,
                     &dst_tensor, src_tf_shape, dst_mkl_shape);
 
-        // MklConv2D also outputs converted filter as 2nd output of Conv2D.
+        // MklConv2D/3D also outputs converted filter
+        // as 2nd output of Conv2D/3D.
         filter_mkl_shape.SetMklTensor(false);
         Tensor* output_filter_tensor = nullptr;
         AllocateOutputSetMklShape(context, kOutputIndex_Filter,
@@ -846,15 +866,20 @@
         return;
       }
 
+      bool isConv2D = (strides_.size() == 4);
+
       // Create memory for user data.
       // Describe how the inputs and outputs of Convolution look like. Also
       // specify buffers containing actual input and output data.
-      auto tf_fmt = TFDataFormatToMklDnnDataFormat(data_format_);
+      auto tf_fmt = isConv2D ? TFDataFormatToMklDnnDataFormat(data_format_)
+                             : TFDataFormatToMklDnn3DDataFormat(data_format_);
 
       // If input is in MKL layout, then simply grab input layout; otherwise,
       // construct input Tf layout. For TF layout, although input shape
       // (src_dims) required is in MKL-DNN order, the layout is Tensorflow's
-      // layout (NHWC or NCHW depending on data format).
+      // layout depending on data format:
+      //     Conv2D: NHWC or NCHW
+      //     Conv3D: NDHWC or NCDHW
       auto src_md = src_mkl_shape.IsMklTensor()
                         ? src_mkl_shape.GetMklLayout()
                         : memory::desc(src_dims, MklDnnType<T>(), tf_fmt);
@@ -864,31 +889,30 @@
       auto filter_md = filter_mkl_shape.IsMklTensor()  // Should NEVER be true
                            ? filter_mkl_shape.GetMklLayout()
                            : memory::desc(filter_dims, MklDnnType<T>(),
-                                          memory::format::hwio);
-
+                                          isConv2D ? memory::format::hwio
+                                                   : memory::format::dhwio);
       // MKLDNN dilation starts from 0.
-      dilations[kDilationH] -= 1;
-      dilations[kDilationW] -= 1;
+      for (int i = 0; i < dilations.size(); i++) dilations[i] -= 1;
 
       // get a conv2d fwd from primitive pool
-      MklConv2DFwdPrimitive<T>* conv2d_fwd = nullptr;
+      MklConvFwdPrimitive<T>* conv_fwd = nullptr;
       if (biasEnabled) {
         memory::dims bias_dims = {};
         conv_utl.GetBiasSizeInMklOrder(kInputIndex_Bias, &bias_dims);
         MklConvFwdParams convFwdDims(src_dims, filter_dims, bias_dims,
                                      dst_dims_mkl_order, strides, dilations,
                                      padding_left, padding_right);
-        conv2d_fwd = MklConv2DFwdPrimitiveFactory<T>::Get(convFwdDims);
+        conv_fwd = MklConvFwdPrimitiveFactory<T>::Get(convFwdDims);
       } else {
         MklConvFwdParams convFwdDims(src_dims, filter_dims, NONE_DIMS,
                                      dst_dims_mkl_order, strides, dilations,
                                      padding_left, padding_right);
-        conv2d_fwd = MklConv2DFwdPrimitiveFactory<T>::Get(convFwdDims);
+        conv_fwd = MklConvFwdPrimitiveFactory<T>::Get(convFwdDims);
       }
 
       // allocate output tensors output_tensor and filter_out_tensor
       std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_fwd_pd =
-          conv2d_fwd->GetPrimitiveDesc();
+          conv_fwd->GetPrimitiveDesc();
       AllocateOutputTensor(context, *conv_fwd_pd,
                        dst_dims_mkl_order, tf_fmt, &dst_tensor);
       Tensor* filter_out_tensor = nullptr;
@@ -900,7 +924,7 @@
 
       // check whether src/filter need reorder
       T *src_data = nullptr;
-      if (src_md.data.format != conv2d_fwd->GetSrcMemoryFormat()) {
+      if (src_md.data.format != conv_fwd->GetSrcMemoryFormat()) {
         src.SetUsrMem(src_md, &src_tensor);
         src.CheckReorderToOpMem(conv_fwd_pd.get()->src_primitive_desc());
         src_data = static_cast<T*>(src.GetOpMem().get_data_handle());
@@ -908,7 +932,7 @@
         src_data = static_cast<T*>(const_cast<T*>(src_tensor.flat<T>().data()));
       }
       T* filter_data = nullptr;
-      if (filter_md.data.format != conv2d_fwd->GetFilterMemoryFormat()) {
+      if (filter_md.data.format != conv_fwd->GetFilterMemoryFormat()) {
         filter.SetUsrMem(filter_md, &filter_tensor);
         filter.CheckReorderToOpMem(conv_fwd_pd.get()->weights_primitive_desc(),
                                    filter.GetTensorBuffer(filter_out_tensor));
@@ -918,16 +942,15 @@
             static_cast<T*>(const_cast<T*>(filter_tensor.flat<T>().data()));
       }
 
-
       // execute convolution
       if (biasEnabled) {
         const Tensor& bias_tensor = MklGetInput(context, kInputIndex_Bias);
         T* bias_data = static_cast<T*>(const_cast<T*>(
             bias_tensor.flat<T>().data()));
 
-        conv2d_fwd->Execute(src_data, filter_data, bias_data, dst_data);
+        conv_fwd->Execute(src_data, filter_data, bias_data, dst_data);
       } else {
-        conv2d_fwd->Execute(src_data, filter_data, dst_data);
+        conv_fwd->Execute(src_data, filter_data, dst_data);
       }
     } catch (mkldnn::error &e) {
       string error_msg = tensorflow::strings::StrCat(
@@ -1038,17 +1061,18 @@
 
 #endif
 
+// Register 2D operations
 #define REGISTER_MKL_CPU(T)                                         \
   REGISTER_KERNEL_BUILDER(Name("_MklConv2D")                        \
                               .Device(DEVICE_CPU)                   \
                               .TypeConstraint<T>("T")               \
                               .Label(mkl_op_registry::kMklOpLabel), \
-                          MklConv2DOp<CPUDevice, T, false>);        \
+                          MklConvOp<CPUDevice, T, false>);          \
   REGISTER_KERNEL_BUILDER(Name("_MklConv2DWithBias")                \
                               .Device(DEVICE_CPU)                   \
                               .TypeConstraint<T>("T")               \
                               .Label(mkl_op_registry::kMklOpLabel), \
-                          MklConv2DOp<CPUDevice, T, true>);         \
+                          MklConvOp<CPUDevice, T, true>);           \
   REGISTER_KERNEL_BUILDER(Name("__MklDummyConv2DWithBias")          \
                               .Device(DEVICE_CPU)                   \
                               .TypeConstraint<T>("T")               \
@@ -1057,5 +1081,14 @@
 
 TF_CALL_float(REGISTER_MKL_CPU);
 
+// Register 3D operations
+#define REGISTER_MKL_CPU(T)                                         \
+  REGISTER_KERNEL_BUILDER(Name("_MklConv3D")                        \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<T>("T")               \
+                              .Label(mkl_op_registry::kMklOpLabel), \
+                          MklConvOp<CPUDevice, T, false>);
+TF_CALL_float(REGISTER_MKL_CPU);
+
 }  // namespace tensorflow
 #endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_conv_ops.h b/tensorflow/core/kernels/mkl_conv_ops.h
index 3f154ff..01cc606 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.h
+++ b/tensorflow/core/kernels/mkl_conv_ops.h
@@ -40,7 +40,7 @@
 
 #include "tensorflow/core/util/mkl_util.h"
 
-#ifndef INTEL_MKL_ML
+#ifndef INTEL_MKL_ML_ONLY
 #include "mkldnn.hpp"
 
 using mkldnn::prop_kind;
@@ -52,7 +52,7 @@
 
 namespace tensorflow {
 
-#ifndef INTEL_MKL_ML
+#ifndef INTEL_MKL_ML_ONLY
 
 class MklDnnConvUtil {
  protected:
@@ -79,9 +79,16 @@
     // For now we take the stride from the second and third dimensions only
     // (we do not support striding on the batch or depth dimension).
     CHECK_NOTNULL(strides);
-    int stride_rows = GetTensorDim(strides_, data_format_, 'H');
-    int stride_cols = GetTensorDim(strides_, data_format_, 'W');
-    *strides = {stride_rows, stride_cols};
+    if (strides_.size() == 4) {
+      int stride_rows = GetTensorDim(strides_, data_format_, 'H');
+      int stride_cols = GetTensorDim(strides_, data_format_, 'W');
+      *strides = {stride_rows, stride_cols};
+    } else if (strides_.size() == 5) {
+      int stride_planes = GetTensorDim(strides_, data_format_, '0');
+      int stride_rows = GetTensorDim(strides_, data_format_, '1');
+      int stride_cols = GetTensorDim(strides_, data_format_, '2');
+      *strides = {stride_planes, stride_rows, stride_cols};
+    }
   }
 
   // Calculate Convolution dilations
@@ -89,13 +96,20 @@
     // For now we take the dilation from the second and third dimensions only
     // (we do not support dilation on the batch or depth dimension).
     CHECK_NOTNULL(dilations);
-    int dilations_rows = GetTensorDim(dilations_, data_format_, 'H');
-    int dilations_cols = GetTensorDim(dilations_, data_format_, 'W');
-    *dilations = {dilations_rows, dilations_cols};
+    if (dilations_.size() == 4) {
+      int dilations_rows = GetTensorDim(dilations_, data_format_, 'H');
+      int dilations_cols = GetTensorDim(dilations_, data_format_, 'W');
+      *dilations = {dilations_rows, dilations_cols};
+    } else if (dilations_.size() == 5) {
+      int dilations_planes = GetTensorDim(dilations_, data_format_, '0');
+      int dilations_rows = GetTensorDim(dilations_, data_format_, '1');
+      int dilations_cols = GetTensorDim(dilations_, data_format_, '2');
+      *dilations = {dilations_planes, dilations_rows, dilations_cols};
+    }
   }
 
   // Calculate Convolution input size in MKL-DNN order. MKL-DNN
-  // requires input in NCHW format. Function does not return anything.
+  // requires input in NCHW/NCDHW format. Function does not return anything.
   // But errors arising from sanity checks are returned in context's
   // status.
   virtual inline void GetInputSizeInMklOrder(const TensorShape& input_shape,
@@ -113,40 +127,62 @@
     int64 input_depth_raw = GetTensorDim(input_shape, data_format_, 'C');
     int input_depth = static_cast<int>(input_depth_raw);
 
-    // Input rows/height
-    int64 input_rows_raw = GetTensorDim(input_shape, data_format_, 'H');
-    CHECK_BOUNDS(input_rows_raw, "Input rows too large");
-    int input_rows = static_cast<int>(input_rows_raw);
-
-    // Input columns/width
-    int64 input_cols_raw = GetTensorDim(input_shape, data_format_, 'W');
-    CHECK_BOUNDS(input_cols_raw, "Input cols too large");
-    int input_cols = static_cast<int>(input_cols_raw);
-
     // Input batch
     int64 input_batch_raw = GetTensorDim(input_shape, data_format_, 'N');
     CHECK_BOUNDS(input_batch_raw, "Input batch too large");
     int input_batch = static_cast<int>(input_batch_raw);
 
+    if (strides_.size() == 4) {  // NCHW format for Conv2D
+      // Input rows/height
+      int64 input_rows_raw = GetTensorDim(input_shape, data_format_, 'H');
+      CHECK_BOUNDS(input_rows_raw, "Input rows too large");
+      int input_rows = static_cast<int>(input_rows_raw);
+
+      // Input columns/width
+      int64 input_cols_raw = GetTensorDim(input_shape, data_format_, 'W');
+      CHECK_BOUNDS(input_cols_raw, "Input cols too large");
+      int input_cols = static_cast<int>(input_cols_raw);
+
+      // MKL-DNN always requires input in NCHW format Conv2D.
+      std::vector<int> mkldnn_sizes(4, -1);
+      mkldnn_sizes[MklDnnDims::Dim_N] = input_batch;
+      mkldnn_sizes[MklDnnDims::Dim_C] = input_depth;
+      mkldnn_sizes[MklDnnDims::Dim_H] = input_rows;
+      mkldnn_sizes[MklDnnDims::Dim_W] = input_cols;
+
+      *input_dims = mkldnn_sizes;
+    } else if (strides_.size() == 5) {  // NCDHW format for Conv3D
+      // Input planes/third-dimension
+      int64 input_planes_raw = GetTensorDim(input_shape, data_format_, '0');
+      CHECK_BOUNDS(input_planes_raw, "Input depth too large");
+      int input_planes = static_cast<int>(input_planes_raw);
+
+      // Input rows/height
+      int64 input_rows_raw = GetTensorDim(input_shape, data_format_, '1');
+      CHECK_BOUNDS(input_rows_raw, "Input rows too large");
+      int input_rows = static_cast<int>(input_rows_raw);
+
+      // Input columns/width
+      int64 input_cols_raw = GetTensorDim(input_shape, data_format_, '2');
+      CHECK_BOUNDS(input_cols_raw, "Input cols too large");
+      int input_cols = static_cast<int>(input_cols_raw);
+
+      // MKL-DNN always requires input in NCDHW format for Conv3D.
+      std::vector<int> mkldnn_sizes(5, -1);
+      mkldnn_sizes[MklDnnDims3D::Dim3d_N] = input_batch;
+      mkldnn_sizes[MklDnnDims3D::Dim3d_C] = input_depth;
+      mkldnn_sizes[MklDnnDims3D::Dim3d_D] = input_planes;
+      mkldnn_sizes[MklDnnDims3D::Dim3d_H] = input_rows;
+      mkldnn_sizes[MklDnnDims3D::Dim3d_W] = input_cols;
+
+      *input_dims = mkldnn_sizes;
+    }
 #undef CHECK_BOUNDS
-
-    // MKL-DNN always requires input in NCHW format.
-    std::vector<int> mkldnn_sizes(4, -1);
-    mkldnn_sizes[MklDnnDims::Dim_N] = input_batch;
-    mkldnn_sizes[MklDnnDims::Dim_C] = input_depth;
-    mkldnn_sizes[MklDnnDims::Dim_H] = input_rows;
-    mkldnn_sizes[MklDnnDims::Dim_W] = input_cols;
-
-    *input_dims = mkldnn_sizes;
   }
 
-  // Calculate Convolution filter size in MKL-DNN order. MKL-DNN
-  // requires filter in OIHW format. Function does not return anything.
-  // But errors arising from sanity checks are returned in context's
-  // status.
-  //
-  // Calculate Convolution filter size in MKL-DNN order. MKL-DNN
-  // requires filter in OIHW format. Function does not return anything.
+  // Calculate Convolution filter size in MKL-DNN order.
+  // MKL-DNN requires filter in OIHW (Conv2D) or OIDHW (Conv3D) format.
+  // Function does not return anything.
   // But errors arising from sanity checks are returned in context's
   // status. This function differs from GetConvFilterSizeInMklOrder in
   // parameter for input - it accepts src_shape since Convolution Backward
@@ -159,11 +195,13 @@
                                               memory::dims* filter_dims) {
     CHECK_NOTNULL(filter_dims);
 
-    OP_REQUIRES(context_, filter_shape.dims() == 4,
-                errors::InvalidArgument("filter must be 4-dimensional: ",
+    OP_REQUIRES(context_, filter_shape.dims() == strides_.size(),
+                errors::InvalidArgument((strides_.size() == 4)
+                                            ? "filter must be 4-dimensional: "
+                                            : "filter must be 5-dimensional: ",
                                         filter_shape.DebugString()));
 
-    for (int i = 0; i < 3; i++) {
+    for (int i = 0; i < ((strides_.size() == 4) ? 3 : 5); i++) {
       OP_REQUIRES(context_,
                   FastBoundsCheck(filter_shape.dim_size(i),
                                   std::numeric_limits<int>::max()),
@@ -172,32 +210,57 @@
 
     int input_depth = GetTensorDim(input_shape, data_format_, 'C');
 
-    OP_REQUIRES(context_, input_depth == filter_shape.dim_size(2),
-                errors::InvalidArgument(
-                    "input and filter must have the same depth: ", input_depth,
-                    " vs ", filter_shape.dim_size(2)));
+    if (strides_.size() == 4) {  // Conv2D
+      OP_REQUIRES(context_, input_depth == filter_shape.dim_size(2),
+                  errors::InvalidArgument(
+                      "input and filter must have the same depth: ",
+                      input_depth, " vs ", filter_shape.dim_size(2)));
 
-    // TF filter is always in (rows, cols, in_depth, out_depth) order.
-    int filter_rows = static_cast<int>(filter_shape.dim_size(0));
-    int filter_cols = static_cast<int>(filter_shape.dim_size(1));
-    int in_depth = static_cast<int>(filter_shape.dim_size(2));
-    int out_depth = static_cast<int>(filter_shape.dim_size(3));
+      // TF filter is always in (rows, cols, in_depth, out_depth) order.
+      int filter_rows = static_cast<int>(filter_shape.dim_size(0));
+      int filter_cols = static_cast<int>(filter_shape.dim_size(1));
+      int in_depth = static_cast<int>(filter_shape.dim_size(2));
+      int out_depth = static_cast<int>(filter_shape.dim_size(3));
 
-    // MKL-DNN always needs filter in OIHW format.
-    // OIHW = (out_depth, in_depth, rows, cols)
-    std::vector<int> mkldnn_sizes(4, -1);
-    mkldnn_sizes[MklDnnDims::Dim_O] = out_depth;
-    mkldnn_sizes[MklDnnDims::Dim_I] = in_depth;
-    mkldnn_sizes[MklDnnDims::Dim_H] = filter_rows;
-    mkldnn_sizes[MklDnnDims::Dim_W] = filter_cols;
+      // MKL-DNN always needs filter in OIHW format.
+      // OIHW = (out_depth, in_depth, rows, cols)
+      std::vector<int> mkldnn_sizes(4, -1);
+      mkldnn_sizes[MklDnnDims::Dim_O] = out_depth;
+      mkldnn_sizes[MklDnnDims::Dim_I] = in_depth;
+      mkldnn_sizes[MklDnnDims::Dim_H] = filter_rows;
+      mkldnn_sizes[MklDnnDims::Dim_W] = filter_cols;
 
-    *filter_dims = mkldnn_sizes;
+      *filter_dims = mkldnn_sizes;
+    } else {  // Conv3D
+      OP_REQUIRES(context_, input_depth == filter_shape.dim_size(3),
+                  errors::InvalidArgument(
+                      "input and filter must have the same depth: ",
+                      input_depth, " vs ", filter_shape.dim_size(3)));
+
+      // TF filter is always in (planes, rows, cols, in_depth, out_depth) order.
+      int filter_planes = static_cast<int>(filter_shape.dim_size(0));
+      int filter_rows = static_cast<int>(filter_shape.dim_size(1));
+      int filter_cols = static_cast<int>(filter_shape.dim_size(2));
+      int in_depth = static_cast<int>(filter_shape.dim_size(3));
+      int out_depth = static_cast<int>(filter_shape.dim_size(4));
+
+      // MKL-DNN always needs filter in OIDHW format.
+      // OIDHW = (out_depth, in_depth, planes, rows, cols)
+      std::vector<int> mkldnn_sizes(5, -1);
+      mkldnn_sizes[MklDnnDims3D::Dim3d_O] = out_depth;
+      mkldnn_sizes[MklDnnDims3D::Dim3d_I] = in_depth;
+      mkldnn_sizes[MklDnnDims3D::Dim3d_D] = filter_planes;
+      mkldnn_sizes[MklDnnDims3D::Dim3d_H] = filter_rows;
+      mkldnn_sizes[MklDnnDims3D::Dim3d_W] = filter_cols;
+
+      *filter_dims = mkldnn_sizes;
+    }
   }
 
-  // Calculate Convolution filter size in MKL-DNN order. MKL-DNN
-  // requires filter in OIHW format. Function does not return anything.
-  // But errors arising from sanity checks are returned in context's
-  // status.
+  // Calculate Convolution filter size in MKL-DNN order.
+  // MKL-DNN requires filter in OIHW (Conv2D) or OIDHW (Conv3D) format.
+  // Function does not return anything. But errors arising from sanity
+  // checks are returned in context's status.
   virtual inline void GetFilterSizeInMklOrder(size_t src_index,
                                               size_t filter_index,
                                               memory::dims* filter_dims) {
@@ -206,8 +269,8 @@
                             GetTfShape(context_, filter_index), filter_dims);
   }
 
-  // Calculate Bias size for 2D Convolution. Function does not return
-  // anything, but sets error in context status.
+  // Calculate Bias size for 2D or 3D Convolution. Function does not
+  // return anything, but may set an error in context status.
   virtual inline void GetBiasSizeInMklOrder(size_t bias_index,
                                             memory::dims* bias_dims) {
     const Tensor& bias = MklGetInput(context_, bias_index);
@@ -218,82 +281,19 @@
     *bias_dims = {static_cast<int>(bias.dim_size(0))};
   }
 
-  // Function to calculate output and padding size for 2D convolution.
+  // Function to calculate output and padding size for 2D/3D convolution.
   //
   // Calculate output shape of Convolution in MKL-DNN and TensorFlow order.
-  // MKL-DNN uses NCHW for output order. But TensorFlow output will be in
-  // NHWC or NCHW format depending on data format. Function also calculates
-  // left, right, top and bottom pads. Function does not return any status -
-  // status is returned via context status.
+  // MKL-DNN uses NCHW(Conv2D) or NCDHW(Conv3D) for output order.
+  // But TensorFlow output will be in NHWC or NCHW (Conv2D), or
+  // NDHWC or NCDHW (Conv3D), depending on data format.
+  // Function also calculates left, right, top and bottom pads.
+  // Function does not return any status; errors are set via context status.
   //
   // TODO(nhasabni): Add similar function for input and filter in MklShape.
   virtual inline void GetOutputAndPadSizeInMklOrder(
       const TensorShape& input_shape, const TensorShape& filter_shape,
       const memory::dims& strides, const memory::dims& dilations,
-      memory::dims* output_dims_tf_order,
-      memory::dims* output_dims_mkl_order, memory::dims* pad_l,
-      memory::dims* pad_r) {
-    CHECK_NOTNULL(output_dims_tf_order);
-    CHECK_NOTNULL(output_dims_mkl_order);
-    CHECK_NOTNULL(pad_l);
-    CHECK_NOTNULL(pad_r);
-
-    int input_rows = GetTensorDim(input_shape, data_format_, 'H');
-    int input_cols = GetTensorDim(input_shape, data_format_, 'W');
-
-    // The first dimension for filter is rows/height.
-    int filter_rows = filter_shape.dim_size(0);
-    // The second dimension for filter is cols/width.
-    int filter_cols = filter_shape.dim_size(1);
-
-    // Stride is vector of 2 elements: {s_r, s_c}
-    int stride_rows = strides[0];
-    int stride_cols = strides[1];
-    int dilation_rows = dilations[0];
-    int dilation_cols = dilations[1];
-
-    // Output batch is same as input batch.
-    int out_batch = GetTensorDim(input_shape, data_format_, 'N');
-    // Output depth is same as last dimension for filter.
-    int out_depth = filter_shape.dim_size(3);
-
-    int64 out_rows = 0, out_cols = 0;
-    int64 pad_top = 0, pad_bottom = 0, pad_left, pad_right;
-
-    OP_REQUIRES_OK(context_,
-            GetWindowedOutputSizeVerboseV2(input_rows, filter_rows,
-                                 dilation_rows, stride_rows, padding_,
-                                 &out_rows, &pad_top, &pad_bottom));
-    OP_REQUIRES_OK(context_,
-            GetWindowedOutputSizeVerboseV2(input_cols, filter_cols,
-                                 dilation_cols, stride_cols, padding_,
-                                 &out_cols, &pad_left, &pad_right));
-
-    // Tensorflow output is in data_format order. (NHWC or NCHW)
-    TensorShape out_shape =
-        ShapeFromFormat(data_format_, out_batch, out_rows, out_cols, out_depth);
-    *output_dims_tf_order = TFShapeToMklDnnDims(out_shape);
-
-    // MKL-DNN always needs output in NCHW format.
-    std::vector<int> mkldnn_sizes(4, -1);
-    mkldnn_sizes[MklDnnDims::Dim_N] = out_batch;
-    mkldnn_sizes[MklDnnDims::Dim_C] = out_depth;
-    mkldnn_sizes[MklDnnDims::Dim_H] = static_cast<int>(out_rows);
-    mkldnn_sizes[MklDnnDims::Dim_W] = static_cast<int>(out_cols);
-    *output_dims_mkl_order = mkldnn_sizes;
-
-    // Now handle padding. MKL-DNN uses asymetric padding.
-    *pad_l = {static_cast<int>(pad_top), static_cast<int>(pad_left)};
-    *pad_r = {static_cast<int>(pad_bottom), static_cast<int>(pad_right)};
-  }
-
-  // Calculate output and pad size of forward Convolution operator.
-  // See comment on GetConvOutputAndPadSizeInMklOrder for parameters.
-  //
-  // Function does not return anything, but sets error in context status.
-  inline void GetOutputAndPadSizeInMklOrder(
-      size_t src_index, size_t filter_index,
-      const memory::dims& strides, const memory::dims& dilations,
       memory::dims* output_dims_tf_order, memory::dims* output_dims_mkl_order,
       memory::dims* pad_l, memory::dims* pad_r) {
     CHECK_NOTNULL(output_dims_tf_order);
@@ -301,12 +301,152 @@
     CHECK_NOTNULL(pad_l);
     CHECK_NOTNULL(pad_r);
 
+    bool isConv2D = (strides_.size() == 4);
+    int input_planes, input_rows, input_cols;
+    if (isConv2D) {
+      input_rows = GetTensorDim(input_shape, data_format_, 'H');
+      input_cols = GetTensorDim(input_shape, data_format_, 'W');
+    } else {
+      input_planes = GetTensorDim(input_shape, data_format_, '0');
+      input_rows = GetTensorDim(input_shape, data_format_, '1');
+      input_cols = GetTensorDim(input_shape, data_format_, '2');
+    }
+
+    // Filter dimension
+    // Conv2D:
+    //    First dimension: rows/height.
+    //    Second dimension: cols/width.
+    // Conv3D:
+    //    First dimension: planes/depth.
+    //    Second dimension: rows/height.
+    //    Third dimension: cols/width.
+
+    int filter_planes, filter_rows, filter_cols;
+    if (isConv2D) {
+      filter_rows = filter_shape.dim_size(0);
+      filter_cols = filter_shape.dim_size(1);
+    } else {
+      filter_planes = filter_shape.dim_size(0);
+      filter_rows = filter_shape.dim_size(1);
+      filter_cols = filter_shape.dim_size(2);
+    }
+
+    int stride_planes, stride_rows, stride_cols;
+    int dilation_planes, dilation_rows, dilation_cols;
+    if (isConv2D) {
+      // Conv2D stride is a vector of 2 elements: {s_r, s_c}
+      stride_rows = strides[0];
+      stride_cols = strides[1];
+      dilation_rows = dilations[0];
+      dilation_cols = dilations[1];
+    } else {
+      // Conv3D stride is a vector of 3 elements: {s_d, s_r, s_c}
+      stride_planes = strides[0];
+      stride_rows = strides[1];
+      stride_cols = strides[2];
+      dilation_planes = dilations[0];
+      dilation_rows = dilations[1];
+      dilation_cols = dilations[2];
+    }
+
+    // Output batch is same as input batch.
+    int out_batch = GetTensorDim(input_shape, data_format_, 'N');
+
+    // Output depth is same as last dimension for filter.
+    int out_depth = filter_shape.dim_size(isConv2D ? 3 : 4);
+
+    int64 out_rows = 0, out_cols = 0, out_planes = 0;
+    int64 pad_top = 0, pad_bottom = 0, pad_left, pad_right;
+    int64 pad_D1, pad_D2;
+
+    if (isConv2D) {
+      OP_REQUIRES_OK(context_,
+                     GetWindowedOutputSizeVerboseV2(
+                         input_rows, filter_rows, dilation_rows, stride_rows,
+                         padding_, &out_rows, &pad_top, &pad_bottom));
+      OP_REQUIRES_OK(context_,
+                     GetWindowedOutputSizeVerboseV2(
+                         input_cols, filter_cols, dilation_cols, stride_cols,
+                         padding_, &out_cols, &pad_left, &pad_right));
+    } else {
+      OP_REQUIRES_OK(context_, GetWindowedOutputSizeVerbose(
+                                   input_planes, filter_planes, stride_planes,
+                                   padding_, &out_planes, &pad_D1, &pad_D2));
+      OP_REQUIRES_OK(context_, GetWindowedOutputSizeVerbose(
+                                   input_rows, filter_rows, stride_rows,
+                                   padding_, &out_rows, &pad_top, &pad_bottom));
+      OP_REQUIRES_OK(context_, GetWindowedOutputSizeVerbose(
+                                   input_cols, filter_cols, stride_cols,
+                                   padding_, &out_cols, &pad_left, &pad_right));
+    }
+
+    // Tensorflow output is in data_format order.
+    //     Conv2D: NHWC or NCHW
+    //     Conv3D: NDHWC or NCDHW
+    // MKL-DNN uses asymmetric padding.
+    TensorShape out_shape =
+        isConv2D
+            ? ShapeFromFormat(data_format_, out_batch, out_rows, out_cols,
+                              out_depth)
+            : ShapeFromFormat(data_format_, out_batch,
+                              {{out_planes, out_rows, out_cols}}, out_depth);
+    *output_dims_tf_order = TFShapeToMklDnnDims(out_shape);
+
+    if (isConv2D) {
+      // For Conv2D, MKL-DNN always needs output in NCHW format.
+      std::vector<int> mkldnn_sizes(4, -1);
+      mkldnn_sizes[MklDnnDims::Dim_N] = out_batch;
+      mkldnn_sizes[MklDnnDims::Dim_C] = out_depth;
+      mkldnn_sizes[MklDnnDims::Dim_H] = static_cast<int>(out_rows);
+      mkldnn_sizes[MklDnnDims::Dim_W] = static_cast<int>(out_cols);
+      *output_dims_mkl_order = mkldnn_sizes;
+
+      *pad_l = {static_cast<int>(pad_top), static_cast<int>(pad_left)};
+      *pad_r = {static_cast<int>(pad_bottom), static_cast<int>(pad_right)};
+    } else {
+      std::vector<int> mkldnn_sizes(5, -1);
+      mkldnn_sizes[MklDnnDims3D::Dim3d_N] = out_batch;
+      mkldnn_sizes[MklDnnDims3D::Dim3d_C] = out_depth;
+      mkldnn_sizes[MklDnnDims3D::Dim3d_D] = static_cast<int>(out_planes);
+      mkldnn_sizes[MklDnnDims3D::Dim3d_H] = static_cast<int>(out_rows);
+      mkldnn_sizes[MklDnnDims3D::Dim3d_W] = static_cast<int>(out_cols);
+      *output_dims_mkl_order = mkldnn_sizes;
+
+      *pad_l = {static_cast<int>(pad_D1), static_cast<int>(pad_top),
+                static_cast<int>(pad_left)};
+      *pad_r = {static_cast<int>(pad_D2), static_cast<int>(pad_bottom),
+                static_cast<int>(pad_right)};
+    }
+  }
+
+  // Calculate output and pad size of forward Convolution operator.
+  // See comment on GetConvOutputAndPadSizeInMklOrder for parameters.
+  //
+  // Function does not return anything, but sets error in context status.
+  inline void GetOutputAndPadSizeInMklOrder(
+      size_t src_index, size_t filter_index, const memory::dims& strides,
+      const memory::dims& dilations, memory::dims* output_dims_tf_order,
+      memory::dims* output_dims_mkl_order, memory::dims* pad_l,
+      memory::dims* pad_r) {
+    CHECK_NOTNULL(output_dims_tf_order);
+    CHECK_NOTNULL(output_dims_mkl_order);
+    CHECK_NOTNULL(pad_l);
+    CHECK_NOTNULL(pad_r);
+
     auto input_tf_shape = GetTfShape(context_, src_index);
     auto filter_tf_shape = GetTfShape(context_, filter_index);
 
-    OP_REQUIRES(context_, input_tf_shape.dims() == 4,
-                errors::InvalidArgument("input must be 4-dimensional",
-                                        input_tf_shape.DebugString()));
+    if (strides_.size() == 4) {
+      // Conv2D
+      OP_REQUIRES(context_, input_tf_shape.dims() == 4,
+                  errors::InvalidArgument("input must be 4-dimensional",
+                                          input_tf_shape.DebugString()));
+    } else {
+      // Conv3D
+      OP_REQUIRES(context_, input_tf_shape.dims() == 5,
+                  errors::InvalidArgument("input must be 5-dimensional",
+                                          input_tf_shape.DebugString()));
+    }
 
     GetOutputAndPadSizeInMklOrder(input_tf_shape, filter_tf_shape,
                                   strides, dilations, output_dims_tf_order,
@@ -314,9 +454,11 @@
   }
 
   // Wrapper function to calculate input, filter, and output sizes of
-  // 2D Convolution in MKL order (NCHW for input and output; OIHW for filter.)
-  // Function also calculates output shape in Tensorflow order. Additionally, it
-  // also calculates strides and paddings for 2D Convolution.
+  // Conv2D/Conv3D in MKL order:
+  //     Conv2D: NCHW for input and output; OIHW for filter.
+  //     Conv3D: NCDHW for input and output; OIDHW for filter.
+  // Function also calculates output shape in Tensorflow order.
+  // Additionally, it also calculates strides and paddings.
   //
   // Function does not return anything, but sets error in context status.
   inline void GetConvFwdSizesInMklOrder(
@@ -349,16 +491,15 @@
   }
 };
 
-
 /////////////////////////////////////////////////////////////////////
-///  Common class that implements Conv2DBackpropFilter and Input
+///  Common class that implements ConvBackpropFilter and Input
 /////////////////////////////////////////////////////////////////////
 
 template <typename Device, class T>
-class MklConv2DBackpropCommonOp : public OpKernel {
+class MklConvBackpropCommonOp : public OpKernel {
  public:
-  ~MklConv2DBackpropCommonOp() {}
-  explicit MklConv2DBackpropCommonOp(OpKernelConstruction* context)
+  ~MklConvBackpropCommonOp() {}
+  explicit MklConvBackpropCommonOp(OpKernelConstruction* context)
       : OpKernel(context) {
     string data_format_str;
     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format_str));
@@ -372,20 +513,25 @@
         errors::InvalidArgument("Current implementation does not yet support "
                                 "strides in the batch and depth dimensions."));
     OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations_));
-    OP_REQUIRES(context, dilations_.size() == 4,
-                errors::InvalidArgument("Sliding window dilations field must "
-                                        "specify 4 dimensions"));
-    int dilation_n = GetTensorDim(dilations_, data_format_, 'N');
-    int dilation_c = GetTensorDim(dilations_, data_format_, 'C');
-    int dilation_h = GetTensorDim(dilations_, data_format_, 'H');
-    int dilation_w = GetTensorDim(dilations_, data_format_, 'W');
-    OP_REQUIRES(context, (dilation_n == 1 && dilation_c == 1),
-                errors::InvalidArgument(
-                    "Current implementation does not yet support "
-                    "dilations in the batch and depth dimensions."));
-    OP_REQUIRES(
-        context, dilation_h > 0 && dilation_w > 0,
-        errors::InvalidArgument("Dilated rates should be larger than 0."));
+
+    if (strides_.size() == 4) {
+      // Check Conv2D dilations
+      OP_REQUIRES(context, dilations_.size() == 4,
+                  errors::InvalidArgument("Sliding window dilations field must "
+                                          "specify 4 dimensions"));
+      int dilation_n = GetTensorDim(dilations_, data_format_, 'N');
+      int dilation_c = GetTensorDim(dilations_, data_format_, 'C');
+      int dilation_h = GetTensorDim(dilations_, data_format_, 'H');
+      int dilation_w = GetTensorDim(dilations_, data_format_, 'W');
+      OP_REQUIRES(context, (dilation_n == 1 && dilation_c == 1),
+                  errors::InvalidArgument(
+                      "Current implementation does not yet support "
+                      "dilations in the batch and depth dimensions."));
+      OP_REQUIRES(
+          context, dilation_h > 0 && dilation_w > 0,
+          errors::InvalidArgument("Dilated rates should be larger than 0."));
+    }
+
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
   }
 
@@ -397,8 +543,7 @@
   TensorFormat data_format_;  // NCHW or NHWC
 };
 
-#endif  // INTEL_MKL_ML
-
+#endif  // INTEL_MKL_ML_ONLY
 
 /////////////////////////////////////////////////////////////////////
 ///  Dummy Mkl op that is just used for operators that are intermediate
diff --git a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
index 0149e78..2ec6c8f 100644
--- a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
+++ b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
@@ -21,8 +21,7 @@
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/util/tensor_format.h"
 
-
-#ifndef INTEL_MKL_ML
+#ifndef INTEL_MKL_ML_ONLY
 #include "mkldnn.hpp"
 using mkldnn::batch_normalization_backward;
 using mkldnn::batch_normalization_forward;
@@ -41,7 +40,7 @@
 namespace tensorflow {
 using CPUDevice = Eigen::ThreadPoolDevice;
 
-#ifdef INTEL_MKL_ML
+#ifdef INTEL_MKL_ML_ONLY
 
 template <typename Device, typename T>
 class MklFusedBatchNormOp : public OpKernel {
@@ -684,7 +683,7 @@
 };
 #endif
 
-#ifndef INTEL_MKL_ML
+#ifndef INTEL_MKL_ML_ONLY
 
 struct MklBatchNormFwdParams {
   memory::dims src_dims;
@@ -899,8 +898,8 @@
   MklFusedBatchNormFwdPrimitiveFactory() {}
   ~MklFusedBatchNormFwdPrimitiveFactory() {}
 
-  static std::string CreateKey(const MklBatchNormFwdParams& fwdParams) {
-    std::string prefix = "bn_fwd";
+  static string CreateKey(const MklBatchNormFwdParams& fwdParams) {
+    string prefix = "bn_fwd";
     FactoryKeyCreator key_creator;
     key_creator.AddAsKey(prefix);
     key_creator.AddAsKey(fwdParams.src_dims);
@@ -911,13 +910,13 @@
   }
 
   MklPrimitive* GetBatchNormFwd(const MklBatchNormFwdParams& fwdParams) {
-    std::string key = CreateKey(fwdParams);
+    string key = CreateKey(fwdParams);
     return this->GetOp(key);
   }
 
   void SetBatchNormFwd(const MklBatchNormFwdParams& fwdParams,
                        MklPrimitive* op) {
-    std::string key = CreateKey(fwdParams);
+    string key = CreateKey(fwdParams);
     this->SetOp(key, op);
   }
 };
@@ -1122,8 +1121,8 @@
   MklFusedBatchNormBwdPrimitiveFactory() {}
   ~MklFusedBatchNormBwdPrimitiveFactory() {}
 
-  static std::string CreateKey(const MklBatchNormBwdParams& bwdParams) {
-    std::string prefix = "bn_bwd";
+  static string CreateKey(const MklBatchNormBwdParams& bwdParams) {
+    string prefix = "bn_bwd";
     FactoryKeyCreator key_creator;
     key_creator.AddAsKey(prefix);
     key_creator.AddAsKey(bwdParams.src_dims);
@@ -1135,13 +1134,13 @@
   }
 
   MklPrimitive* GetBatchNormBwd(const MklBatchNormBwdParams& bwdParams) {
-    std::string key = CreateKey(bwdParams);
+    string key = CreateKey(bwdParams);
     return this->GetOp(key);
   }
 
   void SetBatchNormBwd(const MklBatchNormBwdParams& bwdParams,
                        MklPrimitive* op) {
-    std::string key = CreateKey(bwdParams);
+    string key = CreateKey(bwdParams);
     this->SetOp(key, op);
   }
 };
diff --git a/tensorflow/core/kernels/mkl_identity_op.cc b/tensorflow/core/kernels/mkl_identity_op.cc
index b02cc53..b57e816 100644
--- a/tensorflow/core/kernels/mkl_identity_op.cc
+++ b/tensorflow/core/kernels/mkl_identity_op.cc
@@ -24,20 +24,20 @@
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
 
-#ifdef INTEL_MKL_ML
+#ifdef INTEL_MKL_ML_ONLY
 #include "mkl_dnn.h"
 #include "mkl_dnn_types.h"
 #endif
 #include "tensorflow/core/util/mkl_util.h"
 
-#ifndef INTEL_MKL_ML
+#ifndef INTEL_MKL_ML_ONLY
 #include "mkldnn.hpp"
 #endif
 
 namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
-#ifdef INTEL_MKL_ML
+#ifdef INTEL_MKL_ML_ONLY
 
 template <typename Device, typename T>
 class MklIdentityOp : public OpKernel {
diff --git a/tensorflow/core/kernels/mkl_input_conversion_op.cc b/tensorflow/core/kernels/mkl_input_conversion_op.cc
index dc4da33..06ce820 100644
--- a/tensorflow/core/kernels/mkl_input_conversion_op.cc
+++ b/tensorflow/core/kernels/mkl_input_conversion_op.cc
@@ -32,7 +32,7 @@
 #include "tensorflow/core/kernels/mkl_tfconv_op.h"
 #include "tensorflow/core/util/mkl_util.h"
 
-#ifndef INTEL_MKL_ML
+#ifndef INTEL_MKL_ML_ONLY
 #include "mkldnn.hpp"
 
 using mkldnn::stream;
@@ -60,7 +60,7 @@
 //     convert the TF format input to MKL format
 ///////////////////////////////////////////////////////////
 
-#ifdef INTEL_MKL_ML
+#ifdef INTEL_MKL_ML_ONLY
 template <typename Device, typename T>
 class MklInputConversionOp : public OpKernel {
  public:
diff --git a/tensorflow/core/kernels/mkl_lrn_op.cc b/tensorflow/core/kernels/mkl_lrn_op.cc
index 7966c27..22ff4cd 100644
--- a/tensorflow/core/kernels/mkl_lrn_op.cc
+++ b/tensorflow/core/kernels/mkl_lrn_op.cc
@@ -35,7 +35,7 @@
 #include "tensorflow/core/util/work_sharder.h"
 #endif
 
-#ifndef INTEL_MKL_ML
+#ifndef INTEL_MKL_ML_ONLY
 #include "mkldnn.hpp"
 using mkldnn::lrn_across_channels;
 using mkldnn::lrn_backward;
@@ -69,7 +69,7 @@
 
 }  // namespace
 
-#ifdef INTEL_MKL_ML
+#ifdef INTEL_MKL_ML_ONLY
 
 template <typename T>
 class MklLRNOp : public OpKernel {
@@ -1345,7 +1345,7 @@
   float beta_;
 };
 
-#endif  // INTEL_MKL_ML
+#endif  // INTEL_MKL_ML_ONLY
 
 #define REGISTER_MKL_LRN_CPU(T)                                     \
   REGISTER_KERNEL_BUILDER(Name("_MklLRN")                           \
diff --git a/tensorflow/core/kernels/mkl_matmul_op.cc b/tensorflow/core/kernels/mkl_matmul_op.cc
index 62c0404..077d62c 100644
--- a/tensorflow/core/kernels/mkl_matmul_op.cc
+++ b/tensorflow/core/kernels/mkl_matmul_op.cc
@@ -23,14 +23,20 @@
 // and when it is undefined at build time, this file becomes an empty
 // compilation unit
 
-#if defined(INTEL_MKL) && !defined(DO_NOT_USE_ML)
+#if defined(INTEL_MKL)
 
-#include "mkl_cblas.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/kernels/fill_functor.h"
 
+// This header file is part of MKL ML, need equivalent file in MKL DNN
+#ifndef INTEL_MKL_DNN_ONLY
+#include "mkl_cblas.h"
+#else
+#include "mkldnn.h"
+#endif
+
 namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
@@ -100,7 +106,6 @@
  private:
   bool transpose_a_;
   bool transpose_b_;
-
   // --------------------------------------------------------------------------
   //
   // @brief Matrix-Matrix Multiplication with FP32 tensors, a, b, c using CBLAS
@@ -150,11 +155,26 @@
     // 1.0 and 0.0 respectively.
     const float alpha = 1.0f;
     const float beta = 0.0f;
+#if defined(INTEL_MKL_DNN_ONLY)
+    const char* const ftrans[] = {"N", "T", "C"};
+    int index_transa = transa ? 1 : 0;
+    int index_transb = transb ? 1 : 0;
+    VLOG(2) << "MKL DNN SGEMM called";
+    // MKL DNN only supports the Fortran api and requires column major while
+    // Tensorflow uses row major so we reverse the order A and B
+    mkldnn_sgemm(ftrans[index_transb], ftrans[index_transa], &n, &m, &k, &alpha,
+                 b, &ldb, a, &lda, &beta, c, &ldc);
+#else
+    // MKL ML binary uses CBLAS API
     cblas_sgemm(CblasRowMajor, transa ? CblasTrans : CblasNoTrans,
                 transb ? CblasTrans : CblasNoTrans, m, n, k, alpha, a, lda, b,
                 ldb, beta, c, ldc);
+#endif
   }
 
+  // MKLDNN only supports SGEMM
+#ifndef INTEL_MKL_DNN_ONLY
+
   // Matrix-Matrix Multiplication with FP64 tensors. For detailed info about
   // parameters, look at FP32 function description.
   void MklBlasGemm(bool transa, bool transb, const int m, const int n,
@@ -197,6 +217,7 @@
                 reinterpret_cast<const MKL_Complex16*>(b), ldb, &beta,
                 reinterpret_cast<MKL_Complex16*>(c), ldc);
   }
+#endif
 };
 
 #define REGISTER_CPU(T)                                         \
@@ -207,9 +228,12 @@
 // TODO(inteltf) Consider template specialization when adding/removing
 // additional types
 TF_CALL_float(REGISTER_CPU);
+
+#ifndef INTEL_MKL_DNN_ONLY
 TF_CALL_double(REGISTER_CPU);
 TF_CALL_complex64(REGISTER_CPU);
 TF_CALL_complex128(REGISTER_CPU);
+#endif
 
 }  // namespace tensorflow
 #endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_maxpooling_op.cc b/tensorflow/core/kernels/mkl_maxpooling_op.cc
index 0a21515..e149f00 100644
--- a/tensorflow/core/kernels/mkl_maxpooling_op.cc
+++ b/tensorflow/core/kernels/mkl_maxpooling_op.cc
@@ -22,7 +22,7 @@
 #include "tensorflow/core/util/mkl_util.h"
 #include "tensorflow/core/util/padding.h"
 
-#ifndef INTEL_MKL_ML
+#ifndef INTEL_MKL_ML_ONLY
 #include <algorithm>
 #include "mkldnn.hpp"
 using mkldnn::algorithm;
@@ -40,7 +40,7 @@
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
 // MKL-DNN is now default. MKL-ML must be specified explicitly.
-#ifdef INTEL_MKL_ML
+#ifdef INTEL_MKL_ML_ONLY
 
 // An implementation of MaxPooling (forward).
 template <typename Device, typename T>
@@ -817,7 +817,7 @@
   }
 };  // MklMaxPoolingGradOp
 
-#endif  // INTEL_MKL_ML
+#endif  // INTEL_MKL_ML_ONLY
 
 REGISTER_KERNEL_BUILDER(Name("_MklMaxPool")
                             .Device(DEVICE_CPU)
diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.cc b/tensorflow/core/kernels/mkl_pooling_ops_common.cc
index 915878d..d7ad3f9 100644
--- a/tensorflow/core/kernels/mkl_pooling_ops_common.cc
+++ b/tensorflow/core/kernels/mkl_pooling_ops_common.cc
@@ -223,7 +223,7 @@
   Init(context, ksize, stride, padding, data_format);
 }
 
-#ifdef INTEL_MKL_ML
+#ifdef INTEL_MKL_ML_ONLY
 // Initialization for MKL format
 void MklPoolParameters::Init(OpKernelContext* context,
                              const std::vector<int32>& ksize,
@@ -253,7 +253,7 @@
 
   Init(context, ksize, stride, padding, data_format);
 }
-#endif  // INTEL_MKL_ML
+#endif  // INTEL_MKL_ML_ONLY
 // Common Initialization for TensorFlow and MKL formats
 void MklPoolParameters::Init(OpKernelContext* context,
                              const std::vector<int32>& ksize,
@@ -288,7 +288,7 @@
     OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose(
                                 tensor_in_cols, window_cols, col_stride,
                                 padding, &out_width, &pad_left, &pad_right));
-#ifndef INTEL_MKL_ML
+#ifndef INTEL_MKL_ML_ONLY
     // TF can work with int64, but mkldnn only supports int32
     // Fail if the height or width are greater than MAX_INT
 
diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.h b/tensorflow/core/kernels/mkl_pooling_ops_common.h
index 9c516af..ec7af50 100644
--- a/tensorflow/core/kernels/mkl_pooling_ops_common.h
+++ b/tensorflow/core/kernels/mkl_pooling_ops_common.h
@@ -22,7 +22,7 @@
 #include "tensorflow/core/util/mkl_util.h"
 #include "tensorflow/core/util/padding.h"
 
-#ifndef INTEL_MKL_ML
+#ifndef INTEL_MKL_ML_ONLY
 #include "mkldnn.hpp"
 using mkldnn::memory;
 using mkldnn::pooling_backward;
@@ -175,8 +175,8 @@
   // primitive op from reuse perspective.
   // A pooling key is a string which concates key parameters
   // as well as algorithm kind (max versus avg).
-  static std::string CreateKey(const MklPoolingParams& fwdParams) {
-    std::string prefix = "pooling_fwd";
+  static string CreateKey(const MklPoolingParams& fwdParams) {
+    string prefix = "pooling_fwd";
     FactoryKeyCreator key_creator;
     key_creator.AddAsKey(prefix);
     key_creator.AddAsKey(fwdParams.src_dims);
@@ -190,12 +190,12 @@
   }
 
   MklPrimitive* GetPoolingFwd(const MklPoolingParams& fwdParams) {
-    std::string key = CreateKey(fwdParams);
+    string key = CreateKey(fwdParams);
     return this->GetOp(key);
   }
 
   void SetPoolingFwd(const MklPoolingParams& fwdParams, MklPrimitive* op) {
-    std::string key = CreateKey(fwdParams);
+    string key = CreateKey(fwdParams);
     this->SetOp(key, op);
   }
 };
@@ -326,8 +326,8 @@
   // primitive op from reuse perspective.
   // A pooling key is a string which concates key parameters
   // as well as algorithm kind (max versus avg).
-  static std::string CreateKey(const MklPoolingParams& bwdParams) {
-    std::string prefix = "pooling_bwd";
+  static string CreateKey(const MklPoolingParams& bwdParams) {
+    string prefix = "pooling_bwd";
     FactoryKeyCreator key_creator;
     key_creator.AddAsKey(prefix);
     key_creator.AddAsKey(bwdParams.src_dims);
@@ -341,12 +341,12 @@
   }
 
   MklPrimitive* GetPoolingBwd(const MklPoolingParams& bwdParams) {
-    std::string key = CreateKey(bwdParams);
+    string key = CreateKey(bwdParams);
     return this->GetOp(key);
   }
 
   void SetPoolingBwd(const MklPoolingParams& bwdParams, MklPrimitive* op) {
-    std::string key = CreateKey(bwdParams);
+    string key = CreateKey(bwdParams);
     this->SetOp(key, op);
   }
 };
@@ -405,7 +405,7 @@
   void Init(OpKernelContext* context, const std::vector<int32>& ksize,
             const std::vector<int32>& stride, Padding padding,
             TensorFormat data_format, const TensorShape& tensor_in_shape);
-#ifdef INTEL_MKL_ML
+#ifdef INTEL_MKL_ML_ONLY
   void Init(OpKernelContext* context, const std::vector<int32>& ksize,
             const std::vector<int32>& stride, Padding padding,
             TensorFormat data_format, const MklShape* mkl_in_shape);
@@ -422,7 +422,7 @@
             TensorFormat data_format);
 };
 
-#ifndef INTEL_MKL_ML
+#ifndef INTEL_MKL_ML_ONLY
 
 template <class T>
 class MklPoolingOpBase : public OpKernel {
@@ -674,7 +674,7 @@
     return grad_reorder_needed ? target_diff_dst_md : original_input_grad_md;
   }
 };
-#endif  // INTEL_MKL_ML
+#endif  // INTEL_MKL_ML_ONLY
 
 //-------------------------------------------------------------------
 // Utility functions
diff --git a/tensorflow/core/kernels/mkl_relu_op.cc b/tensorflow/core/kernels/mkl_relu_op.cc
index 78abbdb..0503489 100644
--- a/tensorflow/core/kernels/mkl_relu_op.cc
+++ b/tensorflow/core/kernels/mkl_relu_op.cc
@@ -23,8 +23,7 @@
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/errors.h"
 
-
-#ifndef INTEL_MKL_ML
+#ifndef INTEL_MKL_ML_ONLY
 #include "mkldnn.hpp"
 
 using mkldnn::algorithm;
@@ -58,7 +57,7 @@
   }
 };
 
-#ifdef INTEL_MKL_ML
+#ifdef INTEL_MKL_ML_ONLY
 
 template <typename Device, typename T>
 class MklReluOp : public OpKernel {
@@ -368,10 +367,7 @@
   mkl_context.MklCleanup();
 }
 
-
-
-#else  // INTEL_MKL_ML
-
+#else  // INTEL_MKL_ML_ONLY
 
 template <typename Device, typename T, algorithm alg_kind>
 class MklReluOpBase : public OpKernel {
@@ -874,7 +870,7 @@
                           MklReluGradOp<CPUDevice, type>);
 TF_CALL_float(REGISTER_RELU_MKL_SUPPORTED_KERNELS_TYPES);
 
-#ifndef INTEL_MKL_ML
+#ifndef INTEL_MKL_ML_ONLY
 
 // register dnn kernels for supported operations and supported types
 #define REGISTER_ELU_MKL_SUPPORTED_KERNELS_TYPES(type)              \
diff --git a/tensorflow/core/kernels/mkl_reshape_op.cc b/tensorflow/core/kernels/mkl_reshape_op.cc
index 9c536df..d9a7893 100644
--- a/tensorflow/core/kernels/mkl_reshape_op.cc
+++ b/tensorflow/core/kernels/mkl_reshape_op.cc
@@ -24,8 +24,7 @@
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
 
-
-#ifndef INTEL_MKL_ML
+#ifndef INTEL_MKL_ML_ONLY
 #include "mkldnn.hpp"
 using mkldnn::stream;
 #else
@@ -42,7 +41,7 @@
  public:
   explicit MklReshapeOp(OpKernelConstruction* context) : OpKernel(context) {}
 
-#ifdef INTEL_MKL_ML
+#ifdef INTEL_MKL_ML_ONLY
   void Compute(OpKernelContext* context) override {
     const Tensor& input = MklGetInput(context, 0);
     const Tensor& sizes = MklGetInput(context, 1);
@@ -317,7 +316,7 @@
     }
   }
 
-#endif  // INTEL_MKL_ML
+#endif  // INTEL_MKL_ML_ONLY
 
  private:
   const int kInputSlotIdx = 0;
diff --git a/tensorflow/core/kernels/mkl_softmax_op.cc b/tensorflow/core/kernels/mkl_softmax_op.cc
index 6383929..8bde966 100644
--- a/tensorflow/core/kernels/mkl_softmax_op.cc
+++ b/tensorflow/core/kernels/mkl_softmax_op.cc
@@ -15,7 +15,7 @@
 
 // See docs in ../ops/nn_ops.cc.
 #ifdef INTEL_MKL
-#ifndef INTEL_MKL_ML
+#ifndef INTEL_MKL_ML_ONLY
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/numeric_op.h"
@@ -153,5 +153,5 @@
 
 }  // namespace tensorflow
 
-#endif  // INTEL_MKL_ML
+#endif  // INTEL_MKL_ML_ONLY
 #endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_tfconv_op.h b/tensorflow/core/kernels/mkl_tfconv_op.h
index f4f0035..894c2e3 100644
--- a/tensorflow/core/kernels/mkl_tfconv_op.h
+++ b/tensorflow/core/kernels/mkl_tfconv_op.h
@@ -32,13 +32,13 @@
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/util/tensor_format.h"
 
-#ifdef INTEL_MKL_ML
+#ifdef INTEL_MKL_ML_ONLY
 #include "mkl_dnn.h"
 #include "mkl_dnn_types.h"
 #endif
 #include "tensorflow/core/util/mkl_util.h"
 
-#ifndef INTEL_MKL_ML
+#ifndef INTEL_MKL_ML_ONLY
 using mkldnn::stream;
 #endif
 
@@ -64,7 +64,7 @@
     VLOG(1) << "MKLToTFConversion complete successfully.";
   }
 
-#ifndef INTEL_MKL_ML
+#ifndef INTEL_MKL_ML_ONLY
   static void ConvertMklToTf(OpKernel* op_kernel, OpKernelContext* context,
                              string data_format_str, DataType op_data_type,
                              bool has_avx512f, uint input_number) {
@@ -118,12 +118,11 @@
         CHECK(output_tensor->CopyFrom(input_tensor, output_shape));
       }
     } catch (mkldnn::error& e) {
-      string error_msg = "Status: " + std::to_string(e.status) +
-                         ", message: " + std::string(e.message) + ", in file " +
-                         std::string(__FILE__) + ":" + std::to_string(__LINE__);
       OP_REQUIRES_OK(
           context,
-          errors::Aborted("Operation received an exception:", error_msg));
+          errors::Aborted("Operation received an exception: Status: ", e.status,
+                          ", message: ", StringPiece(e.message), ", in file ",
+                          __FILE__, ":", __LINE__));
     }
   }
 #else
diff --git a/tensorflow/core/kernels/mkl_transpose_op.cc b/tensorflow/core/kernels/mkl_transpose_op.cc
index b180c2f..6bbe271 100644
--- a/tensorflow/core/kernels/mkl_transpose_op.cc
+++ b/tensorflow/core/kernels/mkl_transpose_op.cc
@@ -15,13 +15,23 @@
 
 // See docs in ../ops/array_ops.cc.
 
-#if defined(INTEL_MKL) && !defined(DO_NOT_USE_ML)
+#if defined(INTEL_MKL)
 #define EIGEN_USE_THREADS
 
+#if !defined(INTEL_MKL_DNN_ONLY)
 #include "mkl_trans.h"
+#endif
+
 #include "tensorflow/core/kernels/transpose_functor.h"
 #include "tensorflow/core/kernels/transpose_op.h"
 
+#ifndef INTEL_MKL_ML_ONLY
+#include "mkldnn.hpp"
+#include "tensorflow/core/util/mkl_util.h"
+
+using mkldnn::stream;
+#endif
+
 namespace tensorflow {
 
 // output = TransposeOp(T<any> input, T<int32> perm) takes a tensor
@@ -40,6 +50,7 @@
 // REQUIRES: perm is a permutation.
 
 namespace {
+#if !defined(INTEL_MKL_DNN_ONLY)
 template <typename T>
 Status MKLTranspose2D(const char trans, const Tensor& in, Tensor* out);
 
@@ -93,11 +104,64 @@
 static const char kMKLTranspose = 'T';
 static const char kMKLConjugateTranspose = 'C';
 
+#endif  // if !defined(INTEL_MKL_DNN_ONLY)
+
+#ifndef INTEL_MKL_ML_ONLY
+// MKL-DNN based Transpose implementation
+template <typename T>
+Status MKLTransposeND(OpKernelContext* ctx, const Tensor& in, Tensor* out,
+                      const gtl::ArraySlice<int32>& perm);
+
+static inline memory::dims ReorderStrides(const memory::dims& strides,
+                                          const gtl::ArraySlice<int32>& perm) {
+  memory::dims reordered_strides;
+  reordered_strides.resize(strides.size());
+  for (size_t i = 0; i < strides.size(); ++i) {
+    reordered_strides[perm[i]] = strides[i];
+  }
+  return reordered_strides;
+}
+
+// Transpose of N-dimensional tensor using MKL-DNN
+template <typename T>
+Status MKLTransposeND(OpKernelContext* context, const Tensor& in_tensor,
+                      Tensor* out_tensor, const gtl::ArraySlice<int32>& perm) {
+  try {
+    engine cpu_engine = engine(engine::cpu, 0);
+    MklDnnData<T> in(&cpu_engine);
+    MklDnnData<T> out(&cpu_engine);
+
+    memory::dims in_dims = TFShapeToMklDnnDims(in_tensor.shape());
+    memory::dims out_dims = TFShapeToMklDnnDims(out_tensor->shape());
+    memory::dims in_strides = CalculateTFStrides(in_dims);
+    // Reorder output strides based on permutation requested.
+    memory::dims out_strides =
+        ReorderStrides(CalculateTFStrides(out_dims), perm);
+
+    in.SetUsrMem(in_dims, in_strides, &in_tensor);
+    // Output dimensions are same as input dimensions. We adjust the layout
+    // using strides.
+    out.SetUsrMem(in_dims, out_strides, out_tensor);
+
+    std::vector<primitive> net;
+    net.push_back(in.CreateReorder(in.GetUsrMem(), out.GetUsrMem()));
+    stream(stream::kind::eager).submit(net).wait();
+    return Status::OK();
+  } catch (mkldnn::error& e) {
+    string error_msg = "Status: " + std::to_string(e.status) +
+                       ", message: " + std::string(e.message) + ", in file " +
+                       std::string(__FILE__) + ":" + std::to_string(__LINE__);
+    return errors::Aborted("Operation received an exception:", error_msg);
+  }
+}
+#endif  // #ifndef INTEL_MKL_ML_ONLY
+
 }  // namespace
 
 Status MklTransposeCpuOp::DoTranspose(OpKernelContext* ctx, const Tensor& in,
                                       gtl::ArraySlice<int32> perm,
                                       Tensor* out) {
+#if !defined(INTEL_MKL_DNN_ONLY)
   if (in.dims() == 2) {
     if (perm[0] == 0 && perm[1] == 1) {
       return Status::OK();
@@ -115,7 +179,24 @@
         break;
     }
   }
-  // Fallback to eigen if transpose parameters not supported by MKL
+#endif
+
+#ifndef INTEL_MKL_ML_ONLY
+  // MKL-DNN has limit on the maximum number of dimensions in a tensor.
+  // Fallback to Eigen for not supported cases.
+  if (in.dims() <= TENSOR_MAX_DIMS) {
+    switch (in.dtype()) {
+      case DT_FLOAT:
+        return MKLTransposeND<float>(ctx, in, out, perm);
+        break;
+      // TODO(nhasabni): support other types such as INT8.
+      default:
+        break;
+    }
+  }
+#endif
+
+  // Fallback to eigen if transpose parameters not supported by MKL or MKL-DNN
   typedef Eigen::ThreadPoolDevice CPUDevice;
   return ::tensorflow::DoTranspose(ctx->eigen_device<CPUDevice>(), in, perm,
                                    out);
@@ -125,6 +206,7 @@
                                                const Tensor& in,
                                                gtl::ArraySlice<int32> perm,
                                                Tensor* out) {
+#if !defined(INTEL_MKL_DNN_ONLY)
   if (in.dims() == 2 && perm[0] == 1 && perm[1] == 0) {
     // TODO(rmlarsen): By setting lda and ldb, we could use the MKL kernels
     // for any transpose that can be reduced to swapping the last two
@@ -143,7 +225,24 @@
         break;
     }
   }
-  // Fallback to eigen if transpose parameters not supported by MKL
+#endif
+
+#ifndef INTEL_MKL_ML_ONLY
+  // MKL-DNN has limit on the maximum number of dimensions in a tensor.
+  // Fallback to Eigen for not supported cases.
+  if (in.dims() <= TENSOR_MAX_DIMS) {
+    switch (in.dtype()) {
+      case DT_FLOAT:
+        return MKLTransposeND<float>(ctx, in, out, perm);
+        break;
+      // TODO(nhasabni): support other types such as INT8.
+      default:
+        break;
+    }
+  }
+#endif
+
+  // Fallback to eigen if transpose parameters not supported by MKL or MKL-DNN
   typedef Eigen::ThreadPoolDevice CPUDevice;
   return ::tensorflow::DoConjugateTranspose(ctx->eigen_device<CPUDevice>(), in,
                                             perm, out);
diff --git a/tensorflow/core/kernels/non_max_suppression_op.cc b/tensorflow/core/kernels/non_max_suppression_op.cc
index c7d0d4d..5d9257e 100644
--- a/tensorflow/core/kernels/non_max_suppression_op.cc
+++ b/tensorflow/core/kernels/non_max_suppression_op.cc
@@ -126,7 +126,7 @@
     const Tensor& max_output_size, const float score_threshold,
     const std::function<bool(int, int)>& suppress_check_fn,
     bool pad_to_max_output_size = false, int* ptr_num_valid_outputs = nullptr) {
-  const int output_size = std::min(max_output_size.scalar<int>()(), num_boxes);
+  const int output_size = max_output_size.scalar<int>()();
 
   std::vector<float> scores_data(num_boxes);
   std::copy_n(scores.flat<float>().data(), num_boxes, scores_data.begin());
diff --git a/tensorflow/core/kernels/padding_fifo_queue.cc b/tensorflow/core/kernels/padding_fifo_queue.cc
index ff553f1..a600d32 100644
--- a/tensorflow/core/kernels/padding_fifo_queue.cc
+++ b/tensorflow/core/kernels/padding_fifo_queue.cc
@@ -347,7 +347,7 @@
     default:
       return errors::Unimplemented(
           "HandleElementToLargerSliceWithRank Unhandled data type: ",
-          element.dtype());
+          DataTypeString(element.dtype()));
   }
 }
 
@@ -392,7 +392,7 @@
   TF_CALL_ALL_TYPES(HANDLE_TYPE);
 #undef HANDLE_TYPE
   return errors::Unimplemented("SetElementZero Unhandled data type: ",
-                               element->dtype());
+                               DataTypeString(element->dtype()));
 }
 
 std::vector<TensorShape> PaddingFIFOQueue::ConvertShapesPartialDimensionsToZero(
diff --git a/tensorflow/core/kernels/partitioned_function_ops.cc b/tensorflow/core/kernels/partitioned_function_ops.cc
index a7a9609..8db78f9 100644
--- a/tensorflow/core/kernels/partitioned_function_ops.cc
+++ b/tensorflow/core/kernels/partitioned_function_ops.cc
@@ -98,7 +98,8 @@
                           done);
         auto graph = tensorflow::MakeUnique<Graph>(fbody->graph->flib_def());
         CopyGraph(*fbody->graph, graph.get());
-        OP_REQUIRES_OK_ASYNC(ctx, PinResourceArgs(graph.get(), args), done);
+        OP_REQUIRES_OK_ASYNC(ctx, PropagateInheritedDevices(graph.get(), args),
+                             done);
 
         DeviceSet device_set;
         for (auto d : lib->device_mgr()->ListDevices()) {
@@ -114,8 +115,16 @@
 
         // The FunctionLibraryRuntime's library cannot be mutated from within
         // an OpKernel, so functions are instantiated in an overlay library.
-        overlay_lib_.reset(new FunctionLibraryDefinition(
-            *lib->GetFunctionLibraryDefinition()));
+        OP_REQUIRES_ASYNC(
+            ctx, overlay_libs_.find(lib) == overlay_libs_.end(),
+            errors::Internal("Found an overlay library but did not "
+                             "find cached function partitions; "
+                             "this indicates a bug."),
+            done);
+        FunctionLibraryDefinition* overlay_lib =
+            new FunctionLibraryDefinition(*lib->GetFunctionLibraryDefinition());
+        overlay_libs_.emplace(lib, overlay_lib);
+
         auto handles = tensorflow::MakeUnique<gtl::FlatMap<string, FHandle>>();
         for (const auto& pair : subgraphs) {
           // TODO(akshayka): Fail gracefully if the set of devices corresponds
@@ -125,13 +134,13 @@
           OP_REQUIRES_OK_ASYNC(
               ctx, UpdateArgAndRetMetadata(target, subgraph.get()), done);
           FunctionDef shard;
-          string unique_name = UniquifyFunctionName(func_.name());
+          string unique_name = UniquifyFunctionName(overlay_lib, func_.name());
           OP_REQUIRES_OK_ASYNC(
               ctx, GraphToFunctionDef(*subgraph, unique_name, &shard), done);
-          OP_REQUIRES_OK_ASYNC(ctx, overlay_lib_->AddFunctionDef(shard), done);
+          OP_REQUIRES_OK_ASYNC(ctx, overlay_lib->AddFunctionDef(shard), done);
           FunctionLibraryRuntime::InstantiateOptions opts;
           opts.target = target;
-          opts.overlay_lib = overlay_lib_.get();
+          opts.overlay_lib = overlay_lib;
           FHandle handle;
           OP_REQUIRES_OK_ASYNC(
               ctx,
@@ -154,10 +163,15 @@
                     std::vector<AllocatorAttributes>>
       ArgAndRetAllocAttrs;
 
+  // Propagates device annotations from the outer graph to the function body.
+  //
   // Pins each arg that emits a `DT_RESOURCE` tensor to the device on which the
   // corresponding resource lives. This ensures that the Placer assigns ops that
-  // access these resources to the appropriate devices.
-  Status PinResourceArgs(Graph* graph, const OpInputList& args) {
+  // access these resources to the appropriate devices. Additionally, places
+  // nodes that are unadorned with device annotations onto PartitiondCallOp's
+  // device. This lets call-site device annotations influence the execution
+  // of the function.
+  Status PropagateInheritedDevices(Graph* graph, const OpInputList& args) {
     for (Node* node : graph->op_nodes()) {
       string node_type = node->type_string();
       if (node_type == FunctionLibraryDefinition::kArgOp) {
@@ -170,6 +184,18 @@
           ResourceHandle handle = args[index].flat<ResourceHandle>()(0);
           node->set_assigned_device_name(handle.device());
         }
+      } else if (node_type != FunctionLibraryDefinition::kRetOp) {
+        // All non-RetVal nodes that weren't explicitly placed by the user
+        // inherit PartitionedCallOp's device. RetVal placement is inferred by
+        // the placer, to avoid forcing the function's outputs through a single
+        // device.
+        //
+        // TODO(b/112166045): Plumb the original requested device into this
+        // OpKernel (this->requested_device() isn't reliable), and merge it
+        // with node->requested_device() if possible.
+        if (node->requested_device().empty()) {
+          node->set_requested_device(local_device_name_);
+        }
       }
     }
     return Status::OK();
@@ -235,12 +261,6 @@
   //      device, and
   //  (3) records which `Arg` and `Retval` nodes live in host memory.
   Status UpdateArgAndRetMetadata(const string& device, Graph* subgraph) {
-    if (arg_and_ret_indices_.find(device) != arg_and_ret_indices_.end()) {
-      // This function has already been partitioned, albeit for a different
-      // function library.
-      return Status::OK();
-    }
-
     ArgAndRetIndices indices;
     std::vector<int>* arg_indices = &indices.first;
     std::vector<int>* ret_indices = &indices.second;
@@ -248,6 +268,8 @@
     std::vector<std::pair<Node*, int>> ret_nodes;
     const AttrValue* attr_value;
 
+    // Find the Arg and Retval nodes, along with their corresponding indices
+    // in the original function.
     for (Node* node : subgraph->op_nodes()) {
       string node_type = node->type_string();
       if (node_type == FunctionLibraryDefinition::kArgOp) {
@@ -263,6 +285,8 @@
       }
     }
 
+    // Rewrite the indices of the Arg and Retval nodes for this function
+    // to range from 0 to the number of Arg nodes, Retval nodes, respectively.
     auto sort_by_index = [](std::pair<Node*, int> one,
                             std::pair<Node*, int> two) -> bool {
       return one.second < two.second;
@@ -292,7 +316,12 @@
       arg_and_ret_alloc_attrs_[device].second.push_back(alloc_attr);
     }
 
-    arg_and_ret_indices_.emplace(device, indices);
+    // If this kernel execution corresponds to a StatefulPartitionedCallOp,
+    // `arg_and_ret_indices_` might have been populated by a previous
+    // invocation.
+    if (arg_and_ret_indices_.find(device) == arg_and_ret_indices_.end()) {
+      arg_and_ret_indices_.emplace(device, indices);
+    }
     return Status::OK();
   }
 
@@ -399,10 +428,11 @@
     }
   }
 
-  string UniquifyFunctionName(const string& name) {
+  string UniquifyFunctionName(const FunctionLibraryDefinition* function_library,
+                              const string& name) {
     for (;; ++suffix_) {
       const string candidate = strings::StrCat(name, "_", suffix_);
-      if (overlay_lib_->Find(candidate) == nullptr) {
+      if (function_library->Find(candidate) == nullptr) {
         return candidate;
       }
     }
@@ -410,14 +440,16 @@
 
   NameAttrList func_;
   string local_device_name_;
-  // Function shards are added to `overlay_lib_`.
-  std::unique_ptr<FunctionLibraryDefinition> overlay_lib_;
-  // Contains maps from device names to handles of function shards, keyed by
+  // Contains maps from device names to handles of function partitions, keyed by
   // FunctionLibraryRuntime pointers. (Because this kernel may be instantiated
   // for a stateful op, different invocations of it may use different FLRs.)
   gtl::FlatMap<FunctionLibraryRuntime*,
                std::unique_ptr<gtl::FlatMap<string, FHandle>>>
       function_handles_ GUARDED_BY(mu_);
+  // Function partitions are added to overlay libraries.
+  gtl::FlatMap<FunctionLibraryRuntime*,
+               std::unique_ptr<FunctionLibraryDefinition>>
+      overlay_libs_ GUARDED_BY(mu_);
   // Map from device name to the indices of the arguments and return values
   // placed on that device. Read-only after the first invocation.
   gtl::FlatMap<string, ArgAndRetIndices> arg_and_ret_indices_;
@@ -427,7 +459,7 @@
 
   mutex mu_;
 
-  // Used to uniquify function names in `overlay_lib_`.
+  // Used to uniquify function names in `overlay_libs_`.
   uint32 suffix_ = 0;
 };
 REGISTER_KERNEL_BUILDER(Name("PartitionedCall").Device(DEVICE_CPU),
diff --git a/tensorflow/core/kernels/pooling_ops_3d_gpu.h b/tensorflow/core/kernels/pooling_ops_3d_gpu.h
index 350b1b6..2c36814 100644
--- a/tensorflow/core/kernels/pooling_ops_3d_gpu.h
+++ b/tensorflow/core/kernels/pooling_ops_3d_gpu.h
@@ -17,8 +17,8 @@
 #error This file must only be included when building with Cuda support
 #endif
 
-#ifndef TENSORFLOW_CORE_KERNELS_POOLING_OP_3D_GPU_H_
-#define TENSORFLOW_CORE_KERNELS_POOLING_OP_3D_GPU_H_
+#ifndef TENSORFLOW_CORE_KERNELS_POOLING_OPS_3D_GPU_H_
+#define TENSORFLOW_CORE_KERNELS_POOLING_OPS_3D_GPU_H_
 
 #define EIGEN_USE_GPU
 
@@ -45,4 +45,4 @@
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_KERNELS_POOLING_OP_3D_H_
+#endif  // TENSORFLOW_CORE_KERNELS_POOLING_OPS_3D_GPU_H_
diff --git a/tensorflow/core/kernels/qr_op_impl.h b/tensorflow/core/kernels/qr_op_impl.h
index 0552c03..535df9d 100644
--- a/tensorflow/core/kernels/qr_op_impl.h
+++ b/tensorflow/core/kernels/qr_op_impl.h
@@ -13,6 +13,9 @@
 limitations under the License.
 ==============================================================================*/
 
+#ifndef TENSORFLOW_CORE_KERNELS_QR_OP_IMPL_H_
+#define TENSORFLOW_CORE_KERNELS_QR_OP_IMPL_H_
+
 // See docs in ../ops/linalg_ops.cc.
 //
 // This header file is used by the individual qr_*op*.cc files for registering
@@ -292,6 +295,8 @@
   TF_DISALLOW_COPY_AND_ASSIGN(QrOpGpu);
 };
 
-#endif
+#endif  // GOOGLE_CUDA
 
 }  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_QR_OP_IMPL_H_
diff --git a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
index 9af4cc2..88b3c2a 100644
--- a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
+++ b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
@@ -13,6 +13,9 @@
 limitations under the License.
 ==============================================================================*/
 
+#ifndef TENSORFLOW_CORE_KERNELS_REDUCTION_GPU_KERNELS_CU_H_
+#define TENSORFLOW_CORE_KERNELS_REDUCTION_GPU_KERNELS_CU_H_
+
 #if GOOGLE_CUDA
 
 #define EIGEN_USE_GPU
@@ -1058,4 +1061,6 @@
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif
+#endif  // GOOGLE_CUDA
+
+#endif  // TENSORFLOW_CORE_KERNELS_REDUCTION_GPU_KERNELS_CU_H_
diff --git a/tensorflow/core/kernels/resource_variable_ops.cc b/tensorflow/core/kernels/resource_variable_ops.cc
index cab9eb7..ebcfb67 100644
--- a/tensorflow/core/kernels/resource_variable_ops.cc
+++ b/tensorflow/core/kernels/resource_variable_ops.cc
@@ -211,7 +211,8 @@
     OP_REQUIRES(context, dtype_ == context->input(1).dtype(),
                 errors::InvalidArgument(
                     "Variable and value dtypes don't match; respectively, ",
-                    dtype_, " and ", context->input(1).dtype()));
+                    DataTypeString(dtype_), " and ",
+                    DataTypeString(context->input(1).dtype())));
     Var* variable = nullptr;
     const Tensor& value = context->input(1);
     // Note: every resource-variable-manipulating op assumes copy-on-write
@@ -231,12 +232,12 @@
                                   return Status::OK();
                                 }));
     core::ScopedUnref s(variable);
+    mutex_lock ml(*variable->mu());
     OP_REQUIRES(context, variable->tensor()->dtype() == dtype_,
                 errors::InvalidArgument(
                     "Trying to assign variable with wrong dtype. Expected ",
                     DataTypeString(variable->tensor()->dtype()), " got ",
                     DataTypeString(dtype_)));
-    mutex_lock ml(*variable->mu());
     variable->is_initialized = true;
     *variable->tensor() = value;
   }
@@ -267,11 +268,6 @@
                                   return Status::OK();
                                 }));
     core::ScopedUnref s(variable);
-    OP_REQUIRES(context, variable->tensor()->dtype() == DT_VARIANT,
-                errors::InvalidArgument(
-                    "Trying to assign variable with wrong dtype. Expected ",
-                    DataTypeString(variable->tensor()->dtype()), " got ",
-                    DataTypeString(DT_VARIANT)));
 
     // For purposes of forwarding DT_VARIANT, we want the least
     // restrictive attr; we already know the input is on host.
@@ -292,6 +288,11 @@
         attr);
 
     mutex_lock ml(*variable->mu());
+    OP_REQUIRES(context, variable->tensor()->dtype() == DT_VARIANT,
+                errors::InvalidArgument(
+                    "Trying to assign variable with wrong dtype. Expected ",
+                    DataTypeString(variable->tensor()->dtype()), " got ",
+                    DataTypeString(DT_VARIANT)));
     variable->is_initialized = true;
     *variable->tensor() = Tensor(DT_VARIANT, value.shape());
 
diff --git a/tensorflow/core/kernels/save_restore_tensor.cc b/tensorflow/core/kernels/save_restore_tensor.cc
index 7930ce4..e335e38 100644
--- a/tensorflow/core/kernels/save_restore_tensor.cc
+++ b/tensorflow/core/kernels/save_restore_tensor.cc
@@ -25,6 +25,7 @@
 #include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
@@ -96,7 +97,7 @@
               return tensor_names_flat(a) < tensor_names_flat(b);
             });
 
-  for (size_t i : sorted_name_idx) {
+  for (const size_t i : sorted_name_idx) {
     const string& name = tensor_names_flat(i);
     const Tensor& input = context->input(i + kFixedInputs);
     TensorShape shape(input.shape());
@@ -333,6 +334,26 @@
   BundleReader default_reader(Env::Default(), prefix_string);
   TF_RETURN_IF_ERROR(default_reader.status());
 
+  std::vector<string> mismatched_errors;
+  for (const size_t i : sorted_name_idx) {
+    TensorShape restored_full_shape;
+    DataType original_dtype;
+    const string& tensor_name = tensor_names_flat(i);
+    TF_RETURN_IF_ERROR(default_reader.LookupDtypeAndShape(
+        tensor_name, &original_dtype, &restored_full_shape));
+    if (dtypes[i] != original_dtype) {
+      string error_msg = strings::StrCat(
+          "tensor_name = ", tensor_name, "; expected dtype ",
+          DataTypeString(dtypes[i]), " does not equal original dtype ",
+          DataTypeString(original_dtype));
+      mismatched_errors.emplace_back(error_msg);
+    }
+  }
+  if (!mismatched_errors.empty()) {
+    const string error_msg = str_util::Join(mismatched_errors, "\n");
+    return errors::InvalidArgument(error_msg);
+  }
+
   for (auto i : sorted_name_idx) {
     const string& tensor_name = tensor_names_flat(i);
     const string& shape_and_slice = shape_and_slices_flat(i);
diff --git a/tensorflow/core/kernels/scoped_allocator_ops.cc b/tensorflow/core/kernels/scoped_allocator_ops.cc
index 1d2fb69..69e754f 100644
--- a/tensorflow/core/kernels/scoped_allocator_ops.cc
+++ b/tensorflow/core/kernels/scoped_allocator_ops.cc
@@ -104,10 +104,11 @@
   void Compute(OpKernelContext* context) override {
     const Tensor& backing_tensor = context->input(0);
     // Check that type matches.
-    OP_REQUIRES(
-        context, backing_tensor.dtype() == dtype_,
-        errors::InvalidArgument("Backing tensor type ", backing_tensor.dtype(),
-                                " does not match expected type ", dtype_));
+    OP_REQUIRES(context, backing_tensor.dtype() == dtype_,
+                errors::InvalidArgument("Backing tensor type ",
+                                        DataTypeString(backing_tensor.dtype()),
+                                        " does not match expected type ",
+                                        DataTypeString(dtype_)));
     // Check that backing tensor is at least as large as the shape of the
     // output.
     OP_REQUIRES(context, backing_tensor.NumElements() >= shape_.num_elements(),
@@ -182,10 +183,11 @@
   void Compute(OpKernelContext* context) override {
     Tensor backing_copy(context->input(0));
     // Check that type matches.
-    OP_REQUIRES(
-        context, backing_copy.dtype() == dtype_,
-        errors::InvalidArgument("Backing tensor type ", backing_copy.dtype(),
-                                " does not match expected type ", dtype_));
+    OP_REQUIRES(context, backing_copy.dtype() == dtype_,
+                errors::InvalidArgument("Backing tensor type ",
+                                        DataTypeString(backing_copy.dtype()),
+                                        " does not match expected type ",
+                                        DataTypeString(dtype_)));
     const TensorBuffer* backing_buf = DMAHelper::buffer(&backing_copy);
     const void* backing_tensor_lb = backing_buf->data();
     const void* backing_tensor_ub = static_cast<const void*>(
@@ -195,10 +197,11 @@
               << " to output " << i - 1 << " buf addr "
               << DMAHelper::base(&context->input(i));
       Tensor copy(context->input(i));
-      OP_REQUIRES(
-          context, copy.dtype() == dtype_,
-          errors::InvalidArgument("Input ", i, " tensor type ", copy.dtype(),
-                                  " does not match expected type ", dtype_));
+      OP_REQUIRES(context, copy.dtype() == dtype_,
+                  errors::InvalidArgument("Input ", i, " tensor type ",
+                                          DataTypeString(copy.dtype()),
+                                          " does not match expected type ",
+                                          DataTypeString(dtype_)));
       context->set_output(i - 1, copy);
       const TensorBuffer* input_buf = DMAHelper::buffer(&copy);
       const void* input_lb = input_buf->data();
diff --git a/tensorflow/core/kernels/self_adjoint_eig_v2_op_impl.h b/tensorflow/core/kernels/self_adjoint_eig_v2_op_impl.h
index 271dd2c..b5274f8 100644
--- a/tensorflow/core/kernels/self_adjoint_eig_v2_op_impl.h
+++ b/tensorflow/core/kernels/self_adjoint_eig_v2_op_impl.h
@@ -13,6 +13,9 @@
 limitations under the License.
 ==============================================================================*/
 
+#ifndef TENSORFLOW_CORE_KERNELS_SELF_ADJOINT_EIG_V2_OP_IMPL_H_
+#define TENSORFLOW_CORE_KERNELS_SELF_ADJOINT_EIG_V2_OP_IMPL_H_
+
 // See docs in ../ops/linalg_ops.cc.
 
 #include "third_party/eigen3/Eigen/Core"
@@ -85,3 +88,5 @@
 };
 
 }  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_SELF_ADJOINT_EIG_V2_OP_IMPL_H_
diff --git a/tensorflow/core/kernels/shape_ops.h b/tensorflow/core/kernels/shape_ops.h
index 55be308..f75723a 100644
--- a/tensorflow/core/kernels/shape_ops.h
+++ b/tensorflow/core/kernels/shape_ops.h
@@ -154,6 +154,9 @@
     OP_REQUIRES(ctx, ctx->input(0).dtype() != DT_VARIANT,
                 errors::InvalidArgument("ExpandDims on Variant not supported"));
 
+    OP_REQUIRES(
+        ctx, (ctx->input(1).NumElements() == 1),
+        errors::InvalidArgument("'dim' must be a tensor with a single value"));
     Tdim dim = ctx->input(1).flat<Tdim>()(0);
     OP_REQUIRES(
         ctx, (dim >= -1 - ctx->input(0).dims() && dim <= ctx->input(0).dims()),
@@ -236,9 +239,8 @@
         if (wrapped_squeeze_dims.count(i) > 0) {
           OP_REQUIRES(ctx, existing_dim == 1,
                       errors::InvalidArgument(
-                          "Tried to explicitly squeeze "
-                          "dimension ",
-                          i, " but dimension was not 1: ", existing_dim));
+                          "Can not squeeze dim[", i,
+                          "], expected a dimension of 1, got ", existing_dim));
         } else {
           // This dimension is not being squeezed.
           new_shape.push_back(existing_dim);
diff --git a/tensorflow/core/kernels/string_length_op.cc b/tensorflow/core/kernels/string_length_op.cc
new file mode 100644
index 0000000..a6829b2
--- /dev/null
+++ b/tensorflow/core/kernels/string_length_op.cc
@@ -0,0 +1,45 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+
+namespace tensorflow {
+namespace {
+
+class StringLengthOp : public OpKernel {
+ public:
+  using OpKernel::OpKernel;
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& input = context->input(0);
+
+    Tensor* output;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, input.shape(), &output));
+
+    auto src = input.flat<string>();
+    auto dst = output->flat<int32>();
+
+    for (int n = 0; n < src.size(); ++n) {
+      dst(n) = src(n).size();
+    }
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("StringLength").Device(DEVICE_CPU),
+                        StringLengthOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/svd_op_impl.h b/tensorflow/core/kernels/svd_op_impl.h
index a996b67..2a67700 100644
--- a/tensorflow/core/kernels/svd_op_impl.h
+++ b/tensorflow/core/kernels/svd_op_impl.h
@@ -13,6 +13,9 @@
 limitations under the License.
 ==============================================================================*/
 
+#ifndef TENSORFLOW_CORE_KERNELS_SVD_OP_IMPL_H_
+#define TENSORFLOW_CORE_KERNELS_SVD_OP_IMPL_H_
+
 // See docs in ../ops/linalg_ops.cc.
 //
 // This header file is used by the individual svd_*op*.cc files for registering
@@ -101,3 +104,5 @@
 };
 
 }  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_SVD_OP_IMPL_H_
diff --git a/tensorflow/core/kernels/tensor_array_ops.cc b/tensorflow/core/kernels/tensor_array_ops.cc
index 5aa5d20..b368ffc 100644
--- a/tensorflow/core/kernels/tensor_array_ops.cc
+++ b/tensorflow/core/kernels/tensor_array_ops.cc
@@ -40,6 +40,7 @@
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/thread_annotations.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/ptr_util.h"
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 #if GOOGLE_CUDA
@@ -683,7 +684,7 @@
         output_tensor->shaped<T, 2>({1, output_shape.num_elements()});
 
     // Insert the first value
-    input_tensors_flat.emplace_back(new ConstMatrix(
+    input_tensors_flat.push_back(MakeUnique<ConstMatrix>(
         value_0_t->shaped<T, 2>({1, value_0_t->NumElements()})));
 
     for (int i = 1; i < num_indices; ++i) {
@@ -694,8 +695,8 @@
               "TensorArray has inconsistent shapes.  Index 0 has shape: ",
               value_0_t->shape().DebugString(), " but index ", i,
               " has shape: ", value_t->shape().DebugString()));
-      input_tensors_flat.emplace_back(
-          new ConstMatrix(value_t->shaped<T, 2>({1, value_t->NumElements()})));
+      input_tensors_flat.push_back(MakeUnique<ConstMatrix>(
+          value_t->shaped<T, 2>({1, value_t->NumElements()})));
     }
 
 #if GOOGLE_CUDA
@@ -922,7 +923,7 @@
     for (size_t i = 0; i < values.size(); ++i) {
       const Tensor* value_t = value_tensors[i];
       if (value_t->NumElements() > 0) {
-        input_tensors_flat.emplace_back(new ConstMatrix(
+        input_tensors_flat.push_back(MakeUnique<ConstMatrix>(
             value_t->shaped<T, 2>({1, value_t->NumElements()})));
       }
     }
diff --git a/tensorflow/core/kernels/tile_ops.cc b/tensorflow/core/kernels/tile_ops.cc
index 68cdae3..d5d4fa8 100644
--- a/tensorflow/core/kernels/tile_ops.cc
+++ b/tensorflow/core/kernels/tile_ops.cc
@@ -31,6 +31,7 @@
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/type_index.h"
+#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/macros.h"
@@ -149,10 +150,12 @@
 #undef HANDLE_TYPE_NAME
 #undef HANDLE_TYPE
 
-    OP_REQUIRES(context, false,
-                errors::Unimplemented(
-                    "TileOp : Unhandled input dimensions, DT : ",
-                    context->input(0).dtype(), ", dims : ", input_dims));
+    OP_REQUIRES(
+        context, false,
+        errors::Unimplemented(
+            "TileOp : The input data type is not supported, DataType : ",
+            DataTypeString(context->input(0).dtype()),
+            ", Dimension : ", input_dims));
   }
 
  private:
@@ -330,9 +333,10 @@
 #undef HANDLE_DIM
 
     OP_REQUIRES(context, false,
-                errors::Unimplemented(
-                    "TileGradientOp : Unhandled input dimensions, DT : ",
-                    context->input(0).dtype(), ", dims : ", input_dims));
+                errors::Unimplemented("TileGradientOp : The input data type or "
+                                      "dimension is not supported, DataType : ",
+                                      DataTypeString(context->input(0).dtype()),
+                                      ", Dimension : ", input_dims));
   }
 
  private:
diff --git a/tensorflow/core/kernels/transpose_op.cc b/tensorflow/core/kernels/transpose_op.cc
index 886b3e7..0f0f65c 100644
--- a/tensorflow/core/kernels/transpose_op.cc
+++ b/tensorflow/core/kernels/transpose_op.cc
@@ -218,7 +218,7 @@
                                             perm, out);
 }
 
-#if defined(INTEL_MKL) && !defined(DO_NOT_USE_ML)
+#if defined(INTEL_MKL)
 #define REGISTER(T)                                   \
   REGISTER_KERNEL_BUILDER(Name("Transpose")           \
                               .Device(DEVICE_CPU)     \
diff --git a/tensorflow/core/kernels/transpose_op.h b/tensorflow/core/kernels/transpose_op.h
index 709b0a9..9e8c573 100644
--- a/tensorflow/core/kernels/transpose_op.h
+++ b/tensorflow/core/kernels/transpose_op.h
@@ -42,7 +42,7 @@
                      gtl::ArraySlice<int32> perm, Tensor* out) override;
 };
 
-#if defined(INTEL_MKL) && !defined(DO_NOT_USE_ML)
+#if defined(INTEL_MKL)
 class MklTransposeCpuOp : public TransposeOp {
  public:
   explicit MklTransposeCpuOp(OpKernelConstruction* ctx) : TransposeOp(ctx) {}
@@ -85,7 +85,7 @@
   bool IsConjugate() const override { return true; }
 };
 
-#if defined(INTEL_MKL) && !defined(DO_NOT_USE_ML)
+#if defined(INTEL_MKL)
 class MklConjugateTransposeCpuOp : public TransposeOp {
  public:
   explicit MklConjugateTransposeCpuOp(OpKernelConstruction* ctx)
diff --git a/tensorflow/core/kernels/unique_op.cc b/tensorflow/core/kernels/unique_op.cc
index 31388e4..3559baa 100644
--- a/tensorflow/core/kernels/unique_op.cc
+++ b/tensorflow/core/kernels/unique_op.cc
@@ -69,7 +69,7 @@
                      axis_tensor.dtype() == DT_INT64),
                     errors::InvalidArgument(
                         "axis tensor should be int32 or int64, but got ",
-                        axis_tensor.dtype()));
+                        DataTypeString(axis_tensor.dtype())));
         if (axis_tensor.dtype() == DT_INT32) {
           axis = internal::SubtleMustCopy(axis_tensor.scalar<int32>()());
         } else {
diff --git a/tensorflow/core/kernels/where_op_gpu.cu.h b/tensorflow/core/kernels/where_op_gpu.cu.h
index 57f5188..8879d9d 100644
--- a/tensorflow/core/kernels/where_op_gpu.cu.h
+++ b/tensorflow/core/kernels/where_op_gpu.cu.h
@@ -13,6 +13,9 @@
 limitations under the License.
 ==============================================================================*/
 
+#ifndef TENSORFLOW_CORE_KERNELS_WHERE_OP_GPU_CU_H_
+#define TENSORFLOW_CORE_KERNELS_WHERE_OP_GPU_CU_H_
+
 #if GOOGLE_CUDA
 
 #define EIGEN_USE_GPU
@@ -346,3 +349,5 @@
 }  // namespace tensorflow
 
 #endif  // GOOGLE_CUDA
+
+#endif  // TENSORFLOW_CORE_KERNELS_WHERE_OP_GPU_CU_H_
diff --git a/tensorflow/core/lib/core/stringpiece.h b/tensorflow/core/lib/core/stringpiece.h
index d7ecc44..329f115 100644
--- a/tensorflow/core/lib/core/stringpiece.h
+++ b/tensorflow/core/lib/core/stringpiece.h
@@ -31,6 +31,7 @@
 #include <string.h>
 #include <iosfwd>
 #include <string>
+#include <type_traits>
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
@@ -101,11 +102,18 @@
   //   >  0 iff "*this" >  "b"
   int compare(StringPiece b) const;
 
-  // Converts to `std::basic_string`.
-  template <typename A>
-  explicit operator std::basic_string<char, std::char_traits<char>, A>() const {
+  // Converts to various kinds of strings, including `std::basic_string`.
+  template <typename S>
+  explicit operator S() const {
+    static_assert(
+        std::is_same<char, typename S::value_type>::value,
+        "Type mismatch: S must be a string with character type char.");
+    static_assert(
+        std::is_same<std::char_traits<char>, typename S::traits_type>::value,
+        "Type mismatch: S must be a string with traits type "
+        "std::char_traits<char>.");
     if (!data()) return {};
-    return std::basic_string<char, std::char_traits<char>, A>(data(), size());
+    return S(data(), size());
   }
 
  private:
diff --git a/tensorflow/core/lib/core/stringpiece_test.cc b/tensorflow/core/lib/core/stringpiece_test.cc
index 952b9ea..e4b489f 100644
--- a/tensorflow/core/lib/core/stringpiece_test.cc
+++ b/tensorflow/core/lib/core/stringpiece_test.cc
@@ -56,8 +56,8 @@
 }
 
 TEST(StringPiece, ConversionToString) {
-  EXPECT_EQ("", std::string(StringPiece("")));
-  EXPECT_EQ("foo", std::string(StringPiece("foo")));
+  EXPECT_EQ("", string(StringPiece("")));
+  EXPECT_EQ("foo", string(StringPiece("foo")));
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/png/png_io.cc b/tensorflow/core/lib/png/png_io.cc
index 62c803a..e226a15 100644
--- a/tensorflow/core/lib/png/png_io.cc
+++ b/tensorflow/core/lib/png/png_io.cc
@@ -232,11 +232,19 @@
     CommonFreeDecode(context);
     return false;
   }
-  if (context->channels == 0) {  // Autodetect number of channels
-    context->channels = png_get_channels(context->png_ptr, context->info_ptr);
-  }
   const bool has_tRNS =
       (png_get_valid(context->png_ptr, context->info_ptr, PNG_INFO_tRNS)) != 0;
+  if (context->channels == 0) {  // Autodetect number of channels
+    if (context->color_type == PNG_COLOR_TYPE_PALETTE) {
+      if (has_tRNS) {
+        context->channels = 4;  // RGB + A(tRNS)
+      } else {
+        context->channels = 3;  // RGB
+      }
+    } else {
+      context->channels = png_get_channels(context->png_ptr, context->info_ptr);
+    }
+  }
   const bool has_alpha = (context->color_type & PNG_COLOR_MASK_ALPHA) != 0;
   if ((context->channels & 1) == 0) {  // We desire alpha
     if (has_alpha) {                   // There is alpha
diff --git a/tensorflow/core/lib/png/testdata/lena_palette.png b/tensorflow/core/lib/png/testdata/lena_palette.png
new file mode 100644
index 0000000..d19ec04
--- /dev/null
+++ b/tensorflow/core/lib/png/testdata/lena_palette.png
Binary files differ
diff --git a/tensorflow/core/lib/png/testdata/lena_palette_trns.png b/tensorflow/core/lib/png/testdata/lena_palette_trns.png
new file mode 100644
index 0000000..c298fee
--- /dev/null
+++ b/tensorflow/core/lib/png/testdata/lena_palette_trns.png
Binary files differ
diff --git a/tensorflow/core/ops/array_grad.cc b/tensorflow/core/ops/array_grad.cc
index 38bd851..1f2e57e 100644
--- a/tensorflow/core/ops/array_grad.cc
+++ b/tensorflow/core/ops/array_grad.cc
@@ -244,6 +244,27 @@
 }
 REGISTER_OP_GRADIENT("Split", SplitGrad);
 
+Status SplitVGrad(const AttrSlice& attrs, FunctionDef* g) {
+  // clang-format off
+  *g = FDH::Define(
+      // Arg defs
+      {"x: T", "size_splits: Tlen", "dim: int32", "dy: num_split*T"},
+      // Ret val defs
+      {"dx: T", "d_size_splits: Tlen", "d_dim: int32"},
+      // Attr defs
+      {"T: type", "Tlen: type", "num_split: int"},
+      // Nodes
+      {
+        {{"dx"}, "Concat", {"dim", "dy"}, {{"T", "$T"}, {"N", "$num_split"}}},
+        {{"d_size_splits"}, "ZerosLike", {"size_splits"}, {{"T", "$Tlen"}}},
+        {{"d_dim"}, "ZerosLike", {"dim"}, {{"T", DT_INT32}}},
+      });
+  // clang-format on
+  VLOG(1) << "SplitVGrad " << DebugString(*g);
+  return Status::OK();
+}
+REGISTER_OP_GRADIENT("SplitV", SplitVGrad);
+
 Status ArrayToListGrad(const AttrSlice& attrs, FunctionDef* g) {
   int N;
   TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "N", &N));
diff --git a/tensorflow/core/ops/array_grad_test.cc b/tensorflow/core/ops/array_grad_test.cc
index e665d17..79d28a8 100644
--- a/tensorflow/core/ops/array_grad_test.cc
+++ b/tensorflow/core/ops/array_grad_test.cc
@@ -238,6 +238,39 @@
   return out;
 }
 
+std::vector<Tensor> SplitVGrad(const Tensor& x, const Tensor& size_splits,
+                               int dim, const Tensor& dy0, const Tensor& dy1) {
+  auto T = DT_FLOAT;
+  auto Tlen = DT_INT64;
+  auto gdef = test::function::GDef(
+      {f::NDef("x", "Placeholder", {}, {{"dtype", T}}),
+       f::NDef("size_splits", "Placeholder", {}, {{"dtype", Tlen}}),
+       f::NDef("dim", "Placeholder", {}, {{"dtype", DT_INT32}}),
+       f::NDef("dy0", "Placeholder", {}, {{"dtype", T}}),
+       f::NDef("dy1", "Placeholder", {}, {{"dtype", T}}),
+       f::NDef("dx", "SymbolicGradient",
+               {"x", "size_splits", "dim", "dy0", "dy1"},
+               {{"f", FDH::FunctionRef("SplitV", {{"split_dim", dim},
+                                                  {"num_split", 2},
+                                                  {"T", T},
+                                                  {"Tlen", Tlen}})},
+                {"Tin", DataTypeSlice{T, Tlen, DT_INT32, T, T}},
+                {"Tout", DataTypeSlice{T, Tlen, DT_INT32}}})});
+  VLOG(1) << DebugStringWhole(gdef);
+  auto sess = NewSession();
+  TF_CHECK_OK(sess->Create(gdef));
+  std::vector<Tensor> out;
+  TF_CHECK_OK(sess->Run({{"x:0", x},
+                         {"size_splits:0", size_splits},
+                         {"dim", test::AsScalar(dim)},
+                         {"dy0:0", dy0},
+                         {"dy1:0", dy1}},
+                        {"dx:0", "dx:1", "dx:2"}, {}, &out));
+  CHECK_EQ(out.size(), 3);
+  TF_CHECK_OK(sess->Close());
+  return out;
+}
+
 TEST(ArrayGradTest, SplitGrad) {
   Tensor x(DT_FLOAT, {2, 4, 5});
   x.flat<float>().setZero();
@@ -245,15 +278,30 @@
   Tensor dy1(DT_FLOAT, {2, 2, 5});
   test::FillIota<float>(&dy0, 0);
   test::FillIota<float>(&dy1, 100);
-  auto dx = SplitGrad(1, x, dy0, dy1);
-  test::ExpectTensorEqual<int32>(dx[0], test::AsScalar(0));
-  test::ExpectClose(
-      dx[1], test::AsTensor<float>(
-                 {0.,   1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,
-                  100., 101., 102., 103., 104., 105., 106., 107., 108., 109.,
-                  10.,  11.,  12.,  13.,  14.,  15.,  16.,  17.,  18.,  19.,
-                  110., 111., 112., 113., 114., 115., 116., 117., 118., 119.},
-                 {2, 4, 5}));
+  auto expected_dx = test::AsTensor<float>(
+      {0.,   1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,
+       100., 101., 102., 103., 104., 105., 106., 107., 108., 109.,
+       10.,  11.,  12.,  13.,  14.,  15.,  16.,  17.,  18.,  19.,
+       110., 111., 112., 113., 114., 115., 116., 117., 118., 119.},
+      {2, 4, 5});
+  auto expected_d_dim = test::AsScalar(0);
+
+  // SplitGrad
+  {
+    auto dx = SplitGrad(1, x, dy0, dy1);
+    test::ExpectTensorEqual<int32>(dx[0], expected_d_dim);
+    test::ExpectClose(dx[1], expected_dx);
+  }
+  // SplitVGrad
+  {
+    Tensor size_splits(DT_INT64, {2});
+    size_splits.flat<int64>().setConstant(2);
+    auto expected_d_size_splits = test::AsTensor<int64>({0, 0}, {2});
+    auto dx = SplitVGrad(x, size_splits, 1, dy0, dy1);
+    test::ExpectClose(dx[0], expected_dx);
+    test::ExpectTensorEqual<int64>(dx[1], expected_d_size_splits);
+    test::ExpectTensorEqual<int32>(dx[2], expected_d_dim);
+  }
 }
 
 std::vector<Tensor> ReshapeGrad(const Tensor& x, const Tensor& s,
diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc
index d6ae754..1d11ec0 100644
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@@ -427,7 +427,19 @@
     .Input("dims: Tidx")
     .Output("output: Tidx")
     .Attr("Tidx: {int32, int64} = DT_INT32")
-    .SetShapeFn([](InferenceContext* c) { return Status::OK(); });
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle indices = c->input(0);
+      ShapeHandle dims;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &dims));
+      if (c->RankKnown(indices) && c->Rank(indices) == 0) {
+        c->set_output(0, c->Vector(c->Dim(dims, 0)));
+      } else if (c->RankKnown(indices)) {
+        c->set_output(0, c->Matrix(c->Dim(dims, 0), c->NumElements(indices)));
+      } else {
+        c->set_output(0, c->UnknownShape());
+      }
+      return Status::OK();
+    });
 
 REGISTER_OP("BroadcastTo")
     .Input("input: T")
@@ -631,38 +643,41 @@
           return errors::InvalidArgument(
               "Length of size_splits should be equal to num_outputs");
         }
-        int64_t cumsum_outputs = 0;
+        int64_t total_size = 0;
         bool has_neg_one = false;
+        for (const auto size : data) {
+          if (size == -1) {
+            if (has_neg_one) {
+              return errors::InvalidArgument(
+                  "size_splits can only have one -1");
+            }
+            has_neg_one = true;
+          } else {
+            total_size += size;
+          }
+        }
+        auto split_dim_size = c->Value(c->Dim(input, split_dim));
         // If the sizes of the splits are known, then
         // make sure that the sizes add up to the expected
         // dimension size, with the possibility of a -1.
         // Specify the full output shapes.
         for (int i = 0; i < num_outputs; ++i) {
-          output_shape = c->UnknownShapeOfRank(rank);
-          TF_RETURN_IF_ERROR(c->ReplaceDim(input, split_dim,
-                                           c->MakeDim(data[i]), &output_shape));
+          auto size = data[i];
+          if (data[i] == -1 && c->ValueKnown(split_dim_size)) {
+            size = split_dim_size - total_size;
+          }
+          TF_RETURN_IF_ERROR(
+              c->ReplaceDim(input, split_dim, c->MakeDim(size), &output_shape));
           c->set_output(i, output_shape);
-          if (data[i] == -1 && !has_neg_one)
-            has_neg_one = true;
-          else if (data[i] == -1 && has_neg_one)
-            return errors::InvalidArgument("size_splits can only have one -1");
-          else
-            cumsum_outputs += data[i];
         }
-        auto split_dim_size = c->Value(c->Dim(input, split_dim));
-        if (has_neg_one) {
-          if (cumsum_outputs < split_dim_size)
-            cumsum_outputs = split_dim_size;
-          else
-            cumsum_outputs = split_dim_size + 1;
+        if (c->ValueKnown(split_dim_size)) {
+          if (has_neg_one ? total_size > split_dim_size
+                          : total_size != split_dim_size) {
+            return errors::InvalidArgument(
+                "can't split axis of size ", split_dim_size,
+                " into pieces of size [", str_util::Join(data, ","), "]");
+          }
         }
-        if (c->ValueKnown(c->Dim(input, split_dim)) &&
-            cumsum_outputs != c->Value(c->Dim(input, split_dim)))
-          return errors::InvalidArgument(
-              "Sum of output sizes must match "
-              "the size of the original Tensor along the split dimension "
-              "or the sum of the positive sizes must be less if it contains a "
-              "-1");
       }
 
       return Status::OK();
@@ -687,6 +702,16 @@
       return Status::OK();
     });
 
+// Returns a constant tensor on the host.  Useful for writing C++ tests
+// and benchmarks which run on GPU but require arguments pinned to the host.
+// Used by test::graph::HostConstant.
+// value: Attr `value` is the tensor to return.
+REGISTER_OP("HostConst")
+    .Output("output: dtype")
+    .Attr("value: tensor")
+    .Attr("dtype: type")
+    .SetShapeFn(shape_inference::UnknownShape);
+
 // --------------------------------------------------------------------------
 // TODO(mgubin): Update the doc when the freeze_graph script supports converting
 // into memmapped format.
diff --git a/tensorflow/core/ops/array_ops_test.cc b/tensorflow/core/ops/array_ops_test.cc
index b146333..c15409a 100644
--- a/tensorflow/core/ops/array_ops_test.cc
+++ b/tensorflow/core/ops/array_ops_test.cc
@@ -27,6 +27,21 @@
 
 namespace tensorflow {
 
+TEST(ArrayOpsTest, UnravelIndex_ShapeFn) {
+  ShapeInferenceTestOp op("UnravelIndex");
+
+  INFER_OK(op, "?;?", "?");
+
+  INFER_OK(op, "[];[?]", "[d1_0]");
+
+  INFER_OK(op, "[4,5];[?]", "[d1_0,20]");
+  INFER_OK(op, "[2,3,4];[?]", "[d1_0,24]");
+  INFER_OK(op, "?;[?]", "?");
+  INFER_OK(op, "[?];[?]", "[d1_0,?]");
+
+  INFER_ERROR("Shape must be rank 1 but is rank 2", op, "?;[1,1]");
+}
+
 TEST(ArrayOpsTest, Pack_ShapeFn) {
   ShapeInferenceTestOp op("Pack");
   auto set_axis = [&op](int axis) {
diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 3418fcf..72b9477 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -6488,6 +6488,69 @@
   }
 }
 op {
+  name: "AsString"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type: DT_STRING
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_BOOL
+      }
+    }
+  }
+  attr {
+    name: "precision"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "scientific"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "shortest"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "width"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "fill"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
+op {
   name: "Asin"
   input_arg {
     name: "x"
@@ -25550,6 +25613,21 @@
   }
 }
 op {
+  name: "HostConst"
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "value"
+    type: "tensor"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+}
+op {
   name: "IFFT"
   input_arg {
     name: "input"
@@ -29286,6 +29364,39 @@
   }
 }
 op {
+  name: "MapDefun"
+  input_arg {
+    name: "arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+}
+op {
   name: "MapIncompleteSize"
   output_arg {
     name: "size"
@@ -68306,6 +68417,43 @@
   is_stateful: true
 }
 op {
+  name: "StatelessIf"
+  input_arg {
+    name: "cond"
+    type_attr: "Tcond"
+  }
+  input_arg {
+    name: "input"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tcond"
+    type: "type"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "then_branch"
+    type: "func"
+  }
+  attr {
+    name: "else_branch"
+    type: "func"
+  }
+}
+op {
   name: "StatelessMultinomial"
   input_arg {
     name: "logits"
@@ -68662,6 +68810,30 @@
   }
 }
 op {
+  name: "StatelessWhile"
+  input_arg {
+    name: "input"
+    type_list_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "cond"
+    type: "func"
+  }
+  attr {
+    name: "body"
+    type: "func"
+  }
+}
+op {
   name: "StatsAggregatorHandle"
   output_arg {
     name: "handle"
@@ -68962,6 +69134,17 @@
   }
 }
 op {
+  name: "StringLength"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type: DT_INT32
+  }
+}
+op {
   name: "StringSplit"
   input_arg {
     name: "input"
@@ -73234,6 +73417,41 @@
   }
 }
 op {
+  name: "UnsafeDiv"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
   name: "UnsortedSegmentMax"
   input_arg {
     name: "data"
diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index 7a02454..13733d4 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -854,4 +854,46 @@
     .Attr("output_shapes: list(shape) >= 1")
     .SetShapeFn(shape_inference::ScalarShape);
 
+REGISTER_OP("MapDefun")
+    .Input("arguments: Targuments")
+    .Output("output: output_types")
+    .Attr("Targuments: list(type) >= 1")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .Attr("f: func")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      std::vector<TensorShape> output_shapes;
+      TF_RETURN_IF_ERROR(c->GetAttr("output_shapes", &output_shapes));
+      if (output_shapes.size() != c->num_outputs()) {
+        return errors::InvalidArgument(
+            "`output_shapes` must be the same length as `output_types` (",
+            output_shapes.size(), " vs. ", c->num_outputs(), ")");
+      }
+
+      int64 dim_zero = -1;
+      for (size_t i = 0; i < static_cast<size_t>(c->num_inputs()); ++i) {
+        auto dim_handle = c->Dim(c->input(i), 0);
+        if (c->ValueKnown(dim_handle)) {
+          if (dim_zero == -1) {
+            dim_zero = c->Value(dim_handle);
+          } else if (c->Value(dim_handle) != dim_zero) {
+            return errors::InvalidArgument(
+                "Inputs must have the same dimension 0.");
+          }
+        }
+      }
+
+      for (size_t i = 0; i < output_shapes.size(); ++i) {
+        PartialTensorShape s({});
+        s = s.Concatenate(dim_zero);
+        s = s.Concatenate(output_shapes[i]);
+        shape_inference::ShapeHandle output_shape_handle;
+
+        TF_RETURN_IF_ERROR(
+            c->MakeShapeFromPartialTensorShape(s, &output_shape_handle));
+        c->set_output(static_cast<int>(i), output_shape_handle);
+      }
+      return Status::OK();
+    });
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/functional_ops.cc b/tensorflow/core/ops/functional_ops.cc
index a16eccc..bda4a75 100644
--- a/tensorflow/core/ops/functional_ops.cc
+++ b/tensorflow/core/ops/functional_ops.cc
@@ -90,6 +90,17 @@
     tensors.  whose types are the same as what then_branch returns.
 )doc");
 
+REGISTER_OP("StatelessIf")
+    .Input("cond: Tcond")
+    .Input("input: Tin")
+    .Output("output: Tout")
+    .Attr("Tcond: type")
+    .Attr("Tin: list(type) >= 0")
+    .Attr("Tout: list(type) >= 0")
+    .Attr("then_branch: func")
+    .Attr("else_branch: func")
+    .SetShapeFn(shape_inference::UnknownShape);
+
 REGISTER_OP("If")
     .Input("cond: Tcond")
     .Input("input: Tin")
@@ -133,8 +144,6 @@
       by T.
 )doc");
 
-// TODO(b/37549631) setting the While Op to always be stateful is too
-// conservative.
 REGISTER_OP("While")
     .Input("input: T")
     .Output("output: T")
@@ -149,6 +158,19 @@
       return Status::OK();
     });
 
+REGISTER_OP("StatelessWhile")
+    .Input("input: T")
+    .Output("output: T")
+    .Attr("T: list(type) >= 0")
+    .Attr("cond: func")
+    .Attr("body: func")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      for (int i = 0; i < c->num_outputs(); ++i) {
+        c->set_output(i, c->input(i));
+      }
+      return Status::OK();
+    });
+
 REGISTER_OP("For")
     .Input("start: int32")
     .Input("limit: int32")
diff --git a/tensorflow/core/ops/image_ops.cc b/tensorflow/core/ops/image_ops.cc
index 31267f7..11ca0bd 100644
--- a/tensorflow/core/ops/image_ops.cc
+++ b/tensorflow/core/ops/image_ops.cc
@@ -108,6 +108,29 @@
   return Status::OK();
 }
 
+Status NMSShapeFn(InferenceContext* c) {
+  // Get inputs and validate ranks.
+  ShapeHandle boxes;
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &boxes));
+  ShapeHandle scores;
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &scores));
+  ShapeHandle max_output_size;
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &max_output_size));
+  ShapeHandle iou_threshold;
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &iou_threshold));
+  ShapeHandle score_threshold;
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &score_threshold));
+  // The boxes is a 2-D float Tensor of shape [num_boxes, 4].
+  DimensionHandle unused;
+  // The boxes[0] and scores[0] are both num_boxes.
+  TF_RETURN_IF_ERROR(c->Merge(c->Dim(boxes, 0), c->Dim(scores, 0), &unused));
+  // The boxes[1] is 4.
+  TF_RETURN_IF_ERROR(c->WithValue(c->Dim(boxes, 1), 4, &unused));
+
+  c->set_output(0, c->Vector(c->UnknownDim()));
+  return Status::OK();
+}
+
 }  // namespace
 
 // --------------------------------------------------------------------------
@@ -348,6 +371,11 @@
     .Attr("T: {uint8, int8, int16, int32, int64, float, double}")
     .Deprecated(2, "Use AdjustContrastv2 instead")
     .SetShapeFn([](InferenceContext* c) {
+      // The contrast_factor, min_value, max_value should be scalar only.
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
       return shape_inference::UnchangedShapeWithRankAtLeast(c, 3);
     });
 
@@ -357,6 +385,9 @@
     .Input("contrast_factor: float")
     .Output("output: float")
     .SetShapeFn([](InferenceContext* c) {
+      // The contrast_factor should be scalar only.
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
       return shape_inference::UnchangedShapeWithRankAtLeast(c, 3);
     });
 
@@ -686,29 +717,7 @@
     .Input("iou_threshold: float")
     .Input("score_threshold: float")
     .Output("selected_indices: int32")
-    .SetShapeFn([](InferenceContext* c) {
-      // Get inputs and validate ranks.
-      ShapeHandle boxes;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &boxes));
-      ShapeHandle scores;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &scores));
-      ShapeHandle max_output_size;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &max_output_size));
-      ShapeHandle iou_threshold;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &iou_threshold));
-      ShapeHandle score_threshold;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &score_threshold));
-      // The boxes is a 2-D float Tensor of shape [num_boxes, 4].
-      DimensionHandle unused;
-      // The boxes[0] and scores[0] are both num_boxes.
-      TF_RETURN_IF_ERROR(
-          c->Merge(c->Dim(boxes, 0), c->Dim(scores, 0), &unused));
-      // The boxes[1] is 4.
-      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(boxes, 1), 4, &unused));
-
-      c->set_output(0, c->Vector(c->UnknownDim()));
-      return Status::OK();
-    });
+    .SetShapeFn(NMSShapeFn);
 
 REGISTER_OP("NonMaxSuppressionV4")
     .Input("boxes: float")
@@ -720,26 +729,16 @@
     .Output("valid_outputs: int32")
     .Attr("pad_to_max_output_size: bool = false")
     .SetShapeFn([](InferenceContext* c) {
-      // Get inputs and validate ranks.
-      ShapeHandle boxes;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &boxes));
-      ShapeHandle scores;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &scores));
-      ShapeHandle max_output_size;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &max_output_size));
-      ShapeHandle iou_threshold;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &iou_threshold));
-      ShapeHandle score_threshold;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &score_threshold));
-      // The boxes is a 2-D float Tensor of shape [num_boxes, 4].
-      DimensionHandle unused;
-      // The boxes[0] and scores[0] are both num_boxes.
-      TF_RETURN_IF_ERROR(
-          c->Merge(c->Dim(boxes, 0), c->Dim(scores, 0), &unused));
-      // The boxes[1] is 4.
-      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(boxes, 1), 4, &unused));
+      TF_RETURN_IF_ERROR(NMSShapeFn(c));
 
-      c->set_output(0, c->Vector(c->UnknownDim()));
+      bool pad_to_max;
+      TF_RETURN_IF_ERROR(c->GetAttr("pad_to_max_output_size", &pad_to_max));
+      if (pad_to_max) {
+        // If padded, overwrite the shape of the output to be static.
+        DimensionHandle output_dim;
+        TF_RETURN_IF_ERROR(c->MakeDimForScalarInput(2, &output_dim));
+        c->set_output(0, c->MakeShape({output_dim}));
+      }
       c->set_output(1, c->MakeShape({}));
       return Status::OK();
     });
diff --git a/tensorflow/core/ops/lookup_ops.cc b/tensorflow/core/ops/lookup_ops.cc
index 2059741..7c71406 100644
--- a/tensorflow/core/ops/lookup_ops.cc
+++ b/tensorflow/core/ops/lookup_ops.cc
@@ -23,6 +23,7 @@
 
 using shape_inference::DimensionHandle;
 using shape_inference::InferenceContext;
+using shape_inference::ShapeAndType;
 using shape_inference::ShapeHandle;
 
 // --------------------------------------------------------------------------
@@ -86,6 +87,74 @@
       return Status::OK();
     });
 
+Status ValidateTableResourceHandle(InferenceContext* c, ShapeHandle keys,
+                                   const string& key_dtype_attr,
+                                   const string& value_dtype_attr,
+                                   bool is_lookup,
+                                   ShapeAndType* output_shape_and_type) {
+  auto* handle_data = c->input_handle_shapes_and_types(0);
+  if (handle_data == nullptr || handle_data->size() != 2) {
+    output_shape_and_type->shape = c->UnknownShape();
+    output_shape_and_type->dtype = DT_INVALID;
+  } else {
+    const ShapeAndType& key_shape_and_type = (*handle_data)[0];
+    const ShapeAndType& value_shape_and_type = (*handle_data)[1];
+    DataType key_dtype;
+    TF_RETURN_IF_ERROR(c->GetAttr(key_dtype_attr, &key_dtype));
+    if (key_shape_and_type.dtype != key_dtype) {
+      return errors::InvalidArgument(
+          "Trying to read value with wrong dtype. "
+          "Expected ",
+          DataTypeString(key_shape_and_type.dtype), " got ",
+          DataTypeString(key_dtype));
+    }
+    DataType value_dtype;
+    TF_RETURN_IF_ERROR(c->GetAttr(value_dtype_attr, &value_dtype));
+    if (value_shape_and_type.dtype != value_dtype) {
+      return errors::InvalidArgument(
+          "Trying to read value with wrong dtype. "
+          "Expected ",
+          DataTypeString(value_shape_and_type.dtype), " got ",
+          DataTypeString(value_dtype));
+    }
+    output_shape_and_type->dtype = value_shape_and_type.dtype;
+
+    if (is_lookup) {
+      if (c->RankKnown(key_shape_and_type.shape) && c->RankKnown(keys)) {
+        int keys_rank = c->Rank(keys);
+        int key_suffix_rank = c->Rank(key_shape_and_type.shape);
+        if (keys_rank < key_suffix_rank) {
+          return errors::InvalidArgument(
+              "Expected keys to have suffix ",
+              c->DebugString(key_shape_and_type.shape),
+              " but saw shape: ", c->DebugString(keys));
+        }
+        for (int d = 0; d < key_suffix_rank; d++) {
+          // Ensure the suffix of keys match what's in the Table.
+          DimensionHandle dim = c->Dim(key_shape_and_type.shape, d);
+          TF_RETURN_IF_ERROR(
+              c->ReplaceDim(keys, keys_rank - key_suffix_rank + d, dim, &keys));
+        }
+        std::vector<DimensionHandle> keys_prefix_vec;
+        keys_prefix_vec.reserve(keys_rank - key_suffix_rank);
+        for (int d = 0; d < keys_rank - key_suffix_rank; ++d) {
+          keys_prefix_vec.push_back(c->Dim(keys, d));
+        }
+        ShapeHandle keys_prefix = c->MakeShape(keys_prefix_vec);
+        TF_RETURN_IF_ERROR(c->Concatenate(keys_prefix,
+                                          value_shape_and_type.shape,
+                                          &output_shape_and_type->shape));
+      } else {
+        output_shape_and_type->shape = c->UnknownShape();
+      }
+    } else {
+      TF_RETURN_IF_ERROR(c->Concatenate(keys, value_shape_and_type.shape,
+                                        &output_shape_and_type->shape));
+    }
+  }
+  return Status::OK();
+}
+
 REGISTER_OP("LookupTableFindV2")
     .Input("table_handle: resource")
     .Input("keys: Tin")
@@ -98,9 +167,18 @@
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle));
 
       // Default value must be scalar or vector.
-      ShapeHandle unused;
-      TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(2), 1, &unused));
-      c->set_output(0, c->UnknownShape());
+      ShapeHandle keys;
+      TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(2), 1, &keys));
+
+      ShapeAndType value_shape_and_type;
+      TF_RETURN_IF_ERROR(ValidateTableResourceHandle(
+          c,
+          /*keys=*/c->input(1),
+          /*key_dtype_attr=*/"Tin",
+          /*value_dtype_attr=*/"Tout",
+          /*is_lookup=*/true, &value_shape_and_type));
+      c->set_output(0, value_shape_and_type.shape);
+
       return Status::OK();
     });
 WHITELIST_STATEFUL_OP_FOR_DATASET_FUNCTIONS("LookupTableFindV2");
@@ -177,12 +255,16 @@
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle handle;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle));
-
-      ShapeHandle values = c->UnknownShape();
-      TF_RETURN_IF_ERROR(c->WithRankAtLeast(values, 1, &values));
-      ShapeHandle keys = c->Vector(c->Dim(values, 0));
+      ShapeHandle keys = c->UnknownShapeOfRank(1);
+      ShapeAndType value_shape_and_type;
+      TF_RETURN_IF_ERROR(ValidateTableResourceHandle(
+          c,
+          /*keys=*/keys,
+          /*key_dtype_attr=*/"Tkeys",
+          /*value_dtype_attr=*/"Tvalues",
+          /*is_lookup=*/false, &value_shape_and_type));
       c->set_output(0, keys);
-      c->set_output(1, values);
+      c->set_output(1, value_shape_and_type.shape);
       return Status::OK();
     });
 
@@ -216,6 +298,26 @@
       return Status::OK();
     });
 
+Status MutableHashTableShape(InferenceContext* c, const ShapeHandle& key,
+                             const ShapeHandle& value) {
+  c->set_output(0, c->Scalar());
+
+  ShapeHandle key_s;
+  TF_RETURN_IF_ERROR(c->WithRankAtMost(key, 1, &key_s));
+
+  DataType key_t;
+  TF_RETURN_IF_ERROR(c->GetAttr("key_dtype", &key_t));
+
+  DataType value_t;
+  TF_RETURN_IF_ERROR(c->GetAttr("value_dtype", &value_t));
+
+  // ShapeAndType vector for {key, value}.
+  c->set_output_handle_shapes_and_types(
+      0, std::vector<ShapeAndType>{{key_s, key_t}, {value, value_t}});
+
+  return Status::OK();
+}
+
 REGISTER_OP("HashTable")
     .Output("table_handle: Ref(string)")
     .Attr("container: string = ''")
@@ -254,7 +356,10 @@
     .Attr("key_dtype: type")
     .Attr("value_dtype: type")
     .SetIsStateful()
-    .SetShapeFn(ScalarOutput);
+    .SetShapeFn([](InferenceContext* c) {
+      return MutableHashTableShape(c, /*key=*/c->Scalar(),
+                                   /*value=*/c->Scalar());
+    });
 
 REGISTER_OP("MutableHashTableOfTensors")
     .Output("table_handle: Ref(string)")
@@ -276,7 +381,13 @@
     .Attr("value_dtype: type")
     .Attr("value_shape: shape = {}")
     .SetIsStateful()
-    .SetShapeFn(ScalarOutput);
+    .SetShapeFn([](InferenceContext* c) {
+      PartialTensorShape value_p;
+      TF_RETURN_IF_ERROR(c->GetAttr("value_shape", &value_p));
+      ShapeHandle value_s;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(value_p, &value_s));
+      return MutableHashTableShape(c, /*key=*/c->Scalar(), /*value=*/value_s);
+    });
 
 REGISTER_OP("MutableDenseHashTable")
     .Input("empty_key: key_dtype")
@@ -304,7 +415,13 @@
     .Attr("initial_num_buckets: int = 131072")  // 2^17
     .Attr("max_load_factor: float = 0.8")
     .SetIsStateful()
-    .SetShapeFn(ScalarOutput);
+    .SetShapeFn([](InferenceContext* c) {
+      PartialTensorShape value_p;
+      TF_RETURN_IF_ERROR(c->GetAttr("value_shape", &value_p));
+      ShapeHandle value_s;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(value_p, &value_s));
+      return MutableHashTableShape(c, /*key=*/c->input(0), /*value=*/value_s);
+    });
 
 REGISTER_OP("InitializeTable")
     .Input("table_handle: Ref(string)")
diff --git a/tensorflow/core/ops/math_grad.cc b/tensorflow/core/ops/math_grad.cc
index 783d292..57499a6 100644
--- a/tensorflow/core/ops/math_grad.cc
+++ b/tensorflow/core/ops/math_grad.cc
@@ -495,6 +495,19 @@
 }
 REGISTER_OP_GRADIENT("RealDiv", RealDivGrad);
 
+Status UnsafeDivGrad(const AttrSlice& attrs, FunctionDef* g) {
+  // clang-format off
+  return GradForBinaryCwise(g, {
+      {{"gx"}, "UnsafeDiv", {"dz", "y"}},
+      {{"nx"}, "Neg", {"x"}, {}, {"dz"}},
+      {{"y2"}, "Square", {"y"}, {}, {"dz"}},
+      {{"nx_y2"}, "UnsafeDiv", {"nx", "y2"}},
+      {{"gy"}, "Mul", {"dz", "nx_y2"}},  // dz * (- x / y^2)
+  });
+  // clang-format on
+}
+REGISTER_OP_GRADIENT("UnsafeDiv", UnsafeDivGrad);
+
 Status PowGrad(const AttrSlice& attrs, FunctionDef* g) {
   // clang-format off
   std::vector<FDH::Node> nodes = {
diff --git a/tensorflow/core/ops/math_grad_test.cc b/tensorflow/core/ops/math_grad_test.cc
index 2a27ef3..b0d1595 100644
--- a/tensorflow/core/ops/math_grad_test.cc
+++ b/tensorflow/core/ops/math_grad_test.cc
@@ -753,6 +753,78 @@
   }
 }
 
+TEST_F(MathGradTest, UnsafeDiv) {
+  auto x = test::AsTensor<float>(
+      {0.f, -3.f, -2.f, -1.f, 0.f, 1.f, 2.f, 3.f, 0.f}, TensorShape({3, 3}));
+  auto y = test::AsTensor<float>({-10.f, 0.f, 10.f}, TensorShape({3, 1}));
+  Tensor dx;
+  Tensor dy;
+  {
+    SymGrad("UnsafeDiv", x, y, &dx, &dy);
+    {
+      auto g = [](float x, float y) {
+        if (y == 0.f) {
+          return 0.f;
+        } else {
+          return 1.f / y;
+        }
+      };
+      test::ExpectClose(dx, test::AsTensor<float>(
+                                {g(0.f, -10.f), g(-3.f, -10.f), g(-2.f, -10.f),
+                                 g(-1.f, 0.f), g(0.f, 0.f), g(1.f, 0.f),
+                                 g(2.f, 10.f), g(3.f, 10.f), g(0.f, 10.f)},
+                                TensorShape({3, 3})));
+    }
+    {
+      auto g = [](float x, float y) {
+        if (y == 0.f) {
+          return 0.f;
+        } else {
+          return -x / (y * y);
+        }
+      };
+      test::ExpectClose(dy,
+                        test::AsTensor<float>(
+                            {g(0.f, -10.f) + g(-3.f, -10.f) + g(-2.f, -10.f),
+                             g(-1.f, 0.f) + g(0.f, 0.f) + g(1.f, 0.f),
+                             g(2.f, 10.f) + g(3.f, 10.f) + g(0.f, 10.f)},
+                            TensorShape({3, 1})));
+    }
+  }
+  {  // Swap x and y.
+    SymGrad("UnsafeDiv", y, x, &dy, &dx);
+    {
+      auto g = [](float x, float y) {
+        if (y == 0.f) {
+          return 0.f;
+        } else {
+          return 1.f / y;
+        }
+      };
+      test::ExpectClose(dy,
+                        test::AsTensor<float>(
+                            {g(-10.f, 0.f) + g(-10.f, -3.f) + g(-10.f, -2.f),
+                             g(0.f, -1.f) + g(0.f, 0.f) + g(0.f, 1.f),
+                             g(10.f, 2.f) + g(10.f, 3.f) + g(10.f, 0.f)},
+                            TensorShape({3, 1})));
+    }
+    {
+      auto g = [](float x, float y) {
+        if (y == 0.f) {
+          return 0.f;
+        } else {
+          return -x / (y * y);
+        }
+      };
+      test::ExpectClose(dx, test::AsTensor<float>(
+                                {g(-10.f, 0.f), g(-10.f, -3.f), g(-10.f, -2.f),
+                                 g(0.f, -1.f), g(0.f, 0.f), g(0.f, 1.f),
+                                 g(10.f, 2.f), g(10.f, 3.f), g(10.f, 0.f)},
+                                TensorShape({3, 3})));
+    }
+  }
+}
+
 TEST_F(MathGradTest, Pow) {
   auto x = test::AsTensor<float>({0.f, 1.f, 2.f, 3.f, 4.f, 5.f},
                                  TensorShape({2, 3}));
diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc
index 1667c39..49646f1 100644
--- a/tensorflow/core/ops/math_ops.cc
+++ b/tensorflow/core/ops/math_ops.cc
@@ -392,6 +392,10 @@
 REGISTER_OP("Div").BINARY_MORE().SetShapeFn(
     shape_inference::BroadcastBinaryOpShapeFn);
 
+REGISTER_OP("UnsafeDiv")
+    .BINARY_MORE()
+    .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
+
 REGISTER_OP("FloorDiv")
     .BINARY_MORE()
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
diff --git a/tensorflow/core/ops/math_ops_test.cc b/tensorflow/core/ops/math_ops_test.cc
index 23f1538..ebeb048 100644
--- a/tensorflow/core/ops/math_ops_test.cc
+++ b/tensorflow/core/ops/math_ops_test.cc
@@ -120,7 +120,8 @@
                               "Maximum",    "Minimum",
                               "Mod",        "Mul",
                               "NotEqual",   "Pow",
-                              "Sub",        "SquaredDifference"}) {
+                              "Sub",        "SquaredDifference",
+                              "UnsafeDiv"}) {
     ShapeInferenceTestOp op(op_name);
     INFER_OK(op, "?;?", "?");
     INFER_OK(op, "[1,2];?", "?");
diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index f947d4c..385021b 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -1687,7 +1687,7 @@
 expected to invoke these operators.
 )doc");
 
-#ifdef INTEL_MKL_ML
+#ifdef INTEL_MKL_ML_ONLY
 REGISTER_OP("_MklConv2DWithBiasBackpropBias")
     .Input("out_backprop: T")
     .Input("mkl_out_backprop: uint8")
@@ -1736,6 +1736,87 @@
 expected to invoke these operators.
 )doc");
 
+REGISTER_OP("_MklConv3D")
+    .Input("input: T")
+    .Input("filter: T")
+    .Input("mkl_input: uint8")
+    .Input("mkl_filter: uint8")
+    .Output("output: T")
+    .Output("filter_output: T")
+    .Output("mkl_output: uint8")
+    .Output("mkl_filter_output: uint8")
+    .Attr("T: {half, float, double}")
+    .Attr("strides: list(int) >= 5")
+    .Attr(GetPaddingAttrString())
+    .Attr(GetConvnet3dDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1, 1]")
+    .SetShapeFn(shape_inference::Conv3DShape)
+    .Doc(R"doc(
+MKL version of Conv3D operator. Uses MKL DNN APIs to perform 3D convolution.
+
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
+REGISTER_OP("_MklConv3DBackpropInputV2")
+    .Input("input_sizes: Tshape")
+    .Input("filter: T")
+    .Input("out_backprop: T")
+    .Input("mkl_input_sizes: uint8")
+    .Input("mkl_filter: uint8")
+    .Input("mkl_out_backprop: uint8")
+    .Output("output: T")
+    .Output("mkl_output: uint8")
+    .Attr("T: {half, float, double}")
+    .Attr("strides: list(int) >= 5")
+    .Attr("dilations: list(int) = [1, 1, 1, 1, 1]")
+    .Attr("Tshape: {int32, int64} = DT_INT32")
+    .Attr(GetPaddingAttrString())
+    .Attr(GetConvnet3dDataFormatAttrString())
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle s;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &s));
+      TF_RETURN_IF_ERROR(c->WithRank(s, 5, &s));
+      c->set_output(0, s);
+      return Status::OK();
+    })
+    .Doc(R"doc(
+MKL version of Convolution3D backward input. Uses MKL DNN APIs to compute the
+gradients of convolution with respect to the input.
+
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
+REGISTER_OP("_MklConv3DBackpropFilterV2")
+    .Input("input: T")
+    .Input("filter_sizes: int32")
+    .Input("out_backprop: T")
+    .Input("mkl_input: uint8")
+    .Input("mkl_filter_size: uint8")
+    .Input("mkl_out_backprop: uint8")
+    .Output("output: T")
+    .Output("mkl_output: uint8")
+    .Attr("T: {half, float, double}")
+    .Attr("strides: list(int)")
+    .Attr(GetPaddingAttrString())
+    .Attr(GetConvnet3dDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1, 1]")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle s;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(1, &s));
+      TF_RETURN_IF_ERROR(c->WithRank(s, 5, &s));
+      c->set_output(0, s);
+      return Status::OK();
+    })
+    .Doc(R"doc(
+MKL version of Conv3DBackpropFilter. Uses MKL DNN APIs to compute the
+gradients of convolution with respect to the filter.
+
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
 REGISTER_OP("_MklRelu")
     .Input("features: T")
     .Input("mkl_features: uint8")
@@ -1849,7 +1930,7 @@
     .Input("input: T")
     .Input("mkl_input: uint8")
     .Output("output: T")
-#ifdef INTEL_MKL_ML
+#ifdef INTEL_MKL_ML_ONLY
     .Output("workspace: T")
 #else
     .Output("workspace: uint8")
@@ -1875,7 +1956,7 @@
     .Input("orig_input: T")
     .Input("orig_output: T")
     .Input("grad: T")
-#ifdef INTEL_MKL_ML
+#ifdef INTEL_MKL_ML_ONLY
     .Input("workspace: T")
 #else
     .Input("workspace: uint8")
@@ -1947,7 +2028,7 @@
     .Input("input: T")
     .Input("mkl_input: uint8")
     .Output("output: T")
-#ifdef INTEL_MKL_ML
+#ifdef INTEL_MKL_ML_ONLY
     .Output("workspace: T")
 #else
     .Output("workspace: uint8")
@@ -1975,7 +2056,7 @@
     .Input("input_grads: T")
     .Input("input_image: T")
     .Input("output_image: T")
-#ifdef INTEL_MKL_ML
+#ifdef INTEL_MKL_ML_ONLY
     .Input("workspace: T")
 #else
     .Input("workspace: uint8")
@@ -2161,7 +2242,7 @@
     .Input("mkl_input: uint8")
     .Output("output: T")
     .Attr("T: {half, float, double}")
-    .Attr(GetConvnetDataFormatAttrString())
+    .Attr(GetConvnetDataFormat2D3DAttrString())
     .SetShapeFn(shape_inference::UnknownShape)
     .Doc(R"doc(
 MKL operator to convert a tensor from MKL layout to TensorFlow layout.
@@ -2183,7 +2264,7 @@
     .Attr(
         "T: {half, float, double, uint8, int8, uint16, int16, int32, int64, "
         "complex64, complex128}")
-    .Attr(GetConvnetDataFormatAttrString())
+    .Attr(GetConvnetDataFormat2D3DAttrString())
     .SetShapeFn(shape_inference::UnknownShape)
     .Doc(R"doc(
 MKL operator to process the inputs to an elementwise MKL op. Both inputs
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index a67678a..f259527 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -1982,6 +1982,7 @@
         type: DT_INT32
         type: DT_INT64
         type: DT_COMPLEX64
+        type: DT_COMPLEX128
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_BOOL
@@ -12256,6 +12257,21 @@
   }
 }
 op {
+  name: "HostConst"
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "value"
+    type: "tensor"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+}
+op {
   name: "IFFT"
   input_arg {
     name: "input"
@@ -14511,6 +14527,39 @@
   }
 }
 op {
+  name: "MapDefun"
+  input_arg {
+    name: "arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+}
+op {
   name: "MapIncompleteSize"
   output_arg {
     name: "size"
@@ -31480,6 +31529,43 @@
   is_stateful: true
 }
 op {
+  name: "StatelessIf"
+  input_arg {
+    name: "cond"
+    type_attr: "Tcond"
+  }
+  input_arg {
+    name: "input"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tcond"
+    type: "type"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "then_branch"
+    type: "func"
+  }
+  attr {
+    name: "else_branch"
+    type: "func"
+  }
+}
+op {
   name: "StatelessMultinomial"
   input_arg {
     name: "logits"
@@ -31710,6 +31796,30 @@
   }
 }
 op {
+  name: "StatelessWhile"
+  input_arg {
+    name: "input"
+    type_list_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "cond"
+    type: "func"
+  }
+  attr {
+    name: "body"
+    type: "func"
+  }
+}
+op {
   name: "StatsAggregatorHandle"
   output_arg {
     name: "handle"
@@ -32010,6 +32120,17 @@
   }
 }
 op {
+  name: "StringLength"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type: DT_INT32
+  }
+}
+op {
   name: "StringSplit"
   input_arg {
     name: "input"
@@ -34839,6 +34960,41 @@
   }
 }
 op {
+  name: "UnsafeDiv"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
   name: "UnsortedSegmentMax"
   input_arg {
     name: "data"
diff --git a/tensorflow/core/ops/string_ops.cc b/tensorflow/core/ops/string_ops.cc
index 4423062..d1e38e6 100644
--- a/tensorflow/core/ops/string_ops.cc
+++ b/tensorflow/core/ops/string_ops.cc
@@ -78,7 +78,9 @@
 REGISTER_OP("AsString")
     .Input("input: T")
     .Output("output: string")
-    .Attr("T: {int8, int16, int32, int64, complex64, float, double, bool}")
+    .Attr(
+        "T: {int8, int16, int32, int64, complex64, complex128, float, double, "
+        "bool}")
     .Attr("precision: int = -1")
     .Attr("scientific: bool = false")
     .Attr("shortest: bool = false")
@@ -157,6 +159,11 @@
     .Output("output: string")
     .SetShapeFn(shape_inference::UnchangedShape);
 
+REGISTER_OP("StringLength")
+    .Input("input: string")
+    .Output("output: int32")
+    .SetShapeFn(shape_inference::UnchangedShape);
+
 REGISTER_OP("EncodeBase64")
     .Input("input: string")
     .Output("output: string")
diff --git a/tensorflow/core/platform/cloud/gcs_file_system.cc b/tensorflow/core/platform/cloud/gcs_file_system.cc
index 67c872a..9d33787 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system.cc
@@ -618,9 +618,11 @@
 }
 
 /// \brief Utility function to split a comma delimited list of strings to an
-/// unordered set
-bool SplitByCommaToSet(StringPiece list, std::unordered_set<string>* set) {
-  std::vector<string> vector = str_util::Split(list, ",");
+/// unordered set, lowercasing all values.
+bool SplitByCommaToLowercaseSet(StringPiece list,
+                                std::unordered_set<string>* set) {
+  std::vector<string> vector =
+      str_util::Split(tensorflow::str_util::Lowercase(list), ",");
   *set = std::unordered_set<string>(vector.begin(), vector.end());
   return true;
 }
@@ -778,7 +780,8 @@
     throttle_.SetConfig(config);
   }
 
-  GetEnvVar(kAllowedBucketLocations, SplitByCommaToSet, &allowed_locations_);
+  GetEnvVar(kAllowedBucketLocations, SplitByCommaToLowercaseSet,
+            &allowed_locations_);
 }
 
 GcsFileSystem::GcsFileSystem(
@@ -1155,8 +1158,11 @@
     Status status = GetBucketMetadata(bucket, &result_buffer);
     Json::Value result;
     TF_RETURN_IF_ERROR(ParseJson(result_buffer, &result));
+    string bucket_location;
     TF_RETURN_IF_ERROR(
-        GetStringValue(result, kBucketMetadataLocationKey, location));
+        GetStringValue(result, kBucketMetadataLocationKey, &bucket_location));
+    // Lowercase the GCS location to be case insensitive for allowed locations.
+    *location = tensorflow::str_util::Lowercase(bucket_location);
     return Status::OK();
   };
 
diff --git a/tensorflow/core/platform/cloud/gcs_file_system_test.cc b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
index ee2b034..14376ad 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system_test.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
@@ -98,7 +98,7 @@
       "Timeouts: 5 1 10\n",
       R"(
           {
-            "location":"us-east1"
+            "location":"US-EAST1"
           })")});
 
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
@@ -124,7 +124,7 @@
            "Timeouts: 5 1 10\n",
            R"(
           {
-            "location":"us-east1"
+            "location":"US-EAST1"
           })"),
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/anotherbucket\n"
@@ -132,7 +132,7 @@
            "Timeouts: 5 1 10\n",
            R"(
           {
-            "location":"us-east1"
+            "location":"US-EAST1"
           })"),
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket\n"
@@ -140,7 +140,7 @@
            "Timeouts: 5 1 10\n",
            R"(
           {
-            "location":"us-east1"
+            "location":"US-EAST1"
           })")});
 
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
@@ -181,7 +181,7 @@
       "Timeouts: 5 1 10\n",
       R"(
           {
-            "location":"barfoo"
+            "location":"BARFOO"
           })")});
 
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
@@ -3076,7 +3076,7 @@
   GcsFileSystem fs1;
   EXPECT_EQ(*kAllowedLocationsAuto, fs1.allowed_locations());
 
-  setenv("GCS_ALLOWED_BUCKET_LOCATIONS", "custom,list", 1);
+  setenv("GCS_ALLOWED_BUCKET_LOCATIONS", "CUSTOM,list", 1);
   GcsFileSystem fs2;
   EXPECT_EQ(std::unordered_set<string>({"custom", "list"}),
             fs2.allowed_locations());
diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl
index 2889132..7251c6c 100644
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@@ -8,7 +8,7 @@
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
 load(
     "//third_party/mkl:build_defs.bzl",
-    "if_mkl",
+    "if_mkl_ml",
 )
 
 # Appends a suffix to a list of deps.
@@ -467,7 +467,6 @@
 
   return select({
     "//tensorflow:windows" : native.glob(windows_set),
-    "//tensorflow:windows_msvc" : native.glob(windows_set),
     "//conditions:default" : native.glob(posix_set),
   })
 
@@ -479,7 +478,6 @@
   ], exclude = exclude)
   return select({
     "//tensorflow:windows" : windows_hdrs,
-    "//tensorflow:windows_msvc" : windows_hdrs,
     "//conditions:default" : native.glob([
         "platform/default/*.h",
         "platform/posix/*.h",
@@ -494,7 +492,6 @@
   ], exclude = exclude)
   return select({
     "//tensorflow:windows" : windows_srcs,
-    "//tensorflow:windows_msvc" : windows_srcs,
     "//conditions:default" : native.glob([
         "platform/default/*.cc",
         "platform/posix/*.cc",
@@ -516,6 +513,11 @@
       "platform/windows/integral_types.h",
   ])
 
+def tf_additional_proto_compiler_hdrs():
+  return [
+      "platform/default/protobuf_compiler.h"
+  ]
+
 def tf_additional_proto_srcs():
   return [
       "platform/default/protobuf.cc",
@@ -662,6 +664,11 @@
       "//tensorflow/core/platform/default/build_config:proto_parsing",
   ]
 
+def tf_lib_proto_compiler_deps():
+  return [
+      "@protobuf_archive//:protoc_lib",
+  ]
+
 def tf_additional_verbs_lib_defines():
   return select({
       "//tensorflow:with_verbs_support": ["TENSORFLOW_USE_VERBS"],
@@ -703,8 +710,8 @@
       # core).
       "//tensorflow/core/kernels:lookup_util",
       "//tensorflow/core/util/tensor_bundle",
-  ] + if_mkl(
+  ] + if_mkl_ml(
       [
-          "//third_party/mkl:intel_binary_blob",
+          "//third_party/intel_mkl_ml",
       ],
   )
diff --git a/tensorflow/core/platform/default/protobuf.h b/tensorflow/core/platform/default/protobuf.h
index c732c76..bd9d41c 100644
--- a/tensorflow/core/platform/default/protobuf.h
+++ b/tensorflow/core/platform/default/protobuf.h
@@ -20,8 +20,8 @@
 // IWYU pragma: friend third_party/tensorflow/core/platform/protobuf.h
 
 #include "google/protobuf/arena.h"
-#include "google/protobuf/compiler/importer.h"
 #include "google/protobuf/descriptor.h"
+#include "google/protobuf/descriptor.pb.h"
 #include "google/protobuf/dynamic_message.h"
 #include "google/protobuf/io/coded_stream.h"
 #include "google/protobuf/io/zero_copy_stream.h"
diff --git a/tensorflow/core/platform/default/protobuf_compiler.h b/tensorflow/core/platform/default/protobuf_compiler.h
new file mode 100644
index 0000000..a93d7a1
--- /dev/null
+++ b/tensorflow/core/platform/default/protobuf_compiler.h
@@ -0,0 +1,25 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_PLATFORM_DEFAULT_PROTOBUF_COMPILER_H_
+#define TENSORFLOW_CORE_PLATFORM_DEFAULT_PROTOBUF_COMPILER_H_
+
+// IWYU pragma: private, include "third_party/tensorflow/core/platform/protobuf_compiler.h"
+// IWYU pragma: friend third_party/tensorflow/core/platform/protobuf_compiler.h
+
+#include "google/protobuf/compiler/importer.h"
+#include "tensorflow/core/platform/default/protobuf.h"
+
+#endif  // TENSORFLOW_CORE_PLATFORM_DEFAULT_PROTOBUF_COMPILER_H_
diff --git a/tensorflow/core/platform/protobuf_compiler.h b/tensorflow/core/platform/protobuf_compiler.h
new file mode 100644
index 0000000..29679e0
--- /dev/null
+++ b/tensorflow/core/platform/protobuf_compiler.h
@@ -0,0 +1,25 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_PLATFORM_PROTOBUF_COMPILER_H_
+#define TENSORFLOW_PLATFORM_PROTOBUF_COMPILER_H_
+
+#if defined(PLATFORM_GOOGLE) && !defined(USE_DEFAULT_PROTOBUF)
+#include "tensorflow/core/platform/google/protobuf_compiler.h"
+#else
+#include "tensorflow/core/platform/default/protobuf_compiler.h"
+#endif
+
+#endif  // TENSORFLOW_PLATFORM_PROTOBUF_COMPILER_H_
diff --git a/tensorflow/core/platform/s3/s3_file_system.cc b/tensorflow/core/platform/s3/s3_file_system.cc
index d5f5dec..462113f 100644
--- a/tensorflow/core/platform/s3/s3_file_system.cc
+++ b/tensorflow/core/platform/s3/s3_file_system.cc
@@ -26,7 +26,6 @@
 #include <aws/core/utils/StringUtils.h>
 #include <aws/core/utils/logging/AWSLogging.h>
 #include <aws/core/utils/logging/LogSystemInterface.h>
-#include <aws/core/utils/StringUtils.h>
 #include <aws/s3/S3Client.h>
 #include <aws/s3/S3Errors.h>
 #include <aws/s3/model/CopyObjectRequest.h>
@@ -254,10 +253,8 @@
     outfile_->clear();
     outfile_->seekp(offset);
     if (!putObjectOutcome.IsSuccess()) {
-      string error = strings::StrCat(
-          putObjectOutcome.GetError().GetExceptionName().c_str(), ": ",
-          putObjectOutcome.GetError().GetMessage().c_str());
-      return errors::Internal(error);
+      return errors::Unknown(putObjectOutcome.GetError().GetExceptionName(),
+                             ": ", putObjectOutcome.GetError().GetMessage());
     }
     return Status::OK();
   }
@@ -410,10 +407,8 @@
     auto listObjectsOutcome =
         this->GetS3Client()->ListObjects(listObjectsRequest);
     if (!listObjectsOutcome.IsSuccess()) {
-      string error = strings::StrCat(
-          listObjectsOutcome.GetError().GetExceptionName().c_str(), ": ",
-          listObjectsOutcome.GetError().GetMessage().c_str());
-      return errors::Internal(error);
+      return errors::Unknown(listObjectsOutcome.GetError().GetExceptionName(),
+                             ": ", listObjectsOutcome.GetError().GetMessage());
     }
 
     listObjectsResult = listObjectsOutcome.GetResult();
@@ -447,10 +442,8 @@
     headBucketRequest.WithBucket(bucket.c_str());
     auto headBucketOutcome = this->GetS3Client()->HeadBucket(headBucketRequest);
     if (!headBucketOutcome.IsSuccess()) {
-      string error = strings::StrCat(
-          headBucketOutcome.GetError().GetExceptionName().c_str(), ": ",
-          headBucketOutcome.GetError().GetMessage().c_str());
-      return errors::Internal(error);
+      return errors::Unknown(headBucketOutcome.GetError().GetExceptionName(),
+                             ": ", headBucketOutcome.GetError().GetMessage());
     }
     stats->length = 0;
     stats->is_directory = 1;
@@ -511,10 +504,8 @@
   auto deleteObjectOutcome =
       this->GetS3Client()->DeleteObject(deleteObjectRequest);
   if (!deleteObjectOutcome.IsSuccess()) {
-    string error = strings::StrCat(
-        deleteObjectOutcome.GetError().GetExceptionName().c_str(), ": ",
-        deleteObjectOutcome.GetError().GetMessage().c_str());
-    return errors::Internal(error);
+    return errors::Unknown(deleteObjectOutcome.GetError().GetExceptionName(),
+                           ": ", deleteObjectOutcome.GetError().GetMessage());
   }
   return Status::OK();
 }
@@ -612,10 +603,8 @@
     auto listObjectsOutcome =
         this->GetS3Client()->ListObjects(listObjectsRequest);
     if (!listObjectsOutcome.IsSuccess()) {
-      string error = strings::StrCat(
-          listObjectsOutcome.GetError().GetExceptionName().c_str(), ": ",
-          listObjectsOutcome.GetError().GetMessage().c_str());
-      return errors::Internal(error);
+      return errors::Unknown(listObjectsOutcome.GetError().GetExceptionName(),
+                             ": ", listObjectsOutcome.GetError().GetMessage());
     }
 
     listObjectsResult = listObjectsOutcome.GetResult();
@@ -633,10 +622,8 @@
       auto copyObjectOutcome =
           this->GetS3Client()->CopyObject(copyObjectRequest);
       if (!copyObjectOutcome.IsSuccess()) {
-        string error = strings::StrCat(
-            copyObjectOutcome.GetError().GetExceptionName().c_str(), ": ",
-            copyObjectOutcome.GetError().GetMessage().c_str());
-        return errors::Internal(error);
+        return errors::Unknown(copyObjectOutcome.GetError().GetExceptionName(),
+                               ": ", copyObjectOutcome.GetError().GetMessage());
       }
 
       deleteObjectRequest.SetBucket(src_bucket.c_str());
@@ -645,10 +632,9 @@
       auto deleteObjectOutcome =
           this->GetS3Client()->DeleteObject(deleteObjectRequest);
       if (!deleteObjectOutcome.IsSuccess()) {
-        string error = strings::StrCat(
-            deleteObjectOutcome.GetError().GetExceptionName().c_str(), ": ",
-            deleteObjectOutcome.GetError().GetMessage().c_str());
-        return errors::Internal(error);
+        return errors::Unknown(
+            deleteObjectOutcome.GetError().GetExceptionName(), ": ",
+            deleteObjectOutcome.GetError().GetMessage());
       }
     }
     listObjectsRequest.SetMarker(listObjectsResult.GetNextMarker());
diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 6f564e7..5635641 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -24,7 +24,7 @@
 
 // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
 // "-beta", "-rc", "-rc.1")
-#define TF_VERSION_SUFFIX "-rc1"
+#define TF_VERSION_SUFFIX ""
 
 #define TF_STR_HELPER(x) #x
 #define TF_STR(x) TF_STR_HELPER(x)
diff --git a/tensorflow/core/util/env_var.h b/tensorflow/core/util/env_var.h
index 47f9ff3..724ca35 100644
--- a/tensorflow/core/util/env_var.h
+++ b/tensorflow/core/util/env_var.h
@@ -13,7 +13,8 @@
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_UTIL_ENV_VAR_H_
+#ifndef TENSORFLOW_CORE_UTIL_ENV_VAR_H_
+#define TENSORFLOW_CORE_UTIL_ENV_VAR_H_
 
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
@@ -42,4 +43,4 @@
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_UTIL_ENV_VAR_H_
+#endif  // TENSORFLOW_CORE_UTIL_ENV_VAR_H_
diff --git a/tensorflow/core/util/events_writer.cc b/tensorflow/core/util/events_writer.cc
index c50e329..aaaba91 100644
--- a/tensorflow/core/util/events_writer.cc
+++ b/tensorflow/core/util/events_writer.cc
@@ -69,6 +69,10 @@
                       static_cast<int64>(time_in_seconds),
                       port::Hostname().c_str(), file_suffix_.c_str());
 
+  // Reset recordio_writer (which has a reference to recordio_file_) so final
+  // Flush() and Close() calls have access to recordio_file_.
+  recordio_writer_.reset();
+
   TF_RETURN_WITH_CONTEXT_IF_ERROR(
       env_->NewWritableFile(filename_, &recordio_file_),
       "Creating writable file ", filename_);
diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index a66b121..422be93 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -22,7 +22,17 @@
 #include <utility>
 #include <vector>
 
-#ifdef INTEL_MKL_ML
+#if defined(INTEL_MKL_ML_ONLY) || defined(INTEL_MKL_DNN_ONLY)
+#ifndef INTEL_MKL
+#error "INTEL_MKL_{ML,DNN}_ONLY require INTEL_MKL"
+#endif
+#endif
+
+#if defined(INTEL_MKL_ML_ONLY) && defined(INTEL_MKL_DNN_ONLY)
+#error "at most one of INTEL_MKL_ML_ONLY and INTEL_MKL_DNN_ONLY may be defined"
+#endif
+
+#ifdef INTEL_MKL_ML_ONLY
 #include "mkl_dnn.h"
 #include "mkl_dnn_types.h"
 #include "mkl_service.h"
@@ -40,7 +50,8 @@
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/util/padding.h"
 #include "tensorflow/core/util/tensor_format.h"
-#ifndef INTEL_MKL_ML
+
+#ifndef INTEL_MKL_ML_ONLY
 #include "mkldnn.hpp"
 #include "tensorflow/core/lib/core/stringpiece.h"
 
@@ -76,7 +87,17 @@
   Dim_I = 1
 } MklDnnDims;
 
-#ifdef INTEL_MKL_ML
+typedef enum {
+  Dim3d_N = 0,
+  Dim3d_C = 1,
+  Dim3d_D = 2,
+  Dim3d_H = 3,
+  Dim3d_W = 4,
+  Dim3d_O = 0,
+  Dim3d_I = 1
+} MklDnnDims3D;
+
+#ifdef INTEL_MKL_ML_ONLY
 class MklShape {
  public:
   MklShape() {}
@@ -340,6 +361,7 @@
 #else
 
 // Forward decl
+TensorFormat MklDnn3DDataFormatToTFDataFormat(memory::format format);
 TensorFormat MklDnnDataFormatToTFDataFormat(memory::format format);
 memory::dims CalculateTFStrides(const memory::dims& dims_tf_order);
 memory::desc CreateBlockedMemDescHelper(const memory::dims& dim,
@@ -442,6 +464,13 @@
     return this->DimSize(index);
   }
 
+  inline size_t GetDimension3D(char dimension) const {
+    int index = GetMklDnnTensor3DDimIndex(dimension);
+    CHECK(index >= 0 && index < this->GetDimension())
+        << "Invalid index from the dimension: " << index << ", " << dimension;
+    return this->DimSize(index);
+  }
+
   inline int32 GetMklDnnTensorDimIndex(char dimension) const {
     switch (dimension) {
       case 'N':
@@ -458,6 +487,24 @@
     }
   }
 
+  inline int32 GetMklDnnTensor3DDimIndex(char dimension) const {
+    switch (dimension) {
+      case 'N':
+        return MklDnnDims3D::Dim3d_N;
+      case 'C':
+        return MklDnnDims3D::Dim3d_C;
+      case 'D':
+        return MklDnnDims3D::Dim3d_D;
+      case 'H':
+        return MklDnnDims3D::Dim3d_H;
+      case 'W':
+        return MklDnnDims3D::Dim3d_W;
+      default:
+        LOG(FATAL) << "Invalid dimension: " << dimension;
+        return -1;  // Avoid compiler warning about missing return value
+    }
+  }
+
   inline size_t GetDimension() const { return data_.dimension_; }
   inline const int* GetSizes() const {
     return reinterpret_cast<const int*>(&data_.sizes_[0]);
@@ -576,13 +623,26 @@
   }
 
   inline void SetTfDimOrder(const size_t dimension, TensorFormat data_format) {
-    // TODO(nhasabni): Why do we restrict this to 4D?
-    CHECK_EQ(dimension, 4);
-    CHECK(dimension == data_.dimension_);
-    data_.map_[GetTensorDimIndex<2>(data_format, 'W')] = MklDnnDims::Dim_W;
-    data_.map_[GetTensorDimIndex<2>(data_format, 'H')] = MklDnnDims::Dim_H;
-    data_.map_[GetTensorDimIndex<2>(data_format, 'C')] = MklDnnDims::Dim_C;
-    data_.map_[GetTensorDimIndex<2>(data_format, 'N')] = MklDnnDims::Dim_N;
+    if (dimension == 5) {
+      CHECK(dimension == data_.dimension_);
+      data_.map_[GetTensorDimIndex<3>(data_format, '0')] =
+          MklDnnDims3D::Dim3d_D;
+      data_.map_[GetTensorDimIndex<3>(data_format, '1')] =
+          MklDnnDims3D::Dim3d_H;
+      data_.map_[GetTensorDimIndex<3>(data_format, '2')] =
+          MklDnnDims3D::Dim3d_W;
+      data_.map_[GetTensorDimIndex<3>(data_format, 'C')] =
+          MklDnnDims3D::Dim3d_C;
+      data_.map_[GetTensorDimIndex<3>(data_format, 'N')] =
+          MklDnnDims3D::Dim3d_N;
+    } else {
+      CHECK_EQ(dimension, 4);
+      CHECK(dimension == data_.dimension_);
+      data_.map_[GetTensorDimIndex<2>(data_format, 'W')] = MklDnnDims::Dim_W;
+      data_.map_[GetTensorDimIndex<2>(data_format, 'H')] = MklDnnDims::Dim_H;
+      data_.map_[GetTensorDimIndex<2>(data_format, 'C')] = MklDnnDims::Dim_C;
+      data_.map_[GetTensorDimIndex<2>(data_format, 'N')] = MklDnnDims::Dim_N;
+    }
   }
 
   inline void SetTfDimOrder(const size_t dimension, memory::format format) {
@@ -670,14 +730,13 @@
 
 // List of MklShape objects. Used in Concat/Split layers.
 
-
-#ifndef INTEL_MKL_ML
+#ifndef INTEL_MKL_ML_ONLY
 typedef std::vector<MklDnnShape> MklDnnShapeList;
 #else
 typedef std::vector<MklShape> MklShapeList;
 #endif
 
-#ifdef INTEL_MKL_ML
+#ifdef INTEL_MKL_ML_ONLY
 // Check if all tensors specified by MklShapes are MKL tensors.
 inline bool AreAllMklTensors(const MklShapeList& shapes) {
   for (auto& s : shapes) {
@@ -760,7 +819,7 @@
 #endif
 
 // Get the MKL shape from the second string tensor
-#ifdef INTEL_MKL_ML
+#ifdef INTEL_MKL_ML_ONLY
 inline void GetMklShape(OpKernelContext* ctext, int n, MklShape* mklshape) {
   mklshape->DeSerializeMklShape(
       ctext->input(GetTensorMetaDataIndex(n, ctext->num_inputs()))
@@ -795,7 +854,7 @@
   ctext->input_list(name, input_tensors);
 }
 
-#ifdef INTEL_MKL_ML
+#ifdef INTEL_MKL_ML_ONLY
 
 inline void GetMklShapeList(OpKernelContext* ctext, StringPiece name,
                             MklShapeList* mkl_shapes) {
@@ -825,7 +884,7 @@
 
 #endif
 
-#ifndef INTEL_MKL_ML
+#ifndef INTEL_MKL_ML_ONLY
 /// Get shape of input tensor pointed by 'input_idx' in TensorShape format.
 /// If the input tensor is in MKL layout, then obtains TensorShape from
 /// MklShape.
@@ -845,7 +904,7 @@
 }
 #endif
 
-#ifdef INTEL_MKL_ML
+#ifdef INTEL_MKL_ML_ONLY
 // Allocate the second output tensor that will contain
 // the MKL shape serialized
 inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n,
@@ -878,7 +937,7 @@
 }
 #endif
 
-#ifdef INTEL_MKL_ML
+#ifdef INTEL_MKL_ML_ONLY
 // Allocate the output tensor, create a second output tensor that will contain
 // the MKL shape serialized
 inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n,
@@ -923,7 +982,7 @@
 
 // Allocates a temp tensor and returns the data buffer for temporary storage.
 // Currently
-#ifndef INTEL_MKL_ML
+#ifndef INTEL_MKL_ML_ONLY
 template <typename T>
 inline void AllocTmpBuffer(OpKernelContext* context, Tensor* tensor_out,
                            const memory::primitive_desc& pd, void** buf_out) {
@@ -972,7 +1031,7 @@
   }
 }
 
-#ifdef INTEL_MKL_ML
+#ifdef INTEL_MKL_ML_ONLY
 inline void MklSizesToTFSizes(OpKernelContext* context,
                               TensorFormat data_format_,
                               const MklShape& mkl_shape,
@@ -1016,7 +1075,7 @@
   }
 }
 
-#ifdef INTEL_MKL_ML
+#ifdef INTEL_MKL_ML_ONLY
 inline int64 GetMklTensorDim(const MklShape& mkl_shape, char dimension) {
   int index = GetMklTensorDimIndex(dimension);
   CHECK(index >= 0 && index < mkl_shape.GetDimension())
@@ -1046,7 +1105,7 @@
   context->set_output(idx_meta_out, meta_output);
 }
 
-#ifdef INTEL_MKL_ML
+#ifdef INTEL_MKL_ML_ONLY
 inline void CopyTfTensorInToOutWithShape(OpKernelContext* context, int idx_in,
                                          int idx_out,
                                          const TensorShape& shape) {
@@ -1084,7 +1143,7 @@
 }
 #endif
 
-#ifdef INTEL_MKL_ML
+#ifdef INTEL_MKL_ML_ONLY
 
 inline void ForwardTfTensorInToOut(OpKernelContext* context, int idx_in,
                                    int idx_out) {
@@ -1142,7 +1201,7 @@
   }
 }
 
-#ifndef INTEL_MKL_ML
+#ifndef INTEL_MKL_ML_ONLY
 // Set a dummy MKLDNN shape (called when the output is in TF format)
 inline void SetDummyMklDnnShapeOutput(OpKernelContext* context,
                                       uint32 idx_data_out) {
@@ -1186,7 +1245,7 @@
   }
 }
 
-#ifdef INTEL_MKL_ML
+#ifdef INTEL_MKL_ML_ONLY
 // Set a dummy MKL shape (called when the output is in TF format)
 inline void SetDummyMklShapeOutput(OpKernelContext* context,
                                    uint32 idx_data_out) {
@@ -1303,7 +1362,7 @@
 #endif
 // -------------------------------------------------------------------
 
-#ifndef INTEL_MKL_ML
+#ifndef INTEL_MKL_ML_ONLY
 
 /// Return MKL-DNN data type (memory::data_type) for input type T
 ///
@@ -1319,6 +1378,19 @@
   return memory::data_type::f32;
 }
 
+/// Map TensorFlow's data format into MKL-DNN 3D data format
+/// @input: TensorFlow data format
+/// @return: memory::format corresponding to TensorFlow data format;
+///          Fails with an error if invalid data format.
+inline memory::format TFDataFormatToMklDnn3DDataFormat(TensorFormat format) {
+  if (format == FORMAT_NHWC)
+    return memory::format::ndhwc;
+  else if (format == FORMAT_NCHW)
+    return memory::format::ncdhw;
+  TF_CHECK_OK(Status(error::Code::INVALID_ARGUMENT, "Unsupported data format"));
+  return memory::format::format_undef;
+}
+
 /// Map TensorFlow's data format into MKL-DNN data format
 ///
 /// @input: TensorFlow data format
@@ -1330,7 +1402,6 @@
   else if (format == FORMAT_NCHW)
     return memory::format::nchw;
   TF_CHECK_OK(Status(error::Code::INVALID_ARGUMENT, "Unsupported data format"));
-  // Return to get rid of compiler warning
   return memory::format::format_undef;
 }
 
@@ -1340,9 +1411,9 @@
 /// @return: Tensorflow data format corresponding to memory::format
 ///          Fails with an error if invalid data format.
 inline TensorFormat MklDnnDataFormatToTFDataFormat(memory::format format) {
-  if (format == memory::format::nhwc)
+  if (format == memory::format::nhwc || format == memory::format::ndhwc)
     return FORMAT_NHWC;
-  else if (format == memory::format::nchw)
+  else if (format == memory::format::nchw || format == memory::format::ncdhw)
     return FORMAT_NCHW;
   TF_CHECK_OK(Status(error::Code::INVALID_ARGUMENT, "Unsupported data format"));
 
@@ -1392,6 +1463,22 @@
   return memory::dims({n, c, h, w});
 }
 
+inline memory::dims TFShapeToMklDnnDimsInNCDHW(const TensorShape& shape,
+                                               TensorFormat format) {
+  // Check validity of format.
+  CHECK_NE(TFDataFormatToMklDnn3DDataFormat(format),
+           memory::format::format_undef);
+
+  int n = shape.dim_size(GetTensorDimIndex<3>(format, 'N'));
+  int c = shape.dim_size(GetTensorDimIndex<3>(format, 'C'));
+  int d = shape.dim_size(GetTensorDimIndex<3>(format, '0'));
+  int h = shape.dim_size(GetTensorDimIndex<3>(format, '1'));
+  int w = shape.dim_size(GetTensorDimIndex<3>(format, '2'));
+
+  // MKL-DNN requires dimensions in NCDHW format.
+  return memory::dims({n, c, d, h, w});
+}
+
 /// Overloaded version of function above. Input parameters are
 /// self-explanatory.
 inline memory::dims MklDnnDimsInNCHW(const memory::dims& in_dims,
@@ -1504,6 +1591,8 @@
 
   /// Operations memory descriptor
   memory::desc* op_md_;
+  // flag to indicate if data is 3D or not.
+  bool bIs3D;
   /// Operations temp buffer
   void* allocated_buffer_;
   /// CPU engine on which operation will be executed
@@ -1530,6 +1619,10 @@
         static_cast<const void*>(tensor->flat<T>().data()));
   }
 
+  void SetIs3DData(bool bIs3D_) { bIs3D = bIs3D_; }
+
+  bool GetIs3D() { return bIs3D; }
+
   /// Set user memory primitive using specified dimensions, memory format and
   /// data_buffer. Function automatically uses element data type by using
   /// input type T used for creating call object.
diff --git a/tensorflow/core/util/mkl_util_test.cc b/tensorflow/core/util/mkl_util_test.cc
index cd1d071..4f837f1 100644
--- a/tensorflow/core/util/mkl_util_test.cc
+++ b/tensorflow/core/util/mkl_util_test.cc
@@ -22,7 +22,7 @@
 namespace tensorflow {
 namespace {
 
-#ifndef INTEL_MKL_ML
+#ifndef INTEL_MKL_ML_ONLY
 
 TEST(MklUtilTest, MklDnnTfShape) {
   auto cpu_engine = engine(engine::cpu, 0);
@@ -84,7 +84,7 @@
   EXPECT_EQ(b_md2.data.format, mkldnn_blocked);
 }
 
-#endif  // INTEL_MKL_ML
+#endif  // INTEL_MKL_ML_ONLY
 }  // namespace
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/util/tensor_format.cc b/tensorflow/core/util/tensor_format.cc
index a5f7ecf..f331973 100644
--- a/tensorflow/core/util/tensor_format.cc
+++ b/tensorflow/core/util/tensor_format.cc
@@ -25,6 +25,10 @@
   return "data_format: { 'NDHWC', 'NCDHW' } = 'NDHWC' ";
 }
 
+string GetConvnetDataFormat2D3DAttrString() {
+  return "data_format: { 'NHWC', 'NCHW', 'NDHWC', 'NCDHW' } = 'NHWC' ";
+}
+
 string GetConvnetFilterFormatAttrString() {
   return "filter_format: { 'HWIO', 'OIHW' } = 'HWIO' ";
 }
diff --git a/tensorflow/core/util/tensor_format.h b/tensorflow/core/util/tensor_format.h
index 918835e..b0c349d 100644
--- a/tensorflow/core/util/tensor_format.h
+++ b/tensorflow/core/util/tensor_format.h
@@ -483,6 +483,7 @@
 // Return the string that specifies the filter format for convnet operations.
 string GetConvnetFilterFormatAttrString();
 string GetConvnet3dFilterFormatAttrString();
+string GetConvnetDataFormat2D3DAttrString();
 
 // Returns a tensor shape for the specified format and dimension sizes.
 // Works for both 2D and 3D operations. The output shapes are as follows:
diff --git a/tensorflow/docs_src/about/index.md b/tensorflow/docs_src/about/index.md
index dc1e9af..c3c13ff 100644
--- a/tensorflow/docs_src/about/index.md
+++ b/tensorflow/docs_src/about/index.md
@@ -3,9 +3,9 @@
 This section provides a few documents about TensorFlow itself,
 including the following:
 
-  * @{$uses$TensorFlow in Use}, which provides a link to our model zoo and
+  * [TensorFlow in Use](../about/uses.md), which provides a link to our model zoo and
     lists some popular ways that TensorFlow is being used.
-  * @{$bib$TensorFlow White Papers}, which provides abstracts of white papers
+  * [TensorFlow White Papers](../about/bib.md), which provides abstracts of white papers
     about TensorFlow.
-  * @{$attribution$Attribution}, which specifies how to attribute and refer
+  * [Attribution](../about/attribution.md), which specifies how to attribute and refer
     to TensorFlow.
diff --git a/tensorflow/docs_src/api_guides/cc/guide.md b/tensorflow/docs_src/api_guides/cc/guide.md
index 4e51ada..2cd645a 100644
--- a/tensorflow/docs_src/api_guides/cc/guide.md
+++ b/tensorflow/docs_src/api_guides/cc/guide.md
@@ -7,6 +7,12 @@
 [`master` version of this doc](https://www.tensorflow.org/versions/master/api_guides/cc/guide),
 in case there have been any changes.
 
+Note: The C++ API is only designed to work with TensorFlow `bazel build`.
+If you need a stand-alone option, use the [C API](../../install/install_c.md).
+See [these instructions](https://docs.bazel.build/versions/master/external.html)
+for details on how to include TensorFlow as a subproject (instead of building
+your project from inside TensorFlow, as in this example).
+
 [TOC]
 
 TensorFlow's C++ API provides mechanisms for constructing and executing a data
@@ -92,7 +98,7 @@
 
 ### Scope
 
-@{tensorflow::Scope} is the main data structure that holds the current state
+`tensorflow::Scope` is the main data structure that holds the current state
 of graph construction. A `Scope` acts as a handle to the graph being
 constructed, as well as storing TensorFlow operation properties. The `Scope`
 object is the first argument to operation constructors, and operations that use
@@ -102,7 +108,7 @@
 
 Create a new `Scope` object by calling `Scope::NewRootScope`. This creates
 some resources such as a graph to which operations are added. It also creates a
-@{tensorflow::Status} object which will be used to indicate errors encountered
+`tensorflow::Status` object which will be used to indicate errors encountered
 when constructing operations. The `Scope` class has value semantics, thus, a
 `Scope` object can be freely copied and passed around.
 
@@ -121,7 +127,7 @@
 * Device placement for an operation
 * Kernel attribute for an operation
 
-Please refer to @{tensorflow::Scope} for the complete list of member functions
+Please refer to `tensorflow::Scope` for the complete list of member functions
 that let you create child scopes with new properties.
 
 ### Operation Constructors
@@ -213,7 +219,7 @@
 
 You may pass many different types of C++ values directly to tensor
 constants. You may explicitly create a tensor constant by calling the
-@{tensorflow::ops::Const} function from various kinds of C++ values. For
+`tensorflow::ops::Const` function from various kinds of C++ values. For
 example:
 
 * Scalars
@@ -257,7 +263,7 @@
 ## Graph Execution
 
 When executing a graph, you will need a session. The C++ API provides a
-@{tensorflow::ClientSession} class that will execute ops created by the
+`tensorflow::ClientSession` class that will execute ops created by the
 operation constructors. TensorFlow will automatically determine which parts of
 the graph need to be executed, and what values need feeding. For example:
 
@@ -291,5 +297,5 @@
 // outputs[0] == [4 5; 6 7]
 ```
 
-Please see the @{tensorflow::Tensor} documentation for more information on how
+Please see the `tensorflow::Tensor` documentation for more information on how
 to use the execution output.
diff --git a/tensorflow/docs_src/api_guides/python/array_ops.md b/tensorflow/docs_src/api_guides/python/array_ops.md
index a34f01f..ddeea80 100644
--- a/tensorflow/docs_src/api_guides/python/array_ops.md
+++ b/tensorflow/docs_src/api_guides/python/array_ops.md
@@ -1,7 +1,7 @@
 # Tensor Transformations
 
 Note: Functions taking `Tensor` arguments can also take anything accepted by
-@{tf.convert_to_tensor}.
+`tf.convert_to_tensor`.
 
 [TOC]
 
@@ -10,78 +10,78 @@
 TensorFlow provides several operations that you can use to cast tensor data
 types in your graph.
 
-*   @{tf.string_to_number}
-*   @{tf.to_double}
-*   @{tf.to_float}
-*   @{tf.to_bfloat16}
-*   @{tf.to_int32}
-*   @{tf.to_int64}
-*   @{tf.cast}
-*   @{tf.bitcast}
-*   @{tf.saturate_cast}
+*   `tf.string_to_number`
+*   `tf.to_double`
+*   `tf.to_float`
+*   `tf.to_bfloat16`
+*   `tf.to_int32`
+*   `tf.to_int64`
+*   `tf.cast`
+*   `tf.bitcast`
+*   `tf.saturate_cast`
 
 ## Shapes and Shaping
 
 TensorFlow provides several operations that you can use to determine the shape
 of a tensor and change the shape of a tensor.
 
-*   @{tf.broadcast_dynamic_shape}
-*   @{tf.broadcast_static_shape}
-*   @{tf.shape}
-*   @{tf.shape_n}
-*   @{tf.size}
-*   @{tf.rank}
-*   @{tf.reshape}
-*   @{tf.squeeze}
-*   @{tf.expand_dims}
-*   @{tf.meshgrid}
+*   `tf.broadcast_dynamic_shape`
+*   `tf.broadcast_static_shape`
+*   `tf.shape`
+*   `tf.shape_n`
+*   `tf.size`
+*   `tf.rank`
+*   `tf.reshape`
+*   `tf.squeeze`
+*   `tf.expand_dims`
+*   `tf.meshgrid`
 
 ## Slicing and Joining
 
 TensorFlow provides several operations to slice or extract parts of a tensor,
 or join multiple tensors together.
 
-*   @{tf.slice}
-*   @{tf.strided_slice}
-*   @{tf.split}
-*   @{tf.tile}
-*   @{tf.pad}
-*   @{tf.concat}
-*   @{tf.stack}
-*   @{tf.parallel_stack}
-*   @{tf.unstack}
-*   @{tf.reverse_sequence}
-*   @{tf.reverse}
-*   @{tf.reverse_v2}
-*   @{tf.transpose}
-*   @{tf.extract_image_patches}
-*   @{tf.space_to_batch_nd}
-*   @{tf.space_to_batch}
-*   @{tf.required_space_to_batch_paddings}
-*   @{tf.batch_to_space_nd}
-*   @{tf.batch_to_space}
-*   @{tf.space_to_depth}
-*   @{tf.depth_to_space}
-*   @{tf.gather}
-*   @{tf.gather_nd}
-*   @{tf.unique_with_counts}
-*   @{tf.scatter_nd}
-*   @{tf.dynamic_partition}
-*   @{tf.dynamic_stitch}
-*   @{tf.boolean_mask}
-*   @{tf.one_hot}
-*   @{tf.sequence_mask}
-*   @{tf.dequantize}
-*   @{tf.quantize_v2}
-*   @{tf.quantized_concat}
-*   @{tf.setdiff1d}
+*   `tf.slice`
+*   `tf.strided_slice`
+*   `tf.split`
+*   `tf.tile`
+*   `tf.pad`
+*   `tf.concat`
+*   `tf.stack`
+*   `tf.parallel_stack`
+*   `tf.unstack`
+*   `tf.reverse_sequence`
+*   `tf.reverse`
+*   `tf.reverse_v2`
+*   `tf.transpose`
+*   `tf.extract_image_patches`
+*   `tf.space_to_batch_nd`
+*   `tf.space_to_batch`
+*   `tf.required_space_to_batch_paddings`
+*   `tf.batch_to_space_nd`
+*   `tf.batch_to_space`
+*   `tf.space_to_depth`
+*   `tf.depth_to_space`
+*   `tf.gather`
+*   `tf.gather_nd`
+*   `tf.unique_with_counts`
+*   `tf.scatter_nd`
+*   `tf.dynamic_partition`
+*   `tf.dynamic_stitch`
+*   `tf.boolean_mask`
+*   `tf.one_hot`
+*   `tf.sequence_mask`
+*   `tf.dequantize`
+*   `tf.quantize_v2`
+*   `tf.quantized_concat`
+*   `tf.setdiff1d`
 
 ## Fake quantization
 Operations used to help train for better quantization accuracy.
 
-*   @{tf.fake_quant_with_min_max_args}
-*   @{tf.fake_quant_with_min_max_args_gradient}
-*   @{tf.fake_quant_with_min_max_vars}
-*   @{tf.fake_quant_with_min_max_vars_gradient}
-*   @{tf.fake_quant_with_min_max_vars_per_channel}
-*   @{tf.fake_quant_with_min_max_vars_per_channel_gradient}
+*   `tf.fake_quant_with_min_max_args`
+*   `tf.fake_quant_with_min_max_args_gradient`
+*   `tf.fake_quant_with_min_max_vars`
+*   `tf.fake_quant_with_min_max_vars_gradient`
+*   `tf.fake_quant_with_min_max_vars_per_channel`
+*   `tf.fake_quant_with_min_max_vars_per_channel_gradient`
diff --git a/tensorflow/docs_src/api_guides/python/check_ops.md b/tensorflow/docs_src/api_guides/python/check_ops.md
index 6f8a18a..b52fdaa 100644
--- a/tensorflow/docs_src/api_guides/python/check_ops.md
+++ b/tensorflow/docs_src/api_guides/python/check_ops.md
@@ -1,19 +1,19 @@
 # Asserts and boolean checks
 
-*   @{tf.assert_negative}
-*   @{tf.assert_positive}
-*   @{tf.assert_proper_iterable}
-*   @{tf.assert_non_negative}
-*   @{tf.assert_non_positive}
-*   @{tf.assert_equal}
-*   @{tf.assert_integer}
-*   @{tf.assert_less}
-*   @{tf.assert_less_equal}
-*   @{tf.assert_greater}
-*   @{tf.assert_greater_equal}
-*   @{tf.assert_rank}
-*   @{tf.assert_rank_at_least}
-*   @{tf.assert_type}
-*   @{tf.is_non_decreasing}
-*   @{tf.is_numeric_tensor}
-*   @{tf.is_strictly_increasing}
+*   `tf.assert_negative`
+*   `tf.assert_positive`
+*   `tf.assert_proper_iterable`
+*   `tf.assert_non_negative`
+*   `tf.assert_non_positive`
+*   `tf.assert_equal`
+*   `tf.assert_integer`
+*   `tf.assert_less`
+*   `tf.assert_less_equal`
+*   `tf.assert_greater`
+*   `tf.assert_greater_equal`
+*   `tf.assert_rank`
+*   `tf.assert_rank_at_least`
+*   `tf.assert_type`
+*   `tf.is_non_decreasing`
+*   `tf.is_numeric_tensor`
+*   `tf.is_strictly_increasing`
diff --git a/tensorflow/docs_src/api_guides/python/client.md b/tensorflow/docs_src/api_guides/python/client.md
index 27fc861..fdd48e6 100644
--- a/tensorflow/docs_src/api_guides/python/client.md
+++ b/tensorflow/docs_src/api_guides/python/client.md
@@ -3,34 +3,34 @@
 
 This library contains classes for launching graphs and executing operations.
 
-@{$guide/low_level_intro$This guide} has examples of how a graph
-is launched in a @{tf.Session}.
+[This guide](../../guide/low_level_intro.md) has examples of how a graph
+is launched in a `tf.Session`.
 
 ## Session management
 
-*   @{tf.Session}
-*   @{tf.InteractiveSession}
-*   @{tf.get_default_session}
+*   `tf.Session`
+*   `tf.InteractiveSession`
+*   `tf.get_default_session`
 
 ## Error classes and convenience functions
 
-*   @{tf.OpError}
-*   @{tf.errors.CancelledError}
-*   @{tf.errors.UnknownError}
-*   @{tf.errors.InvalidArgumentError}
-*   @{tf.errors.DeadlineExceededError}
-*   @{tf.errors.NotFoundError}
-*   @{tf.errors.AlreadyExistsError}
-*   @{tf.errors.PermissionDeniedError}
-*   @{tf.errors.UnauthenticatedError}
-*   @{tf.errors.ResourceExhaustedError}
-*   @{tf.errors.FailedPreconditionError}
-*   @{tf.errors.AbortedError}
-*   @{tf.errors.OutOfRangeError}
-*   @{tf.errors.UnimplementedError}
-*   @{tf.errors.InternalError}
-*   @{tf.errors.UnavailableError}
-*   @{tf.errors.DataLossError}
-*   @{tf.errors.exception_type_from_error_code}
-*   @{tf.errors.error_code_from_exception_type}
-*   @{tf.errors.raise_exception_on_not_ok_status}
+*   `tf.OpError`
+*   `tf.errors.CancelledError`
+*   `tf.errors.UnknownError`
+*   `tf.errors.InvalidArgumentError`
+*   `tf.errors.DeadlineExceededError`
+*   `tf.errors.NotFoundError`
+*   `tf.errors.AlreadyExistsError`
+*   `tf.errors.PermissionDeniedError`
+*   `tf.errors.UnauthenticatedError`
+*   `tf.errors.ResourceExhaustedError`
+*   `tf.errors.FailedPreconditionError`
+*   `tf.errors.AbortedError`
+*   `tf.errors.OutOfRangeError`
+*   `tf.errors.UnimplementedError`
+*   `tf.errors.InternalError`
+*   `tf.errors.UnavailableError`
+*   `tf.errors.DataLossError`
+*   `tf.errors.exception_type_from_error_code`
+*   `tf.errors.error_code_from_exception_type`
+*   `tf.errors.raise_exception_on_not_ok_status`
diff --git a/tensorflow/docs_src/api_guides/python/constant_op.md b/tensorflow/docs_src/api_guides/python/constant_op.md
index db3410c..9ba95b0 100644
--- a/tensorflow/docs_src/api_guides/python/constant_op.md
+++ b/tensorflow/docs_src/api_guides/python/constant_op.md
@@ -1,7 +1,7 @@
 # Constants, Sequences, and Random Values
 
 Note: Functions taking `Tensor` arguments can also take anything accepted by
-@{tf.convert_to_tensor}.
+`tf.convert_to_tensor`.
 
 [TOC]
 
@@ -9,17 +9,17 @@
 
 TensorFlow provides several operations that you can use to generate constants.
 
-*   @{tf.zeros}
-*   @{tf.zeros_like}
-*   @{tf.ones}
-*   @{tf.ones_like}
-*   @{tf.fill}
-*   @{tf.constant}
+*   `tf.zeros`
+*   `tf.zeros_like`
+*   `tf.ones`
+*   `tf.ones_like`
+*   `tf.fill`
+*   `tf.constant`
 
 ## Sequences
 
-*   @{tf.linspace}
-*   @{tf.range}
+*   `tf.linspace`
+*   `tf.range`
 
 ## Random Tensors
 
@@ -29,11 +29,11 @@
 
 The `seed` keyword argument in these functions acts in conjunction with
 the graph-level random seed. Changing either the graph-level seed using
-@{tf.set_random_seed} or the
+`tf.set_random_seed` or the
 op-level seed will change the underlying seed of these operations. Setting
 neither graph-level nor op-level seed, results in a random seed for all
 operations.
-See @{tf.set_random_seed}
+See `tf.set_random_seed`
 for details on the interaction between operation-level and graph-level random
 seeds.
 
@@ -64,7 +64,7 @@
 ```
 
 Another common use of random values is the initialization of variables. Also see
-the @{$variables$Variables How To}.
+the [Variables How To](../../guide/variables.md).
 
 ```python
 # Use random uniform values in [0, 1) as the initializer for a variable of shape
@@ -77,11 +77,11 @@
 print(sess.run(var))
 ```
 
-*   @{tf.random_normal}
-*   @{tf.truncated_normal}
-*   @{tf.random_uniform}
-*   @{tf.random_shuffle}
-*   @{tf.random_crop}
-*   @{tf.multinomial}
-*   @{tf.random_gamma}
-*   @{tf.set_random_seed}
+*   `tf.random_normal`
+*   `tf.truncated_normal`
+*   `tf.random_uniform`
+*   `tf.random_shuffle`
+*   `tf.random_crop`
+*   `tf.multinomial`
+*   `tf.random_gamma`
+*   `tf.set_random_seed`
diff --git a/tensorflow/docs_src/api_guides/python/contrib.crf.md b/tensorflow/docs_src/api_guides/python/contrib.crf.md
index 428383f..a544f13 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.crf.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.crf.md
@@ -2,10 +2,10 @@
 
 Linear-chain CRF layer.
 
-*   @{tf.contrib.crf.crf_sequence_score}
-*   @{tf.contrib.crf.crf_log_norm}
-*   @{tf.contrib.crf.crf_log_likelihood}
-*   @{tf.contrib.crf.crf_unary_score}
-*   @{tf.contrib.crf.crf_binary_score}
-*   @{tf.contrib.crf.CrfForwardRnnCell}
-*   @{tf.contrib.crf.viterbi_decode}
+*   `tf.contrib.crf.crf_sequence_score`
+*   `tf.contrib.crf.crf_log_norm`
+*   `tf.contrib.crf.crf_log_likelihood`
+*   `tf.contrib.crf.crf_unary_score`
+*   `tf.contrib.crf.crf_binary_score`
+*   `tf.contrib.crf.CrfForwardRnnCell`
+*   `tf.contrib.crf.viterbi_decode`
diff --git a/tensorflow/docs_src/api_guides/python/contrib.ffmpeg.md b/tensorflow/docs_src/api_guides/python/contrib.ffmpeg.md
index 2794868..7df7547 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.ffmpeg.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.ffmpeg.md
@@ -19,5 +19,5 @@
     waveform, file_format='wav', samples_per_second=44100)
 ```
 
-*   @{tf.contrib.ffmpeg.decode_audio}
-*   @{tf.contrib.ffmpeg.encode_audio}
+*   `tf.contrib.ffmpeg.decode_audio`
+*   `tf.contrib.ffmpeg.encode_audio`
diff --git a/tensorflow/docs_src/api_guides/python/contrib.framework.md b/tensorflow/docs_src/api_guides/python/contrib.framework.md
index 6b4ce3a..00fb8b0 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.framework.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.framework.md
@@ -3,62 +3,62 @@
 
 Framework utilities.
 
-*   @{tf.contrib.framework.assert_same_float_dtype}
-*   @{tf.contrib.framework.assert_scalar}
-*   @{tf.contrib.framework.assert_scalar_int}
-*   @{tf.convert_to_tensor_or_sparse_tensor}
-*   @{tf.contrib.framework.get_graph_from_inputs}
-*   @{tf.is_numeric_tensor}
-*   @{tf.is_non_decreasing}
-*   @{tf.is_strictly_increasing}
-*   @{tf.contrib.framework.is_tensor}
-*   @{tf.contrib.framework.reduce_sum_n}
-*   @{tf.contrib.framework.remove_squeezable_dimensions}
-*   @{tf.contrib.framework.with_shape}
-*   @{tf.contrib.framework.with_same_shape}
+*   `tf.contrib.framework.assert_same_float_dtype`
+*   `tf.contrib.framework.assert_scalar`
+*   `tf.contrib.framework.assert_scalar_int`
+*   `tf.convert_to_tensor_or_sparse_tensor`
+*   `tf.contrib.framework.get_graph_from_inputs`
+*   `tf.is_numeric_tensor`
+*   `tf.is_non_decreasing`
+*   `tf.is_strictly_increasing`
+*   `tf.contrib.framework.is_tensor`
+*   `tf.contrib.framework.reduce_sum_n`
+*   `tf.contrib.framework.remove_squeezable_dimensions`
+*   `tf.contrib.framework.with_shape`
+*   `tf.contrib.framework.with_same_shape`
 
 ## Deprecation
 
-*   @{tf.contrib.framework.deprecated}
-*   @{tf.contrib.framework.deprecated_args}
-*   @{tf.contrib.framework.deprecated_arg_values}
+*   `tf.contrib.framework.deprecated`
+*   `tf.contrib.framework.deprecated_args`
+*   `tf.contrib.framework.deprecated_arg_values`
 
 ## Arg_Scope
 
-*   @{tf.contrib.framework.arg_scope}
-*   @{tf.contrib.framework.add_arg_scope}
-*   @{tf.contrib.framework.has_arg_scope}
-*   @{tf.contrib.framework.arg_scoped_arguments}
+*   `tf.contrib.framework.arg_scope`
+*   `tf.contrib.framework.add_arg_scope`
+*   `tf.contrib.framework.has_arg_scope`
+*   `tf.contrib.framework.arg_scoped_arguments`
 
 ## Variables
 
-*   @{tf.contrib.framework.add_model_variable}
-*   @{tf.train.assert_global_step}
-*   @{tf.contrib.framework.assert_or_get_global_step}
-*   @{tf.contrib.framework.assign_from_checkpoint}
-*   @{tf.contrib.framework.assign_from_checkpoint_fn}
-*   @{tf.contrib.framework.assign_from_values}
-*   @{tf.contrib.framework.assign_from_values_fn}
-*   @{tf.contrib.framework.create_global_step}
-*   @{tf.contrib.framework.filter_variables}
-*   @{tf.train.get_global_step}
-*   @{tf.contrib.framework.get_or_create_global_step}
-*   @{tf.contrib.framework.get_local_variables}
-*   @{tf.contrib.framework.get_model_variables}
-*   @{tf.contrib.framework.get_unique_variable}
-*   @{tf.contrib.framework.get_variables_by_name}
-*   @{tf.contrib.framework.get_variables_by_suffix}
-*   @{tf.contrib.framework.get_variables_to_restore}
-*   @{tf.contrib.framework.get_variables}
-*   @{tf.contrib.framework.local_variable}
-*   @{tf.contrib.framework.model_variable}
-*   @{tf.contrib.framework.variable}
-*   @{tf.contrib.framework.VariableDeviceChooser}
-*   @{tf.contrib.framework.zero_initializer}
+*   `tf.contrib.framework.add_model_variable`
+*   `tf.train.assert_global_step`
+*   `tf.contrib.framework.assert_or_get_global_step`
+*   `tf.contrib.framework.assign_from_checkpoint`
+*   `tf.contrib.framework.assign_from_checkpoint_fn`
+*   `tf.contrib.framework.assign_from_values`
+*   `tf.contrib.framework.assign_from_values_fn`
+*   `tf.contrib.framework.create_global_step`
+*   `tf.contrib.framework.filter_variables`
+*   `tf.train.get_global_step`
+*   `tf.contrib.framework.get_or_create_global_step`
+*   `tf.contrib.framework.get_local_variables`
+*   `tf.contrib.framework.get_model_variables`
+*   `tf.contrib.framework.get_unique_variable`
+*   `tf.contrib.framework.get_variables_by_name`
+*   `tf.contrib.framework.get_variables_by_suffix`
+*   `tf.contrib.framework.get_variables_to_restore`
+*   `tf.contrib.framework.get_variables`
+*   `tf.contrib.framework.local_variable`
+*   `tf.contrib.framework.model_variable`
+*   `tf.contrib.framework.variable`
+*   `tf.contrib.framework.VariableDeviceChooser`
+*   `tf.contrib.framework.zero_initializer`
 
 ## Checkpoint utilities
 
-*   @{tf.contrib.framework.load_checkpoint}
-*   @{tf.contrib.framework.list_variables}
-*   @{tf.contrib.framework.load_variable}
-*   @{tf.contrib.framework.init_from_checkpoint}
+*   `tf.contrib.framework.load_checkpoint`
+*   `tf.contrib.framework.list_variables`
+*   `tf.contrib.framework.load_variable`
+*   `tf.contrib.framework.init_from_checkpoint`
diff --git a/tensorflow/docs_src/api_guides/python/contrib.graph_editor.md b/tensorflow/docs_src/api_guides/python/contrib.graph_editor.md
index 20fe88a..8ce49b95 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.graph_editor.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.graph_editor.md
@@ -100,78 +100,78 @@
 
 ## Module: util
 
-*   @{tf.contrib.graph_editor.make_list_of_op}
-*   @{tf.contrib.graph_editor.get_tensors}
-*   @{tf.contrib.graph_editor.make_list_of_t}
-*   @{tf.contrib.graph_editor.get_generating_ops}
-*   @{tf.contrib.graph_editor.get_consuming_ops}
-*   @{tf.contrib.graph_editor.ControlOutputs}
-*   @{tf.contrib.graph_editor.placeholder_name}
-*   @{tf.contrib.graph_editor.make_placeholder_from_tensor}
-*   @{tf.contrib.graph_editor.make_placeholder_from_dtype_and_shape}
+*   `tf.contrib.graph_editor.make_list_of_op`
+*   `tf.contrib.graph_editor.get_tensors`
+*   `tf.contrib.graph_editor.make_list_of_t`
+*   `tf.contrib.graph_editor.get_generating_ops`
+*   `tf.contrib.graph_editor.get_consuming_ops`
+*   `tf.contrib.graph_editor.ControlOutputs`
+*   `tf.contrib.graph_editor.placeholder_name`
+*   `tf.contrib.graph_editor.make_placeholder_from_tensor`
+*   `tf.contrib.graph_editor.make_placeholder_from_dtype_and_shape`
 
 ## Module: select
 
-*   @{tf.contrib.graph_editor.filter_ts}
-*   @{tf.contrib.graph_editor.filter_ts_from_regex}
-*   @{tf.contrib.graph_editor.filter_ops}
-*   @{tf.contrib.graph_editor.filter_ops_from_regex}
-*   @{tf.contrib.graph_editor.get_name_scope_ops}
-*   @{tf.contrib.graph_editor.check_cios}
-*   @{tf.contrib.graph_editor.get_ops_ios}
-*   @{tf.contrib.graph_editor.compute_boundary_ts}
-*   @{tf.contrib.graph_editor.get_within_boundary_ops}
-*   @{tf.contrib.graph_editor.get_forward_walk_ops}
-*   @{tf.contrib.graph_editor.get_backward_walk_ops}
-*   @{tf.contrib.graph_editor.get_walks_intersection_ops}
-*   @{tf.contrib.graph_editor.get_walks_union_ops}
-*   @{tf.contrib.graph_editor.select_ops}
-*   @{tf.contrib.graph_editor.select_ts}
-*   @{tf.contrib.graph_editor.select_ops_and_ts}
+*   `tf.contrib.graph_editor.filter_ts`
+*   `tf.contrib.graph_editor.filter_ts_from_regex`
+*   `tf.contrib.graph_editor.filter_ops`
+*   `tf.contrib.graph_editor.filter_ops_from_regex`
+*   `tf.contrib.graph_editor.get_name_scope_ops`
+*   `tf.contrib.graph_editor.check_cios`
+*   `tf.contrib.graph_editor.get_ops_ios`
+*   `tf.contrib.graph_editor.compute_boundary_ts`
+*   `tf.contrib.graph_editor.get_within_boundary_ops`
+*   `tf.contrib.graph_editor.get_forward_walk_ops`
+*   `tf.contrib.graph_editor.get_backward_walk_ops`
+*   `tf.contrib.graph_editor.get_walks_intersection_ops`
+*   `tf.contrib.graph_editor.get_walks_union_ops`
+*   `tf.contrib.graph_editor.select_ops`
+*   `tf.contrib.graph_editor.select_ts`
+*   `tf.contrib.graph_editor.select_ops_and_ts`
 
 ## Module: subgraph
 
-*   @{tf.contrib.graph_editor.SubGraphView}
-*   @{tf.contrib.graph_editor.make_view}
-*   @{tf.contrib.graph_editor.make_view_from_scope}
+*   `tf.contrib.graph_editor.SubGraphView`
+*   `tf.contrib.graph_editor.make_view`
+*   `tf.contrib.graph_editor.make_view_from_scope`
 
 ## Module: reroute
 
-*   @{tf.contrib.graph_editor.swap_ts}
-*   @{tf.contrib.graph_editor.reroute_ts}
-*   @{tf.contrib.graph_editor.swap_inputs}
-*   @{tf.contrib.graph_editor.reroute_inputs}
-*   @{tf.contrib.graph_editor.swap_outputs}
-*   @{tf.contrib.graph_editor.reroute_outputs}
-*   @{tf.contrib.graph_editor.swap_ios}
-*   @{tf.contrib.graph_editor.reroute_ios}
-*   @{tf.contrib.graph_editor.remove_control_inputs}
-*   @{tf.contrib.graph_editor.add_control_inputs}
+*   `tf.contrib.graph_editor.swap_ts`
+*   `tf.contrib.graph_editor.reroute_ts`
+*   `tf.contrib.graph_editor.swap_inputs`
+*   `tf.contrib.graph_editor.reroute_inputs`
+*   `tf.contrib.graph_editor.swap_outputs`
+*   `tf.contrib.graph_editor.reroute_outputs`
+*   `tf.contrib.graph_editor.swap_ios`
+*   `tf.contrib.graph_editor.reroute_ios`
+*   `tf.contrib.graph_editor.remove_control_inputs`
+*   `tf.contrib.graph_editor.add_control_inputs`
 
 ## Module: edit
 
-*   @{tf.contrib.graph_editor.detach_control_inputs}
-*   @{tf.contrib.graph_editor.detach_control_outputs}
-*   @{tf.contrib.graph_editor.detach_inputs}
-*   @{tf.contrib.graph_editor.detach_outputs}
-*   @{tf.contrib.graph_editor.detach}
-*   @{tf.contrib.graph_editor.connect}
-*   @{tf.contrib.graph_editor.bypass}
+*   `tf.contrib.graph_editor.detach_control_inputs`
+*   `tf.contrib.graph_editor.detach_control_outputs`
+*   `tf.contrib.graph_editor.detach_inputs`
+*   `tf.contrib.graph_editor.detach_outputs`
+*   `tf.contrib.graph_editor.detach`
+*   `tf.contrib.graph_editor.connect`
+*   `tf.contrib.graph_editor.bypass`
 
 ## Module: transform
 
-*   @{tf.contrib.graph_editor.replace_t_with_placeholder_handler}
-*   @{tf.contrib.graph_editor.keep_t_if_possible_handler}
-*   @{tf.contrib.graph_editor.assign_renamed_collections_handler}
-*   @{tf.contrib.graph_editor.transform_op_if_inside_handler}
-*   @{tf.contrib.graph_editor.copy_op_handler}
-*   @{tf.contrib.graph_editor.Transformer}
-*   @{tf.contrib.graph_editor.copy}
-*   @{tf.contrib.graph_editor.copy_with_input_replacements}
-*   @{tf.contrib.graph_editor.graph_replace}
+*   `tf.contrib.graph_editor.replace_t_with_placeholder_handler`
+*   `tf.contrib.graph_editor.keep_t_if_possible_handler`
+*   `tf.contrib.graph_editor.assign_renamed_collections_handler`
+*   `tf.contrib.graph_editor.transform_op_if_inside_handler`
+*   `tf.contrib.graph_editor.copy_op_handler`
+*   `tf.contrib.graph_editor.Transformer`
+*   `tf.contrib.graph_editor.copy`
+*   `tf.contrib.graph_editor.copy_with_input_replacements`
+*   `tf.contrib.graph_editor.graph_replace`
 
 ## Useful aliases
 
-*   @{tf.contrib.graph_editor.ph}
-*   @{tf.contrib.graph_editor.sgv}
-*   @{tf.contrib.graph_editor.sgv_scope}
+*   `tf.contrib.graph_editor.ph`
+*   `tf.contrib.graph_editor.sgv`
+*   `tf.contrib.graph_editor.sgv_scope`
diff --git a/tensorflow/docs_src/api_guides/python/contrib.integrate.md b/tensorflow/docs_src/api_guides/python/contrib.integrate.md
index e95b5a2..a70d202 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.integrate.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.integrate.md
@@ -38,4 +38,4 @@
 
 ## Ops
 
-*   @{tf.contrib.integrate.odeint}
+*   `tf.contrib.integrate.odeint`
diff --git a/tensorflow/docs_src/api_guides/python/contrib.layers.md b/tensorflow/docs_src/api_guides/python/contrib.layers.md
index b85db4b..4c176a1 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.layers.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.layers.md
@@ -9,29 +9,29 @@
 used internally in a consistent way and provide the building blocks for many
 common machine learning algorithms.
 
-*   @{tf.contrib.layers.avg_pool2d}
-*   @{tf.contrib.layers.batch_norm}
-*   @{tf.contrib.layers.convolution2d}
-*   @{tf.contrib.layers.conv2d_in_plane}
-*   @{tf.contrib.layers.convolution2d_in_plane}
-*   @{tf.nn.conv2d_transpose}
-*   @{tf.contrib.layers.convolution2d_transpose}
-*   @{tf.nn.dropout}
-*   @{tf.contrib.layers.flatten}
-*   @{tf.contrib.layers.fully_connected}
-*   @{tf.contrib.layers.layer_norm}
-*   @{tf.contrib.layers.max_pool2d}
-*   @{tf.contrib.layers.one_hot_encoding}
-*   @{tf.nn.relu}
-*   @{tf.nn.relu6}
-*   @{tf.contrib.layers.repeat}
-*   @{tf.contrib.layers.safe_embedding_lookup_sparse}
-*   @{tf.nn.separable_conv2d}
-*   @{tf.contrib.layers.separable_convolution2d}
-*   @{tf.nn.softmax}
-*   @{tf.stack}
-*   @{tf.contrib.layers.unit_norm}
-*   @{tf.contrib.layers.embed_sequence}
+*   `tf.contrib.layers.avg_pool2d`
+*   `tf.contrib.layers.batch_norm`
+*   `tf.contrib.layers.convolution2d`
+*   `tf.contrib.layers.conv2d_in_plane`
+*   `tf.contrib.layers.convolution2d_in_plane`
+*   `tf.nn.conv2d_transpose`
+*   `tf.contrib.layers.convolution2d_transpose`
+*   `tf.nn.dropout`
+*   `tf.contrib.layers.flatten`
+*   `tf.contrib.layers.fully_connected`
+*   `tf.contrib.layers.layer_norm`
+*   `tf.contrib.layers.max_pool2d`
+*   `tf.contrib.layers.one_hot_encoding`
+*   `tf.nn.relu`
+*   `tf.nn.relu6`
+*   `tf.contrib.layers.repeat`
+*   `tf.contrib.layers.safe_embedding_lookup_sparse`
+*   `tf.nn.separable_conv2d`
+*   `tf.contrib.layers.separable_convolution2d`
+*   `tf.nn.softmax`
+*   `tf.stack`
+*   `tf.contrib.layers.unit_norm`
+*   `tf.contrib.layers.embed_sequence`
 
 Aliases for fully_connected which set a default activation function are
 available: `relu`, `relu6` and `linear`.
@@ -45,65 +45,65 @@
 `fn(weights)`. The loss is typically added to
 `tf.GraphKeys.REGULARIZATION_LOSSES`.
 
-*   @{tf.contrib.layers.apply_regularization}
-*   @{tf.contrib.layers.l1_regularizer}
-*   @{tf.contrib.layers.l2_regularizer}
-*   @{tf.contrib.layers.sum_regularizer}
+*   `tf.contrib.layers.apply_regularization`
+*   `tf.contrib.layers.l1_regularizer`
+*   `tf.contrib.layers.l2_regularizer`
+*   `tf.contrib.layers.sum_regularizer`
 
 ## Initializers
 
 Initializers are used to initialize variables with sensible values given their
 size, data type, and purpose.
 
-*   @{tf.contrib.layers.xavier_initializer}
-*   @{tf.contrib.layers.xavier_initializer_conv2d}
-*   @{tf.contrib.layers.variance_scaling_initializer}
+*   `tf.contrib.layers.xavier_initializer`
+*   `tf.contrib.layers.xavier_initializer_conv2d`
+*   `tf.contrib.layers.variance_scaling_initializer`
 
 ## Optimization
 
 Optimize weights given a loss.
 
-*   @{tf.contrib.layers.optimize_loss}
+*   `tf.contrib.layers.optimize_loss`
 
 ## Summaries
 
 Helper functions to summarize specific variables or ops.
 
-*   @{tf.contrib.layers.summarize_activation}
-*   @{tf.contrib.layers.summarize_tensor}
-*   @{tf.contrib.layers.summarize_tensors}
-*   @{tf.contrib.layers.summarize_collection}
+*   `tf.contrib.layers.summarize_activation`
+*   `tf.contrib.layers.summarize_tensor`
+*   `tf.contrib.layers.summarize_tensors`
+*   `tf.contrib.layers.summarize_collection`
 
 The layers module defines convenience functions `summarize_variables`,
 `summarize_weights` and `summarize_biases`, which set the `collection` argument
 of `summarize_collection` to `VARIABLES`, `WEIGHTS` and `BIASES`, respectively.
 
-*   @{tf.contrib.layers.summarize_activations}
+*   `tf.contrib.layers.summarize_activations`
 
 ## Feature columns
 
 Feature columns provide a mechanism to map data to a model.
 
-*   @{tf.contrib.layers.bucketized_column}
-*   @{tf.contrib.layers.check_feature_columns}
-*   @{tf.contrib.layers.create_feature_spec_for_parsing}
-*   @{tf.contrib.layers.crossed_column}
-*   @{tf.contrib.layers.embedding_column}
-*   @{tf.contrib.layers.scattered_embedding_column}
-*   @{tf.contrib.layers.input_from_feature_columns}
-*   @{tf.contrib.layers.joint_weighted_sum_from_feature_columns}
-*   @{tf.contrib.layers.make_place_holder_tensors_for_base_features}
-*   @{tf.contrib.layers.multi_class_target}
-*   @{tf.contrib.layers.one_hot_column}
-*   @{tf.contrib.layers.parse_feature_columns_from_examples}
-*   @{tf.contrib.layers.parse_feature_columns_from_sequence_examples}
-*   @{tf.contrib.layers.real_valued_column}
-*   @{tf.contrib.layers.shared_embedding_columns}
-*   @{tf.contrib.layers.sparse_column_with_hash_bucket}
-*   @{tf.contrib.layers.sparse_column_with_integerized_feature}
-*   @{tf.contrib.layers.sparse_column_with_keys}
-*   @{tf.contrib.layers.sparse_column_with_vocabulary_file}
-*   @{tf.contrib.layers.weighted_sparse_column}
-*   @{tf.contrib.layers.weighted_sum_from_feature_columns}
-*   @{tf.contrib.layers.infer_real_valued_columns}
-*   @{tf.contrib.layers.sequence_input_from_feature_columns}
+*   `tf.contrib.layers.bucketized_column`
+*   `tf.contrib.layers.check_feature_columns`
+*   `tf.contrib.layers.create_feature_spec_for_parsing`
+*   `tf.contrib.layers.crossed_column`
+*   `tf.contrib.layers.embedding_column`
+*   `tf.contrib.layers.scattered_embedding_column`
+*   `tf.contrib.layers.input_from_feature_columns`
+*   `tf.contrib.layers.joint_weighted_sum_from_feature_columns`
+*   `tf.contrib.layers.make_place_holder_tensors_for_base_features`
+*   `tf.contrib.layers.multi_class_target`
+*   `tf.contrib.layers.one_hot_column`
+*   `tf.contrib.layers.parse_feature_columns_from_examples`
+*   `tf.contrib.layers.parse_feature_columns_from_sequence_examples`
+*   `tf.contrib.layers.real_valued_column`
+*   `tf.contrib.layers.shared_embedding_columns`
+*   `tf.contrib.layers.sparse_column_with_hash_bucket`
+*   `tf.contrib.layers.sparse_column_with_integerized_feature`
+*   `tf.contrib.layers.sparse_column_with_keys`
+*   `tf.contrib.layers.sparse_column_with_vocabulary_file`
+*   `tf.contrib.layers.weighted_sparse_column`
+*   `tf.contrib.layers.weighted_sum_from_feature_columns`
+*   `tf.contrib.layers.infer_real_valued_columns`
+*   `tf.contrib.layers.sequence_input_from_feature_columns`
diff --git a/tensorflow/docs_src/api_guides/python/contrib.learn.md b/tensorflow/docs_src/api_guides/python/contrib.learn.md
index 03838dc..635849e 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.learn.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.learn.md
@@ -7,57 +7,57 @@
 
 Train and evaluate TensorFlow models.
 
-*   @{tf.contrib.learn.BaseEstimator}
-*   @{tf.contrib.learn.Estimator}
-*   @{tf.contrib.learn.Trainable}
-*   @{tf.contrib.learn.Evaluable}
-*   @{tf.contrib.learn.KMeansClustering}
-*   @{tf.contrib.learn.ModeKeys}
-*   @{tf.contrib.learn.ModelFnOps}
-*   @{tf.contrib.learn.MetricSpec}
-*   @{tf.contrib.learn.PredictionKey}
-*   @{tf.contrib.learn.DNNClassifier}
-*   @{tf.contrib.learn.DNNRegressor}
-*   @{tf.contrib.learn.DNNLinearCombinedRegressor}
-*   @{tf.contrib.learn.DNNLinearCombinedClassifier}
-*   @{tf.contrib.learn.LinearClassifier}
-*   @{tf.contrib.learn.LinearRegressor}
-*   @{tf.contrib.learn.LogisticRegressor}
+*   `tf.contrib.learn.BaseEstimator`
+*   `tf.contrib.learn.Estimator`
+*   `tf.contrib.learn.Trainable`
+*   `tf.contrib.learn.Evaluable`
+*   `tf.contrib.learn.KMeansClustering`
+*   `tf.contrib.learn.ModeKeys`
+*   `tf.contrib.learn.ModelFnOps`
+*   `tf.contrib.learn.MetricSpec`
+*   `tf.contrib.learn.PredictionKey`
+*   `tf.contrib.learn.DNNClassifier`
+*   `tf.contrib.learn.DNNRegressor`
+*   `tf.contrib.learn.DNNLinearCombinedRegressor`
+*   `tf.contrib.learn.DNNLinearCombinedClassifier`
+*   `tf.contrib.learn.LinearClassifier`
+*   `tf.contrib.learn.LinearRegressor`
+*   `tf.contrib.learn.LogisticRegressor`
 
 ## Distributed training utilities
 
-*   @{tf.contrib.learn.Experiment}
-*   @{tf.contrib.learn.ExportStrategy}
-*   @{tf.contrib.learn.TaskType}
+*   `tf.contrib.learn.Experiment`
+*   `tf.contrib.learn.ExportStrategy`
+*   `tf.contrib.learn.TaskType`
 
 ## Graph actions
 
 Perform various training, evaluation, and inference actions on a graph.
 
-*   @{tf.train.NanLossDuringTrainingError}
-*   @{tf.contrib.learn.RunConfig}
-*   @{tf.contrib.learn.evaluate}
-*   @{tf.contrib.learn.infer}
-*   @{tf.contrib.learn.run_feeds}
-*   @{tf.contrib.learn.run_n}
-*   @{tf.contrib.learn.train}
+*   `tf.train.NanLossDuringTrainingError`
+*   `tf.contrib.learn.RunConfig`
+*   `tf.contrib.learn.evaluate`
+*   `tf.contrib.learn.infer`
+*   `tf.contrib.learn.run_feeds`
+*   `tf.contrib.learn.run_n`
+*   `tf.contrib.learn.train`
 
 ## Input processing
 
 Queue and read batched input data.
 
-*   @{tf.contrib.learn.extract_dask_data}
-*   @{tf.contrib.learn.extract_dask_labels}
-*   @{tf.contrib.learn.extract_pandas_data}
-*   @{tf.contrib.learn.extract_pandas_labels}
-*   @{tf.contrib.learn.extract_pandas_matrix}
-*   @{tf.contrib.learn.infer_real_valued_columns_from_input}
-*   @{tf.contrib.learn.infer_real_valued_columns_from_input_fn}
-*   @{tf.contrib.learn.read_batch_examples}
-*   @{tf.contrib.learn.read_batch_features}
-*   @{tf.contrib.learn.read_batch_record_features}
+*   `tf.contrib.learn.extract_dask_data`
+*   `tf.contrib.learn.extract_dask_labels`
+*   `tf.contrib.learn.extract_pandas_data`
+*   `tf.contrib.learn.extract_pandas_labels`
+*   `tf.contrib.learn.extract_pandas_matrix`
+*   `tf.contrib.learn.infer_real_valued_columns_from_input`
+*   `tf.contrib.learn.infer_real_valued_columns_from_input_fn`
+*   `tf.contrib.learn.read_batch_examples`
+*   `tf.contrib.learn.read_batch_features`
+*   `tf.contrib.learn.read_batch_record_features`
 
 Export utilities
 
-*   @{tf.contrib.learn.build_parsing_serving_input_fn}
-*   @{tf.contrib.learn.ProblemType}
+*   `tf.contrib.learn.build_parsing_serving_input_fn`
+*   `tf.contrib.learn.ProblemType`
diff --git a/tensorflow/docs_src/api_guides/python/contrib.linalg.md b/tensorflow/docs_src/api_guides/python/contrib.linalg.md
index c0cb2b1..3055449 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.linalg.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.linalg.md
@@ -14,17 +14,17 @@
 
 ### Base class
 
-*   @{tf.contrib.linalg.LinearOperator}
+*   `tf.contrib.linalg.LinearOperator`
 
 ### Individual operators
 
-*   @{tf.contrib.linalg.LinearOperatorDiag}
-*   @{tf.contrib.linalg.LinearOperatorIdentity}
-*   @{tf.contrib.linalg.LinearOperatorScaledIdentity}
-*   @{tf.contrib.linalg.LinearOperatorFullMatrix}
-*   @{tf.contrib.linalg.LinearOperatorLowerTriangular}
-*   @{tf.contrib.linalg.LinearOperatorLowRankUpdate}
+*   `tf.contrib.linalg.LinearOperatorDiag`
+*   `tf.contrib.linalg.LinearOperatorIdentity`
+*   `tf.contrib.linalg.LinearOperatorScaledIdentity`
+*   `tf.contrib.linalg.LinearOperatorFullMatrix`
+*   `tf.contrib.linalg.LinearOperatorLowerTriangular`
+*   `tf.contrib.linalg.LinearOperatorLowRankUpdate`
 
 ### Transformations and Combinations of operators
 
-*   @{tf.contrib.linalg.LinearOperatorComposition}
+*   `tf.contrib.linalg.LinearOperatorComposition`
diff --git a/tensorflow/docs_src/api_guides/python/contrib.losses.md b/tensorflow/docs_src/api_guides/python/contrib.losses.md
index 8b74422..8787454 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.losses.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.losses.md
@@ -2,7 +2,7 @@
 
 ## Deprecated
 
-This module is deprecated. Instructions for updating: Use @{tf.losses} instead.
+This module is deprecated. Instructions for updating: Use `tf.losses` instead.
 
 ## Loss operations for use in neural networks.
 
@@ -107,19 +107,19 @@
   loss = tf.contrib.losses.mean_squared_error(predictions, depths, weight)
 ```
 
-* @{tf.contrib.losses.absolute_difference}
-* @{tf.contrib.losses.add_loss}
-* @{tf.contrib.losses.hinge_loss}
-* @{tf.contrib.losses.compute_weighted_loss}
-* @{tf.contrib.losses.cosine_distance}
-* @{tf.contrib.losses.get_losses}
-* @{tf.contrib.losses.get_regularization_losses}
-* @{tf.contrib.losses.get_total_loss}
-* @{tf.contrib.losses.log_loss}
-* @{tf.contrib.losses.mean_pairwise_squared_error}
-* @{tf.contrib.losses.mean_squared_error}
-* @{tf.contrib.losses.sigmoid_cross_entropy}
-* @{tf.contrib.losses.softmax_cross_entropy}
-* @{tf.contrib.losses.sparse_softmax_cross_entropy}
+* `tf.contrib.losses.absolute_difference`
+* `tf.contrib.losses.add_loss`
+* `tf.contrib.losses.hinge_loss`
+* `tf.contrib.losses.compute_weighted_loss`
+* `tf.contrib.losses.cosine_distance`
+* `tf.contrib.losses.get_losses`
+* `tf.contrib.losses.get_regularization_losses`
+* `tf.contrib.losses.get_total_loss`
+* `tf.contrib.losses.log_loss`
+* `tf.contrib.losses.mean_pairwise_squared_error`
+* `tf.contrib.losses.mean_squared_error`
+* `tf.contrib.losses.sigmoid_cross_entropy`
+* `tf.contrib.losses.softmax_cross_entropy`
+* `tf.contrib.losses.sparse_softmax_cross_entropy`
 
 
diff --git a/tensorflow/docs_src/api_guides/python/contrib.metrics.md b/tensorflow/docs_src/api_guides/python/contrib.metrics.md
index 1eb9cf4..de6346c 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.metrics.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.metrics.md
@@ -86,48 +86,48 @@
 
 ## Metric `Ops`
 
-*   @{tf.contrib.metrics.streaming_accuracy}
-*   @{tf.contrib.metrics.streaming_mean}
-*   @{tf.contrib.metrics.streaming_recall}
-*   @{tf.contrib.metrics.streaming_recall_at_thresholds}
-*   @{tf.contrib.metrics.streaming_precision}
-*   @{tf.contrib.metrics.streaming_precision_at_thresholds}
-*   @{tf.contrib.metrics.streaming_auc}
-*   @{tf.contrib.metrics.streaming_recall_at_k}
-*   @{tf.contrib.metrics.streaming_mean_absolute_error}
-*   @{tf.contrib.metrics.streaming_mean_iou}
-*   @{tf.contrib.metrics.streaming_mean_relative_error}
-*   @{tf.contrib.metrics.streaming_mean_squared_error}
-*   @{tf.contrib.metrics.streaming_mean_tensor}
-*   @{tf.contrib.metrics.streaming_root_mean_squared_error}
-*   @{tf.contrib.metrics.streaming_covariance}
-*   @{tf.contrib.metrics.streaming_pearson_correlation}
-*   @{tf.contrib.metrics.streaming_mean_cosine_distance}
-*   @{tf.contrib.metrics.streaming_percentage_less}
-*   @{tf.contrib.metrics.streaming_sensitivity_at_specificity}
-*   @{tf.contrib.metrics.streaming_sparse_average_precision_at_k}
-*   @{tf.contrib.metrics.streaming_sparse_precision_at_k}
-*   @{tf.contrib.metrics.streaming_sparse_precision_at_top_k}
-*   @{tf.contrib.metrics.streaming_sparse_recall_at_k}
-*   @{tf.contrib.metrics.streaming_specificity_at_sensitivity}
-*   @{tf.contrib.metrics.streaming_concat}
-*   @{tf.contrib.metrics.streaming_false_negatives}
-*   @{tf.contrib.metrics.streaming_false_negatives_at_thresholds}
-*   @{tf.contrib.metrics.streaming_false_positives}
-*   @{tf.contrib.metrics.streaming_false_positives_at_thresholds}
-*   @{tf.contrib.metrics.streaming_true_negatives}
-*   @{tf.contrib.metrics.streaming_true_negatives_at_thresholds}
-*   @{tf.contrib.metrics.streaming_true_positives}
-*   @{tf.contrib.metrics.streaming_true_positives_at_thresholds}
-*   @{tf.contrib.metrics.auc_using_histogram}
-*   @{tf.contrib.metrics.accuracy}
-*   @{tf.contrib.metrics.aggregate_metrics}
-*   @{tf.contrib.metrics.aggregate_metric_map}
-*   @{tf.contrib.metrics.confusion_matrix}
+*   `tf.contrib.metrics.streaming_accuracy`
+*   `tf.contrib.metrics.streaming_mean`
+*   `tf.contrib.metrics.streaming_recall`
+*   `tf.contrib.metrics.streaming_recall_at_thresholds`
+*   `tf.contrib.metrics.streaming_precision`
+*   `tf.contrib.metrics.streaming_precision_at_thresholds`
+*   `tf.contrib.metrics.streaming_auc`
+*   `tf.contrib.metrics.streaming_recall_at_k`
+*   `tf.contrib.metrics.streaming_mean_absolute_error`
+*   `tf.contrib.metrics.streaming_mean_iou`
+*   `tf.contrib.metrics.streaming_mean_relative_error`
+*   `tf.contrib.metrics.streaming_mean_squared_error`
+*   `tf.contrib.metrics.streaming_mean_tensor`
+*   `tf.contrib.metrics.streaming_root_mean_squared_error`
+*   `tf.contrib.metrics.streaming_covariance`
+*   `tf.contrib.metrics.streaming_pearson_correlation`
+*   `tf.contrib.metrics.streaming_mean_cosine_distance`
+*   `tf.contrib.metrics.streaming_percentage_less`
+*   `tf.contrib.metrics.streaming_sensitivity_at_specificity`
+*   `tf.contrib.metrics.streaming_sparse_average_precision_at_k`
+*   `tf.contrib.metrics.streaming_sparse_precision_at_k`
+*   `tf.contrib.metrics.streaming_sparse_precision_at_top_k`
+*   `tf.contrib.metrics.streaming_sparse_recall_at_k`
+*   `tf.contrib.metrics.streaming_specificity_at_sensitivity`
+*   `tf.contrib.metrics.streaming_concat`
+*   `tf.contrib.metrics.streaming_false_negatives`
+*   `tf.contrib.metrics.streaming_false_negatives_at_thresholds`
+*   `tf.contrib.metrics.streaming_false_positives`
+*   `tf.contrib.metrics.streaming_false_positives_at_thresholds`
+*   `tf.contrib.metrics.streaming_true_negatives`
+*   `tf.contrib.metrics.streaming_true_negatives_at_thresholds`
+*   `tf.contrib.metrics.streaming_true_positives`
+*   `tf.contrib.metrics.streaming_true_positives_at_thresholds`
+*   `tf.contrib.metrics.auc_using_histogram`
+*   `tf.contrib.metrics.accuracy`
+*   `tf.contrib.metrics.aggregate_metrics`
+*   `tf.contrib.metrics.aggregate_metric_map`
+*   `tf.contrib.metrics.confusion_matrix`
 
 ## Set `Ops`
 
-*   @{tf.contrib.metrics.set_difference}
-*   @{tf.contrib.metrics.set_intersection}
-*   @{tf.contrib.metrics.set_size}
-*   @{tf.contrib.metrics.set_union}
+*   `tf.contrib.metrics.set_difference`
+*   `tf.contrib.metrics.set_intersection`
+*   `tf.contrib.metrics.set_size`
+*   `tf.contrib.metrics.set_union`
diff --git a/tensorflow/docs_src/api_guides/python/contrib.rnn.md b/tensorflow/docs_src/api_guides/python/contrib.rnn.md
index d089b06..d265ab6 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.rnn.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.rnn.md
@@ -5,49 +5,49 @@
 
 ## Base interface for all RNN Cells
 
-*   @{tf.contrib.rnn.RNNCell}
+*   `tf.contrib.rnn.RNNCell`
 
 ## Core RNN Cells for use with TensorFlow's core RNN methods
 
-*   @{tf.contrib.rnn.BasicRNNCell}
-*   @{tf.contrib.rnn.BasicLSTMCell}
-*   @{tf.contrib.rnn.GRUCell}
-*   @{tf.contrib.rnn.LSTMCell}
-*   @{tf.contrib.rnn.LayerNormBasicLSTMCell}
+*   `tf.contrib.rnn.BasicRNNCell`
+*   `tf.contrib.rnn.BasicLSTMCell`
+*   `tf.contrib.rnn.GRUCell`
+*   `tf.contrib.rnn.LSTMCell`
+*   `tf.contrib.rnn.LayerNormBasicLSTMCell`
 
 ## Classes storing split `RNNCell` state
 
-*   @{tf.contrib.rnn.LSTMStateTuple}
+*   `tf.contrib.rnn.LSTMStateTuple`
 
 ## Core RNN Cell wrappers (RNNCells that wrap other RNNCells)
 
-*   @{tf.contrib.rnn.MultiRNNCell}
-*   @{tf.contrib.rnn.LSTMBlockWrapper}
-*   @{tf.contrib.rnn.DropoutWrapper}
-*   @{tf.contrib.rnn.EmbeddingWrapper}
-*   @{tf.contrib.rnn.InputProjectionWrapper}
-*   @{tf.contrib.rnn.OutputProjectionWrapper}
-*   @{tf.contrib.rnn.DeviceWrapper}
-*   @{tf.contrib.rnn.ResidualWrapper}
+*   `tf.contrib.rnn.MultiRNNCell`
+*   `tf.contrib.rnn.LSTMBlockWrapper`
+*   `tf.contrib.rnn.DropoutWrapper`
+*   `tf.contrib.rnn.EmbeddingWrapper`
+*   `tf.contrib.rnn.InputProjectionWrapper`
+*   `tf.contrib.rnn.OutputProjectionWrapper`
+*   `tf.contrib.rnn.DeviceWrapper`
+*   `tf.contrib.rnn.ResidualWrapper`
 
 ### Block RNNCells
-*   @{tf.contrib.rnn.LSTMBlockCell}
-*   @{tf.contrib.rnn.GRUBlockCell}
+*   `tf.contrib.rnn.LSTMBlockCell`
+*   `tf.contrib.rnn.GRUBlockCell`
 
 ### Fused RNNCells
-*   @{tf.contrib.rnn.FusedRNNCell}
-*   @{tf.contrib.rnn.FusedRNNCellAdaptor}
-*   @{tf.contrib.rnn.TimeReversedFusedRNN}
-*   @{tf.contrib.rnn.LSTMBlockFusedCell}
+*   `tf.contrib.rnn.FusedRNNCell`
+*   `tf.contrib.rnn.FusedRNNCellAdaptor`
+*   `tf.contrib.rnn.TimeReversedFusedRNN`
+*   `tf.contrib.rnn.LSTMBlockFusedCell`
 
 ### LSTM-like cells
-*   @{tf.contrib.rnn.CoupledInputForgetGateLSTMCell}
-*   @{tf.contrib.rnn.TimeFreqLSTMCell}
-*   @{tf.contrib.rnn.GridLSTMCell}
+*   `tf.contrib.rnn.CoupledInputForgetGateLSTMCell`
+*   `tf.contrib.rnn.TimeFreqLSTMCell`
+*   `tf.contrib.rnn.GridLSTMCell`
 
 ### RNNCell wrappers
-*   @{tf.contrib.rnn.AttentionCellWrapper}
-*   @{tf.contrib.rnn.CompiledWrapper}
+*   `tf.contrib.rnn.AttentionCellWrapper`
+*   `tf.contrib.rnn.CompiledWrapper`
 
 
 ## Recurrent Neural Networks
@@ -55,7 +55,7 @@
 TensorFlow provides a number of methods for constructing Recurrent Neural
 Networks.
 
-*   @{tf.contrib.rnn.static_rnn}
-*   @{tf.contrib.rnn.static_state_saving_rnn}
-*   @{tf.contrib.rnn.static_bidirectional_rnn}
-*   @{tf.contrib.rnn.stack_bidirectional_dynamic_rnn}
+*   `tf.contrib.rnn.static_rnn`
+*   `tf.contrib.rnn.static_state_saving_rnn`
+*   `tf.contrib.rnn.static_bidirectional_rnn`
+*   `tf.contrib.rnn.stack_bidirectional_dynamic_rnn`
diff --git a/tensorflow/docs_src/api_guides/python/contrib.seq2seq.md b/tensorflow/docs_src/api_guides/python/contrib.seq2seq.md
index 143919f..54f2faf 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.seq2seq.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.seq2seq.md
@@ -2,18 +2,18 @@
 [TOC]
 
 Module for constructing seq2seq models and dynamic decoding.  Builds on top of
-libraries in @{tf.contrib.rnn}.
+libraries in `tf.contrib.rnn`.
 
 This library is composed of two primary components:
 
-*   New attention wrappers for @{tf.contrib.rnn.RNNCell} objects.
+*   New attention wrappers for `tf.contrib.rnn.RNNCell` objects.
 *   A new object-oriented dynamic decoding framework.
 
 ## Attention
 
 Attention wrappers are `RNNCell` objects that wrap other `RNNCell` objects and
 implement attention.  The form of attention is determined by a subclass of
-@{tf.contrib.seq2seq.AttentionMechanism}.  These subclasses describe the form
+`tf.contrib.seq2seq.AttentionMechanism`.  These subclasses describe the form
 of attention (e.g. additive vs. multiplicative) to use when creating the
 wrapper.  An instance of an `AttentionMechanism` is constructed with a
 `memory` tensor, from which lookup keys and values tensors are created.
@@ -22,9 +22,9 @@
 
 The two basic attention mechanisms are:
 
-*   @{tf.contrib.seq2seq.BahdanauAttention} (additive attention,
+*   `tf.contrib.seq2seq.BahdanauAttention` (additive attention,
     [ref.](https://arxiv.org/abs/1409.0473))
-*   @{tf.contrib.seq2seq.LuongAttention} (multiplicative attention,
+*   `tf.contrib.seq2seq.LuongAttention` (multiplicative attention,
     [ref.](https://arxiv.org/abs/1508.04025))
 
 The `memory` tensor passed the attention mechanism's constructor is expected to
@@ -41,7 +41,7 @@
 
 ### Attention Wrappers
 
-The basic attention wrapper is @{tf.contrib.seq2seq.AttentionWrapper}.
+The basic attention wrapper is `tf.contrib.seq2seq.AttentionWrapper`.
 This wrapper accepts an `RNNCell` instance, an instance of `AttentionMechanism`,
 and an attention depth parameter (`attention_size`); as well as several
 optional arguments that allow one to customize intermediate calculations.
@@ -120,19 +120,19 @@
 
 ### Decoder base class and functions
 
-*   @{tf.contrib.seq2seq.Decoder}
-*   @{tf.contrib.seq2seq.dynamic_decode}
+*   `tf.contrib.seq2seq.Decoder`
+*   `tf.contrib.seq2seq.dynamic_decode`
 
 ### Basic Decoder
 
-*   @{tf.contrib.seq2seq.BasicDecoderOutput}
-*   @{tf.contrib.seq2seq.BasicDecoder}
+*   `tf.contrib.seq2seq.BasicDecoderOutput`
+*   `tf.contrib.seq2seq.BasicDecoder`
 
 ### Decoder Helpers
 
-*   @{tf.contrib.seq2seq.Helper}
-*   @{tf.contrib.seq2seq.CustomHelper}
-*   @{tf.contrib.seq2seq.GreedyEmbeddingHelper}
-*   @{tf.contrib.seq2seq.ScheduledEmbeddingTrainingHelper}
-*   @{tf.contrib.seq2seq.ScheduledOutputTrainingHelper}
-*   @{tf.contrib.seq2seq.TrainingHelper}
+*   `tf.contrib.seq2seq.Helper`
+*   `tf.contrib.seq2seq.CustomHelper`
+*   `tf.contrib.seq2seq.GreedyEmbeddingHelper`
+*   `tf.contrib.seq2seq.ScheduledEmbeddingTrainingHelper`
+*   `tf.contrib.seq2seq.ScheduledOutputTrainingHelper`
+*   `tf.contrib.seq2seq.TrainingHelper`
diff --git a/tensorflow/docs_src/api_guides/python/contrib.signal.md b/tensorflow/docs_src/api_guides/python/contrib.signal.md
index 0f7690f..66df561 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.signal.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.signal.md
@@ -1,7 +1,7 @@
 # Signal Processing (contrib)
 [TOC]
 
-@{tf.contrib.signal} is a module for signal processing primitives. All
+`tf.contrib.signal` is a module for signal processing primitives. All
 operations have GPU support and are differentiable. This module is especially
 helpful for building TensorFlow models that process or generate audio, though
 the techniques are useful in many domains.
@@ -10,7 +10,7 @@
 
 When dealing with variable length signals (e.g. audio) it is common to "frame"
 them into multiple fixed length windows. These windows can overlap if the 'step'
-of the frame is less than the frame length. @{tf.contrib.signal.frame} does
+of the frame is less than the frame length. `tf.contrib.signal.frame` does
 exactly this. For example:
 
 ```python
@@ -24,7 +24,7 @@
 frames = tf.contrib.signal.frame(signals, frame_length=128, frame_step=32)
 ```
 
-The `axis` parameter to @{tf.contrib.signal.frame} allows you to frame tensors
+The `axis` parameter to `tf.contrib.signal.frame` allows you to frame tensors
 with inner structure (e.g. a spectrogram):
 
 ```python
@@ -42,7 +42,7 @@
 
 ## Reconstructing framed sequences and applying a tapering window
 
-@{tf.contrib.signal.overlap_and_add} can be used to reconstruct a signal from a
+`tf.contrib.signal.overlap_and_add` can be used to reconstruct a signal from a
 framed representation. For example, the following code reconstructs the signal
 produced in the preceding example:
 
@@ -58,7 +58,7 @@
 window function satisfies the Constant Overlap-Add (COLA) property for the given
 frame step, then it will recover the original `signals`.
 
-@{tf.contrib.signal.hamming_window} and @{tf.contrib.signal.hann_window} both
+`tf.contrib.signal.hamming_window` and `tf.contrib.signal.hann_window` both
 satisfy the COLA property for a 75% overlap.
 
 ```python
@@ -74,7 +74,7 @@
 A spectrogram is a time-frequency decomposition of a signal that indicates its
 frequency content over time. The most common approach to computing spectrograms
 is to take the magnitude of the [Short-time Fourier Transform][stft] (STFT),
-which @{tf.contrib.signal.stft} can compute as follows:
+which `tf.contrib.signal.stft` can compute as follows:
 
 ```python
 # A batch of float32 time-domain signals in the range [-1, 1] with shape
@@ -121,7 +121,7 @@
 common reweighting of the frequency dimension, which results in a
 lower-dimensional and more perceptually-relevant representation of the audio.
 
-@{tf.contrib.signal.linear_to_mel_weight_matrix} produces a matrix you can use
+`tf.contrib.signal.linear_to_mel_weight_matrix` produces a matrix you can use
 to convert a spectrogram to the mel scale.
 
 ```python
@@ -156,7 +156,7 @@
 
 ## Computing Mel-Frequency Cepstral Coefficients (MFCCs)
 
-Call @{tf.contrib.signal.mfccs_from_log_mel_spectrograms} to compute
+Call `tf.contrib.signal.mfccs_from_log_mel_spectrograms` to compute
 [MFCCs][mfcc] from log-magnitude, mel-scale spectrograms (as computed in the
 preceding example):
 
diff --git a/tensorflow/docs_src/api_guides/python/contrib.staging.md b/tensorflow/docs_src/api_guides/python/contrib.staging.md
index b0ac548..de143a7 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.staging.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.staging.md
@@ -3,4 +3,4 @@
 
 This library contains utilities for adding pipelining to a model.
 
-*   @{tf.contrib.staging.StagingArea}
+*   `tf.contrib.staging.StagingArea`
diff --git a/tensorflow/docs_src/api_guides/python/contrib.training.md b/tensorflow/docs_src/api_guides/python/contrib.training.md
index 87395d9..068efdc 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.training.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.training.md
@@ -5,46 +5,46 @@
 
 ## Splitting sequence inputs into minibatches with state saving
 
-Use @{tf.contrib.training.SequenceQueueingStateSaver} or
-its wrapper @{tf.contrib.training.batch_sequences_with_states} if
+Use `tf.contrib.training.SequenceQueueingStateSaver` or
+its wrapper `tf.contrib.training.batch_sequences_with_states` if
 you have input data with a dynamic primary time / frame count axis which
 you'd like to convert into fixed size segments during minibatching, and would
 like to store state in the forward direction across segments of an example.
 
-*   @{tf.contrib.training.batch_sequences_with_states}
-*   @{tf.contrib.training.NextQueuedSequenceBatch}
-*   @{tf.contrib.training.SequenceQueueingStateSaver}
+*   `tf.contrib.training.batch_sequences_with_states`
+*   `tf.contrib.training.NextQueuedSequenceBatch`
+*   `tf.contrib.training.SequenceQueueingStateSaver`
 
 
 ## Online data resampling
 
 To resample data with replacement on a per-example basis, use
-@{tf.contrib.training.rejection_sample} or
-@{tf.contrib.training.resample_at_rate}. For `rejection_sample`, provide
+`tf.contrib.training.rejection_sample` or
+`tf.contrib.training.resample_at_rate`. For `rejection_sample`, provide
 a boolean Tensor describing whether to accept or reject. Resulting batch sizes
 are always the same. For `resample_at_rate`, provide the desired rate for each
 example. Resulting batch sizes may vary. If you wish to specify relative
-rates, rather than absolute ones, use @{tf.contrib.training.weighted_resample}
+rates, rather than absolute ones, use `tf.contrib.training.weighted_resample`
 (which also returns the actual resampling rate used for each output example).
 
-Use @{tf.contrib.training.stratified_sample} to resample without replacement
+Use `tf.contrib.training.stratified_sample` to resample without replacement
 from the data to achieve a desired mix of class proportions that the Tensorflow
 graph sees. For instance, if you have a binary classification dataset that is
 99.9% class 1, a common approach is to resample from the data so that the data
 is more balanced.
 
-*   @{tf.contrib.training.rejection_sample}
-*   @{tf.contrib.training.resample_at_rate}
-*   @{tf.contrib.training.stratified_sample}
-*   @{tf.contrib.training.weighted_resample}
+*   `tf.contrib.training.rejection_sample`
+*   `tf.contrib.training.resample_at_rate`
+*   `tf.contrib.training.stratified_sample`
+*   `tf.contrib.training.weighted_resample`
 
 ## Bucketing
 
-Use @{tf.contrib.training.bucket} or
-@{tf.contrib.training.bucket_by_sequence_length} to stratify
+Use `tf.contrib.training.bucket` or
+`tf.contrib.training.bucket_by_sequence_length` to stratify
 minibatches into groups ("buckets").  Use `bucket_by_sequence_length`
 with the argument `dynamic_pad=True` to receive minibatches of similarly
 sized sequences for efficient training via `dynamic_rnn`.
 
-*   @{tf.contrib.training.bucket}
-*   @{tf.contrib.training.bucket_by_sequence_length}
+*   `tf.contrib.training.bucket`
+*   `tf.contrib.training.bucket_by_sequence_length`
diff --git a/tensorflow/docs_src/api_guides/python/contrib.util.md b/tensorflow/docs_src/api_guides/python/contrib.util.md
index 6bc120d..e5fd97e 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.util.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.util.md
@@ -5,8 +5,8 @@
 
 ## Miscellaneous Utility Functions
 
-*   @{tf.contrib.util.constant_value}
-*   @{tf.contrib.util.make_tensor_proto}
-*   @{tf.contrib.util.make_ndarray}
-*   @{tf.contrib.util.ops_used_by_graph_def}
-*   @{tf.contrib.util.stripped_op_list_for_graph}
+*   `tf.contrib.util.constant_value`
+*   `tf.contrib.util.make_tensor_proto`
+*   `tf.contrib.util.make_ndarray`
+*   `tf.contrib.util.ops_used_by_graph_def`
+*   `tf.contrib.util.stripped_op_list_for_graph`
diff --git a/tensorflow/docs_src/api_guides/python/control_flow_ops.md b/tensorflow/docs_src/api_guides/python/control_flow_ops.md
index 68ea96d..42c86d9 100644
--- a/tensorflow/docs_src/api_guides/python/control_flow_ops.md
+++ b/tensorflow/docs_src/api_guides/python/control_flow_ops.md
@@ -1,7 +1,7 @@
 # Control Flow
 
 Note: Functions taking `Tensor` arguments can also take anything accepted by
-@{tf.convert_to_tensor}.
+`tf.convert_to_tensor`.
 
 [TOC]
 
@@ -10,48 +10,48 @@
 TensorFlow provides several operations and classes that you can use to control
 the execution of operations and add conditional dependencies to your graph.
 
-*   @{tf.identity}
-*   @{tf.tuple}
-*   @{tf.group}
-*   @{tf.no_op}
-*   @{tf.count_up_to}
-*   @{tf.cond}
-*   @{tf.case}
-*   @{tf.while_loop}
+*   `tf.identity`
+*   `tf.tuple`
+*   `tf.group`
+*   `tf.no_op`
+*   `tf.count_up_to`
+*   `tf.cond`
+*   `tf.case`
+*   `tf.while_loop`
 
 ## Logical Operators
 
 TensorFlow provides several operations that you can use to add logical operators
 to your graph.
 
-*   @{tf.logical_and}
-*   @{tf.logical_not}
-*   @{tf.logical_or}
-*   @{tf.logical_xor}
+*   `tf.logical_and`
+*   `tf.logical_not`
+*   `tf.logical_or`
+*   `tf.logical_xor`
 
 ## Comparison Operators
 
 TensorFlow provides several operations that you can use to add comparison
 operators to your graph.
 
-*   @{tf.equal}
-*   @{tf.not_equal}
-*   @{tf.less}
-*   @{tf.less_equal}
-*   @{tf.greater}
-*   @{tf.greater_equal}
-*   @{tf.where}
+*   `tf.equal`
+*   `tf.not_equal`
+*   `tf.less`
+*   `tf.less_equal`
+*   `tf.greater`
+*   `tf.greater_equal`
+*   `tf.where`
 
 ## Debugging Operations
 
 TensorFlow provides several operations that you can use to validate values and
 debug your graph.
 
-*   @{tf.is_finite}
-*   @{tf.is_inf}
-*   @{tf.is_nan}
-*   @{tf.verify_tensor_all_finite}
-*   @{tf.check_numerics}
-*   @{tf.add_check_numerics_ops}
-*   @{tf.Assert}
-*   @{tf.Print}
+*   `tf.is_finite`
+*   `tf.is_inf`
+*   `tf.is_nan`
+*   `tf.verify_tensor_all_finite`
+*   `tf.check_numerics`
+*   `tf.add_check_numerics_ops`
+*   `tf.Assert`
+*   `tf.Print`
diff --git a/tensorflow/docs_src/api_guides/python/framework.md b/tensorflow/docs_src/api_guides/python/framework.md
index 42c3e57..40a6c07 100644
--- a/tensorflow/docs_src/api_guides/python/framework.md
+++ b/tensorflow/docs_src/api_guides/python/framework.md
@@ -5,47 +5,47 @@
 
 ## Core graph data structures
 
-*   @{tf.Graph}
-*   @{tf.Operation}
-*   @{tf.Tensor}
+*   `tf.Graph`
+*   `tf.Operation`
+*   `tf.Tensor`
 
 ## Tensor types
 
-*   @{tf.DType}
-*   @{tf.as_dtype}
+*   `tf.DType`
+*   `tf.as_dtype`
 
 ## Utility functions
 
-*   @{tf.device}
-*   @{tf.container}
-*   @{tf.name_scope}
-*   @{tf.control_dependencies}
-*   @{tf.convert_to_tensor}
-*   @{tf.convert_to_tensor_or_indexed_slices}
-*   @{tf.convert_to_tensor_or_sparse_tensor}
-*   @{tf.get_default_graph}
-*   @{tf.reset_default_graph}
-*   @{tf.import_graph_def}
-*   @{tf.load_file_system_library}
-*   @{tf.load_op_library}
+*   `tf.device`
+*   `tf.container`
+*   `tf.name_scope`
+*   `tf.control_dependencies`
+*   `tf.convert_to_tensor`
+*   `tf.convert_to_tensor_or_indexed_slices`
+*   `tf.convert_to_tensor_or_sparse_tensor`
+*   `tf.get_default_graph`
+*   `tf.reset_default_graph`
+*   `tf.import_graph_def`
+*   `tf.load_file_system_library`
+*   `tf.load_op_library`
 
 ## Graph collections
 
-*   @{tf.add_to_collection}
-*   @{tf.get_collection}
-*   @{tf.get_collection_ref}
-*   @{tf.GraphKeys}
+*   `tf.add_to_collection`
+*   `tf.get_collection`
+*   `tf.get_collection_ref`
+*   `tf.GraphKeys`
 
 ## Defining new operations
 
-*   @{tf.RegisterGradient}
-*   @{tf.NotDifferentiable}
-*   @{tf.NoGradient}
-*   @{tf.TensorShape}
-*   @{tf.Dimension}
-*   @{tf.op_scope}
-*   @{tf.get_seed}
+*   `tf.RegisterGradient`
+*   `tf.NotDifferentiable`
+*   `tf.NoGradient`
+*   `tf.TensorShape`
+*   `tf.Dimension`
+*   `tf.op_scope`
+*   `tf.get_seed`
 
 ## For libraries building on TensorFlow
 
-*   @{tf.register_tensor_conversion_function}
+*   `tf.register_tensor_conversion_function`
diff --git a/tensorflow/docs_src/api_guides/python/functional_ops.md b/tensorflow/docs_src/api_guides/python/functional_ops.md
index 9fd4606..0a9fe02 100644
--- a/tensorflow/docs_src/api_guides/python/functional_ops.md
+++ b/tensorflow/docs_src/api_guides/python/functional_ops.md
@@ -1,7 +1,7 @@
 # Higher Order Functions
 
 Note: Functions taking `Tensor` arguments can also take anything accepted by
-@{tf.convert_to_tensor}.
+`tf.convert_to_tensor`.
 
 [TOC]
 
@@ -12,7 +12,7 @@
 TensorFlow provides several higher order operators to simplify the common
 map-reduce programming patterns.
 
-*   @{tf.map_fn}
-*   @{tf.foldl}
-*   @{tf.foldr}
-*   @{tf.scan}
+*   `tf.map_fn`
+*   `tf.foldl`
+*   `tf.foldr`
+*   `tf.scan`
diff --git a/tensorflow/docs_src/api_guides/python/image.md b/tensorflow/docs_src/api_guides/python/image.md
index 051e454..c51b92d 100644
--- a/tensorflow/docs_src/api_guides/python/image.md
+++ b/tensorflow/docs_src/api_guides/python/image.md
@@ -1,7 +1,7 @@
 # Images
 
 Note: Functions taking `Tensor` arguments can also take anything accepted by
-@{tf.convert_to_tensor}.
+`tf.convert_to_tensor`.
 
 [TOC]
 
@@ -19,27 +19,27 @@
 presently only support RGB, HSV, and GrayScale. Presently, the alpha channel has
 to be stripped from the image and re-attached using slicing ops.
 
-*   @{tf.image.decode_bmp}
-*   @{tf.image.decode_gif}
-*   @{tf.image.decode_jpeg}
-*   @{tf.image.encode_jpeg}
-*   @{tf.image.decode_png}
-*   @{tf.image.encode_png}
-*   @{tf.image.decode_image}
+*   `tf.image.decode_bmp`
+*   `tf.image.decode_gif`
+*   `tf.image.decode_jpeg`
+*   `tf.image.encode_jpeg`
+*   `tf.image.decode_png`
+*   `tf.image.encode_png`
+*   `tf.image.decode_image`
 
 ## Resizing
 
 The resizing Ops accept input images as tensors of several types.  They always
 output resized images as float32 tensors.
 
-The convenience function @{tf.image.resize_images} supports both 4-D
+The convenience function `tf.image.resize_images` supports both 4-D
 and 3-D tensors as input and output.  4-D tensors are for batches of images,
 3-D tensors for individual images.
 
 Other resizing Ops only support 4-D batches of images as input:
-@{tf.image.resize_area}, @{tf.image.resize_bicubic},
-@{tf.image.resize_bilinear},
-@{tf.image.resize_nearest_neighbor}.
+`tf.image.resize_area`, `tf.image.resize_bicubic`,
+`tf.image.resize_bilinear`,
+`tf.image.resize_nearest_neighbor`.
 
 Example:
 
@@ -49,29 +49,29 @@
 resized_image = tf.image.resize_images(image, [299, 299])
 ```
 
-*   @{tf.image.resize_images}
-*   @{tf.image.resize_area}
-*   @{tf.image.resize_bicubic}
-*   @{tf.image.resize_bilinear}
-*   @{tf.image.resize_nearest_neighbor}
+*   `tf.image.resize_images`
+*   `tf.image.resize_area`
+*   `tf.image.resize_bicubic`
+*   `tf.image.resize_bilinear`
+*   `tf.image.resize_nearest_neighbor`
 
 ## Cropping
 
-*   @{tf.image.resize_image_with_crop_or_pad}
-*   @{tf.image.central_crop}
-*   @{tf.image.pad_to_bounding_box}
-*   @{tf.image.crop_to_bounding_box}
-*   @{tf.image.extract_glimpse}
-*   @{tf.image.crop_and_resize}
+*   `tf.image.resize_image_with_crop_or_pad`
+*   `tf.image.central_crop`
+*   `tf.image.pad_to_bounding_box`
+*   `tf.image.crop_to_bounding_box`
+*   `tf.image.extract_glimpse`
+*   `tf.image.crop_and_resize`
 
 ## Flipping, Rotating and Transposing
 
-*   @{tf.image.flip_up_down}
-*   @{tf.image.random_flip_up_down}
-*   @{tf.image.flip_left_right}
-*   @{tf.image.random_flip_left_right}
-*   @{tf.image.transpose_image}
-*   @{tf.image.rot90}
+*   `tf.image.flip_up_down`
+*   `tf.image.random_flip_up_down`
+*   `tf.image.flip_left_right`
+*   `tf.image.random_flip_left_right`
+*   `tf.image.transpose_image`
+*   `tf.image.rot90`
 
 ## Converting Between Colorspaces
 
@@ -94,7 +94,7 @@
 
 TensorFlow can convert between images in RGB or HSV. The conversion functions
 work only on float images, so you need to convert images in other formats using
-@{tf.image.convert_image_dtype}.
+`tf.image.convert_image_dtype`.
 
 Example:
 
@@ -105,11 +105,11 @@
 hsv_image = tf.image.rgb_to_hsv(rgb_image)
 ```
 
-*   @{tf.image.rgb_to_grayscale}
-*   @{tf.image.grayscale_to_rgb}
-*   @{tf.image.hsv_to_rgb}
-*   @{tf.image.rgb_to_hsv}
-*   @{tf.image.convert_image_dtype}
+*   `tf.image.rgb_to_grayscale`
+*   `tf.image.grayscale_to_rgb`
+*   `tf.image.hsv_to_rgb`
+*   `tf.image.rgb_to_hsv`
+*   `tf.image.convert_image_dtype`
 
 ## Image Adjustments
 
@@ -122,23 +122,23 @@
 redundant conversions by first converting the images to the most natural data
 type and representation (RGB or HSV).
 
-*   @{tf.image.adjust_brightness}
-*   @{tf.image.random_brightness}
-*   @{tf.image.adjust_contrast}
-*   @{tf.image.random_contrast}
-*   @{tf.image.adjust_hue}
-*   @{tf.image.random_hue}
-*   @{tf.image.adjust_gamma}
-*   @{tf.image.adjust_saturation}
-*   @{tf.image.random_saturation}
-*   @{tf.image.per_image_standardization}
+*   `tf.image.adjust_brightness`
+*   `tf.image.random_brightness`
+*   `tf.image.adjust_contrast`
+*   `tf.image.random_contrast`
+*   `tf.image.adjust_hue`
+*   `tf.image.random_hue`
+*   `tf.image.adjust_gamma`
+*   `tf.image.adjust_saturation`
+*   `tf.image.random_saturation`
+*   `tf.image.per_image_standardization`
 
 ## Working with Bounding Boxes
 
-*   @{tf.image.draw_bounding_boxes}
-*   @{tf.image.non_max_suppression}
-*   @{tf.image.sample_distorted_bounding_box}
+*   `tf.image.draw_bounding_boxes`
+*   `tf.image.non_max_suppression`
+*   `tf.image.sample_distorted_bounding_box`
 
 ## Denoising
 
-*   @{tf.image.total_variation}
+*   `tf.image.total_variation`
diff --git a/tensorflow/docs_src/api_guides/python/input_dataset.md b/tensorflow/docs_src/api_guides/python/input_dataset.md
index a6612d1..911a76c 100644
--- a/tensorflow/docs_src/api_guides/python/input_dataset.md
+++ b/tensorflow/docs_src/api_guides/python/input_dataset.md
@@ -1,27 +1,27 @@
 # Dataset Input Pipeline
 [TOC]
 
-@{tf.data.Dataset} allows you to build complex input pipelines. See the
-@{$guide/datasets} for an in-depth explanation of how to use this API.
+`tf.data.Dataset` allows you to build complex input pipelines. See the
+[Importing Data](../../guide/datasets.md) guide for an in-depth explanation of how to use this API.
 
 ## Reader classes
 
 Classes that create a dataset from input files.
 
-*   @{tf.data.FixedLengthRecordDataset}
-*   @{tf.data.TextLineDataset}
-*   @{tf.data.TFRecordDataset}
+*   `tf.data.FixedLengthRecordDataset`
+*   `tf.data.TextLineDataset`
+*   `tf.data.TFRecordDataset`
 
 ## Creating new datasets
 
 Static methods in `Dataset` that create new datasets.
 
-*   @{tf.data.Dataset.from_generator}
-*   @{tf.data.Dataset.from_tensor_slices}
-*   @{tf.data.Dataset.from_tensors}
-*   @{tf.data.Dataset.list_files}
-*   @{tf.data.Dataset.range}
-*   @{tf.data.Dataset.zip}
+*   `tf.data.Dataset.from_generator`
+*   `tf.data.Dataset.from_tensor_slices`
+*   `tf.data.Dataset.from_tensors`
+*   `tf.data.Dataset.list_files`
+*   `tf.data.Dataset.range`
+*   `tf.data.Dataset.zip`
 
 ## Transformations on existing datasets
 
@@ -32,54 +32,54 @@
 train_data = train_data.batch(100).shuffle().repeat()
 ```
 
-*   @{tf.data.Dataset.apply}
-*   @{tf.data.Dataset.batch}
-*   @{tf.data.Dataset.cache}
-*   @{tf.data.Dataset.concatenate}
-*   @{tf.data.Dataset.filter}
-*   @{tf.data.Dataset.flat_map}
-*   @{tf.data.Dataset.interleave}
-*   @{tf.data.Dataset.map}
-*   @{tf.data.Dataset.padded_batch}
-*   @{tf.data.Dataset.prefetch}
-*   @{tf.data.Dataset.repeat}
-*   @{tf.data.Dataset.shard}
-*   @{tf.data.Dataset.shuffle}
-*   @{tf.data.Dataset.skip}
-*   @{tf.data.Dataset.take}
+*   `tf.data.Dataset.apply`
+*   `tf.data.Dataset.batch`
+*   `tf.data.Dataset.cache`
+*   `tf.data.Dataset.concatenate`
+*   `tf.data.Dataset.filter`
+*   `tf.data.Dataset.flat_map`
+*   `tf.data.Dataset.interleave`
+*   `tf.data.Dataset.map`
+*   `tf.data.Dataset.padded_batch`
+*   `tf.data.Dataset.prefetch`
+*   `tf.data.Dataset.repeat`
+*   `tf.data.Dataset.shard`
+*   `tf.data.Dataset.shuffle`
+*   `tf.data.Dataset.skip`
+*   `tf.data.Dataset.take`
 
 ### Custom transformation functions
 
-Custom transformation functions can be applied to a `Dataset` using @{tf.data.Dataset.apply}. Below are custom transformation functions from `tf.contrib.data`:
+Custom transformation functions can be applied to a `Dataset` using `tf.data.Dataset.apply`. Below are custom transformation functions from `tf.contrib.data`:
 
-*   @{tf.contrib.data.batch_and_drop_remainder}
-*   @{tf.contrib.data.dense_to_sparse_batch}
-*   @{tf.contrib.data.enumerate_dataset}
-*   @{tf.contrib.data.group_by_window}
-*   @{tf.contrib.data.ignore_errors}
-*   @{tf.contrib.data.map_and_batch}
-*   @{tf.contrib.data.padded_batch_and_drop_remainder}
-*   @{tf.contrib.data.parallel_interleave}
-*   @{tf.contrib.data.rejection_resample}
-*   @{tf.contrib.data.scan}
-*   @{tf.contrib.data.shuffle_and_repeat}
-*   @{tf.contrib.data.unbatch}
+*   `tf.contrib.data.batch_and_drop_remainder`
+*   `tf.contrib.data.dense_to_sparse_batch`
+*   `tf.contrib.data.enumerate_dataset`
+*   `tf.contrib.data.group_by_window`
+*   `tf.contrib.data.ignore_errors`
+*   `tf.contrib.data.map_and_batch`
+*   `tf.contrib.data.padded_batch_and_drop_remainder`
+*   `tf.contrib.data.parallel_interleave`
+*   `tf.contrib.data.rejection_resample`
+*   `tf.contrib.data.scan`
+*   `tf.contrib.data.shuffle_and_repeat`
+*   `tf.contrib.data.unbatch`
 
 ## Iterating over datasets
 
-These functions make a @{tf.data.Iterator} from a `Dataset`.
+These functions make a `tf.data.Iterator` from a `Dataset`.
 
-*   @{tf.data.Dataset.make_initializable_iterator}
-*   @{tf.data.Dataset.make_one_shot_iterator}
+*   `tf.data.Dataset.make_initializable_iterator`
+*   `tf.data.Dataset.make_one_shot_iterator`
 
-The `Iterator` class also contains static methods that create a @{tf.data.Iterator} that can be used with multiple `Dataset` objects.
+The `Iterator` class also contains static methods that create a `tf.data.Iterator` that can be used with multiple `Dataset` objects.
 
-*   @{tf.data.Iterator.from_structure}
-*   @{tf.data.Iterator.from_string_handle}
+*   `tf.data.Iterator.from_structure`
+*   `tf.data.Iterator.from_string_handle`
 
 ## Extra functions from `tf.contrib.data`
 
-*   @{tf.contrib.data.get_single_element}
-*   @{tf.contrib.data.make_saveable_from_iterator}
-*   @{tf.contrib.data.read_batch_features}
+*   `tf.contrib.data.get_single_element`
+*   `tf.contrib.data.make_saveable_from_iterator`
+*   `tf.contrib.data.read_batch_features`
 
diff --git a/tensorflow/docs_src/api_guides/python/io_ops.md b/tensorflow/docs_src/api_guides/python/io_ops.md
index 86b4b39..d7ce6fd 100644
--- a/tensorflow/docs_src/api_guides/python/io_ops.md
+++ b/tensorflow/docs_src/api_guides/python/io_ops.md
@@ -1,91 +1,91 @@
 # Inputs and Readers
 
 Note: Functions taking `Tensor` arguments can also take anything accepted by
-@{tf.convert_to_tensor}.
+`tf.convert_to_tensor`.
 
 [TOC]
 
 ## Placeholders
 
 TensorFlow provides a placeholder operation that must be fed with data
-on execution.  For more info, see the section on @{$reading_data#Feeding$Feeding data}.
+on execution.  For more info, see the section on [Feeding data](../../api_guides/python/reading_data.md#Feeding).
 
-*   @{tf.placeholder}
-*   @{tf.placeholder_with_default}
+*   `tf.placeholder`
+*   `tf.placeholder_with_default`
 
 For feeding `SparseTensor`s which are composite type,
 there is a convenience function:
 
-*   @{tf.sparse_placeholder}
+*   `tf.sparse_placeholder`
 
 ## Readers
 
 TensorFlow provides a set of Reader classes for reading data formats.
-For more information on inputs and readers, see @{$reading_data$Reading data}.
+For more information on inputs and readers, see [Reading data](../../api_guides/python/reading_data.md).
 
-*   @{tf.ReaderBase}
-*   @{tf.TextLineReader}
-*   @{tf.WholeFileReader}
-*   @{tf.IdentityReader}
-*   @{tf.TFRecordReader}
-*   @{tf.FixedLengthRecordReader}
+*   `tf.ReaderBase`
+*   `tf.TextLineReader`
+*   `tf.WholeFileReader`
+*   `tf.IdentityReader`
+*   `tf.TFRecordReader`
+*   `tf.FixedLengthRecordReader`
 
 ## Converting
 
 TensorFlow provides several operations that you can use to convert various data
 formats into tensors.
 
-*   @{tf.decode_csv}
-*   @{tf.decode_raw}
+*   `tf.decode_csv`
+*   `tf.decode_raw`
 
 - - -
 
 ### Example protocol buffer
 
-TensorFlow's @{$reading_data#standard_tensorflow_format$recommended format for training examples}
+TensorFlow's [recommended format for training examples](../../api_guides/python/reading_data.md#standard_tensorflow_format)
 is serialized `Example` protocol buffers, [described
 here](https://www.tensorflow.org/code/tensorflow/core/example/example.proto).
 They contain `Features`, [described
 here](https://www.tensorflow.org/code/tensorflow/core/example/feature.proto).
 
-*   @{tf.VarLenFeature}
-*   @{tf.FixedLenFeature}
-*   @{tf.FixedLenSequenceFeature}
-*   @{tf.SparseFeature}
-*   @{tf.parse_example}
-*   @{tf.parse_single_example}
-*   @{tf.parse_tensor}
-*   @{tf.decode_json_example}
+*   `tf.VarLenFeature`
+*   `tf.FixedLenFeature`
+*   `tf.FixedLenSequenceFeature`
+*   `tf.SparseFeature`
+*   `tf.parse_example`
+*   `tf.parse_single_example`
+*   `tf.parse_tensor`
+*   `tf.decode_json_example`
 
 ## Queues
 
 TensorFlow provides several implementations of 'Queues', which are
 structures within the TensorFlow computation graph to stage pipelines
 of tensors together. The following describe the basic Queue interface
-and some implementations.  To see an example use, see @{$threading_and_queues$Threading and Queues}.
+and some implementations.  To see an example use, see [Threading and Queues](../../api_guides/python/threading_and_queues.md).
 
-*   @{tf.QueueBase}
-*   @{tf.FIFOQueue}
-*   @{tf.PaddingFIFOQueue}
-*   @{tf.RandomShuffleQueue}
-*   @{tf.PriorityQueue}
+*   `tf.QueueBase`
+*   `tf.FIFOQueue`
+*   `tf.PaddingFIFOQueue`
+*   `tf.RandomShuffleQueue`
+*   `tf.PriorityQueue`
 
 ## Conditional Accumulators
 
-*   @{tf.ConditionalAccumulatorBase}
-*   @{tf.ConditionalAccumulator}
-*   @{tf.SparseConditionalAccumulator}
+*   `tf.ConditionalAccumulatorBase`
+*   `tf.ConditionalAccumulator`
+*   `tf.SparseConditionalAccumulator`
 
 ## Dealing with the filesystem
 
-*   @{tf.matching_files}
-*   @{tf.read_file}
-*   @{tf.write_file}
+*   `tf.matching_files`
+*   `tf.read_file`
+*   `tf.write_file`
 
 ## Input pipeline
 
 TensorFlow functions for setting up an input-prefetching pipeline.
-Please see the @{$reading_data$reading data how-to}
+Please see the [reading data how-to](../../api_guides/python/reading_data.md)
 for context.
 
 ### Beginning of an input pipeline
@@ -93,12 +93,12 @@
 The "producer" functions add a queue to the graph and a corresponding
 `QueueRunner` for running the subgraph that fills that queue.
 
-*   @{tf.train.match_filenames_once}
-*   @{tf.train.limit_epochs}
-*   @{tf.train.input_producer}
-*   @{tf.train.range_input_producer}
-*   @{tf.train.slice_input_producer}
-*   @{tf.train.string_input_producer}
+*   `tf.train.match_filenames_once`
+*   `tf.train.limit_epochs`
+*   `tf.train.input_producer`
+*   `tf.train.range_input_producer`
+*   `tf.train.slice_input_producer`
+*   `tf.train.string_input_producer`
 
 ### Batching at the end of an input pipeline
 
@@ -106,25 +106,25 @@
 examples, with possible shuffling.  They also add a `QueueRunner` for
 running the subgraph that fills that queue.
 
-Use @{tf.train.batch} or @{tf.train.batch_join} for batching
+Use `tf.train.batch` or `tf.train.batch_join` for batching
 examples that have already been well shuffled.  Use
-@{tf.train.shuffle_batch} or
-@{tf.train.shuffle_batch_join} for examples that would
+`tf.train.shuffle_batch` or
+`tf.train.shuffle_batch_join` for examples that would
 benefit from additional shuffling.
 
-Use @{tf.train.batch} or @{tf.train.shuffle_batch} if you want a
+Use `tf.train.batch` or `tf.train.shuffle_batch` if you want a
 single thread producing examples to batch, or if you have a
 single subgraph producing examples but you want to run it in *N* threads
 (where you increase *N* until it can keep the queue full).  Use
-@{tf.train.batch_join} or @{tf.train.shuffle_batch_join}
+`tf.train.batch_join` or `tf.train.shuffle_batch_join`
 if you have *N* different subgraphs producing examples to batch and you
 want them run by *N* threads. Use `maybe_*` to enqueue conditionally.
 
-*   @{tf.train.batch}
-*   @{tf.train.maybe_batch}
-*   @{tf.train.batch_join}
-*   @{tf.train.maybe_batch_join}
-*   @{tf.train.shuffle_batch}
-*   @{tf.train.maybe_shuffle_batch}
-*   @{tf.train.shuffle_batch_join}
-*   @{tf.train.maybe_shuffle_batch_join}
+*   `tf.train.batch`
+*   `tf.train.maybe_batch`
+*   `tf.train.batch_join`
+*   `tf.train.maybe_batch_join`
+*   `tf.train.shuffle_batch`
+*   `tf.train.maybe_shuffle_batch`
+*   `tf.train.shuffle_batch_join`
+*   `tf.train.maybe_shuffle_batch_join`
diff --git a/tensorflow/docs_src/api_guides/python/math_ops.md b/tensorflow/docs_src/api_guides/python/math_ops.md
index dee7f16..e738161 100644
--- a/tensorflow/docs_src/api_guides/python/math_ops.md
+++ b/tensorflow/docs_src/api_guides/python/math_ops.md
@@ -1,7 +1,7 @@
 # Math
 
 Note: Functions taking `Tensor` arguments can also take anything accepted by
-@{tf.convert_to_tensor}.
+`tf.convert_to_tensor`.
 
 [TOC]
 
@@ -13,97 +13,97 @@
 TensorFlow provides several operations that you can use to add basic arithmetic
 operators to your graph.
 
-*   @{tf.add}
-*   @{tf.subtract}
-*   @{tf.multiply}
-*   @{tf.scalar_mul}
-*   @{tf.div}
-*   @{tf.divide}
-*   @{tf.truediv}
-*   @{tf.floordiv}
-*   @{tf.realdiv}
-*   @{tf.truncatediv}
-*   @{tf.floor_div}
-*   @{tf.truncatemod}
-*   @{tf.floormod}
-*   @{tf.mod}
-*   @{tf.cross}
+*   `tf.add`
+*   `tf.subtract`
+*   `tf.multiply`
+*   `tf.scalar_mul`
+*   `tf.div`
+*   `tf.divide`
+*   `tf.truediv`
+*   `tf.floordiv`
+*   `tf.realdiv`
+*   `tf.truncatediv`
+*   `tf.floor_div`
+*   `tf.truncatemod`
+*   `tf.floormod`
+*   `tf.mod`
+*   `tf.cross`
 
 ## Basic Math Functions
 
 TensorFlow provides several operations that you can use to add basic
 mathematical functions to your graph.
 
-*   @{tf.add_n}
-*   @{tf.abs}
-*   @{tf.negative}
-*   @{tf.sign}
-*   @{tf.reciprocal}
-*   @{tf.square}
-*   @{tf.round}
-*   @{tf.sqrt}
-*   @{tf.rsqrt}
-*   @{tf.pow}
-*   @{tf.exp}
-*   @{tf.expm1}
-*   @{tf.log}
-*   @{tf.log1p}
-*   @{tf.ceil}
-*   @{tf.floor}
-*   @{tf.maximum}
-*   @{tf.minimum}
-*   @{tf.cos}
-*   @{tf.sin}
-*   @{tf.lbeta}
-*   @{tf.tan}
-*   @{tf.acos}
-*   @{tf.asin}
-*   @{tf.atan}
-*   @{tf.cosh}
-*   @{tf.sinh}
-*   @{tf.asinh}
-*   @{tf.acosh}
-*   @{tf.atanh}
-*   @{tf.lgamma}
-*   @{tf.digamma}
-*   @{tf.erf}
-*   @{tf.erfc}
-*   @{tf.squared_difference}
-*   @{tf.igamma}
-*   @{tf.igammac}
-*   @{tf.zeta}
-*   @{tf.polygamma}
-*   @{tf.betainc}
-*   @{tf.rint}
+*   `tf.add_n`
+*   `tf.abs`
+*   `tf.negative`
+*   `tf.sign`
+*   `tf.reciprocal`
+*   `tf.square`
+*   `tf.round`
+*   `tf.sqrt`
+*   `tf.rsqrt`
+*   `tf.pow`
+*   `tf.exp`
+*   `tf.expm1`
+*   `tf.log`
+*   `tf.log1p`
+*   `tf.ceil`
+*   `tf.floor`
+*   `tf.maximum`
+*   `tf.minimum`
+*   `tf.cos`
+*   `tf.sin`
+*   `tf.lbeta`
+*   `tf.tan`
+*   `tf.acos`
+*   `tf.asin`
+*   `tf.atan`
+*   `tf.cosh`
+*   `tf.sinh`
+*   `tf.asinh`
+*   `tf.acosh`
+*   `tf.atanh`
+*   `tf.lgamma`
+*   `tf.digamma`
+*   `tf.erf`
+*   `tf.erfc`
+*   `tf.squared_difference`
+*   `tf.igamma`
+*   `tf.igammac`
+*   `tf.zeta`
+*   `tf.polygamma`
+*   `tf.betainc`
+*   `tf.rint`
 
 ## Matrix Math Functions
 
 TensorFlow provides several operations that you can use to add linear algebra
 functions on matrices to your graph.
 
-*   @{tf.diag}
-*   @{tf.diag_part}
-*   @{tf.trace}
-*   @{tf.transpose}
-*   @{tf.eye}
-*   @{tf.matrix_diag}
-*   @{tf.matrix_diag_part}
-*   @{tf.matrix_band_part}
-*   @{tf.matrix_set_diag}
-*   @{tf.matrix_transpose}
-*   @{tf.matmul}
-*   @{tf.norm}
-*   @{tf.matrix_determinant}
-*   @{tf.matrix_inverse}
-*   @{tf.cholesky}
-*   @{tf.cholesky_solve}
-*   @{tf.matrix_solve}
-*   @{tf.matrix_triangular_solve}
-*   @{tf.matrix_solve_ls}
-*   @{tf.qr}
-*   @{tf.self_adjoint_eig}
-*   @{tf.self_adjoint_eigvals}
-*   @{tf.svd}
+*   `tf.diag`
+*   `tf.diag_part`
+*   `tf.trace`
+*   `tf.transpose`
+*   `tf.eye`
+*   `tf.matrix_diag`
+*   `tf.matrix_diag_part`
+*   `tf.matrix_band_part`
+*   `tf.matrix_set_diag`
+*   `tf.matrix_transpose`
+*   `tf.matmul`
+*   `tf.norm`
+*   `tf.matrix_determinant`
+*   `tf.matrix_inverse`
+*   `tf.cholesky`
+*   `tf.cholesky_solve`
+*   `tf.matrix_solve`
+*   `tf.matrix_triangular_solve`
+*   `tf.matrix_solve_ls`
+*   `tf.qr`
+*   `tf.self_adjoint_eig`
+*   `tf.self_adjoint_eigvals`
+*   `tf.svd`
 
 
 ## Tensor Math Function
@@ -111,7 +111,7 @@
 TensorFlow provides operations that you can use to add tensor functions to your
 graph.
 
-*   @{tf.tensordot}
+*   `tf.tensordot`
 
 
 ## Complex Number Functions
@@ -119,11 +119,11 @@
 TensorFlow provides several operations that you can use to add complex number
 functions to your graph.
 
-*   @{tf.complex}
-*   @{tf.conj}
-*   @{tf.imag}
-*   @{tf.angle}
-*   @{tf.real}
+*   `tf.complex`
+*   `tf.conj`
+*   `tf.imag`
+*   `tf.angle`
+*   `tf.real`
 
 
 ## Reduction
@@ -131,25 +131,25 @@
 TensorFlow provides several operations that you can use to perform
 common math computations that reduce various dimensions of a tensor.
 
-*   @{tf.reduce_sum}
-*   @{tf.reduce_prod}
-*   @{tf.reduce_min}
-*   @{tf.reduce_max}
-*   @{tf.reduce_mean}
-*   @{tf.reduce_all}
-*   @{tf.reduce_any}
-*   @{tf.reduce_logsumexp}
-*   @{tf.count_nonzero}
-*   @{tf.accumulate_n}
-*   @{tf.einsum}
+*   `tf.reduce_sum`
+*   `tf.reduce_prod`
+*   `tf.reduce_min`
+*   `tf.reduce_max`
+*   `tf.reduce_mean`
+*   `tf.reduce_all`
+*   `tf.reduce_any`
+*   `tf.reduce_logsumexp`
+*   `tf.count_nonzero`
+*   `tf.accumulate_n`
+*   `tf.einsum`
 
 ## Scan
 
 TensorFlow provides several operations that you can use to perform scans
 (running totals) across one axis of a tensor.
 
-*   @{tf.cumsum}
-*   @{tf.cumprod}
+*   `tf.cumsum`
+*   `tf.cumprod`
 
 ## Segmentation
 
@@ -172,15 +172,15 @@
         [5 6 7 8]]
 ```
 
-*   @{tf.segment_sum}
-*   @{tf.segment_prod}
-*   @{tf.segment_min}
-*   @{tf.segment_max}
-*   @{tf.segment_mean}
-*   @{tf.unsorted_segment_sum}
-*   @{tf.sparse_segment_sum}
-*   @{tf.sparse_segment_mean}
-*   @{tf.sparse_segment_sqrt_n}
+*   `tf.segment_sum`
+*   `tf.segment_prod`
+*   `tf.segment_min`
+*   `tf.segment_max`
+*   `tf.segment_mean`
+*   `tf.unsorted_segment_sum`
+*   `tf.sparse_segment_sum`
+*   `tf.sparse_segment_mean`
+*   `tf.sparse_segment_sqrt_n`
 
 
 ## Sequence Comparison and Indexing
@@ -190,10 +190,10 @@
 determine sequence differences and determine the indexes of specific values in
 a tensor.
 
-*   @{tf.argmin}
-*   @{tf.argmax}
-*   @{tf.setdiff1d}
-*   @{tf.where}
-*   @{tf.unique}
-*   @{tf.edit_distance}
-*   @{tf.invert_permutation}
+*   `tf.argmin`
+*   `tf.argmax`
+*   `tf.setdiff1d`
+*   `tf.where`
+*   `tf.unique`
+*   `tf.edit_distance`
+*   `tf.invert_permutation`
diff --git a/tensorflow/docs_src/api_guides/python/meta_graph.md b/tensorflow/docs_src/api_guides/python/meta_graph.md
index f1c3adc..5e8a8b4 100644
--- a/tensorflow/docs_src/api_guides/python/meta_graph.md
+++ b/tensorflow/docs_src/api_guides/python/meta_graph.md
@@ -7,10 +7,10 @@
 to continue training, perform evaluation, or run inference on a previously trained graph.
 
 The APIs for exporting and importing the complete model are in
-the @{tf.train.Saver} class:
-@{tf.train.export_meta_graph}
+the `tf.train.Saver` class:
+`tf.train.export_meta_graph`
 and
-@{tf.train.import_meta_graph}.
+`tf.train.import_meta_graph`.
 
 ## What's in a MetaGraph
 
@@ -23,8 +23,8 @@
 * [`SaverDef`](https://www.tensorflow.org/code/tensorflow/core/protobuf/saver.proto) for the saver.
 * [`CollectionDef`](https://www.tensorflow.org/code/tensorflow/core/protobuf/meta_graph.proto)
 map that further describes additional components of the model such as
-@{$python/state_ops$`Variables`},
-@{tf.train.QueueRunner}, etc.
+[`Variables`](../../api_guides/python/state_ops.md),
+`tf.train.QueueRunner`, etc.
 
 In order for a Python object to be serialized
 to and from `MetaGraphDef`, the Python class must implement `to_proto()` and
@@ -122,7 +122,7 @@
 
 
 The MetaGraph is also automatically exported via the `save()` API in
-@{tf.train.Saver}.
+`tf.train.Saver`.
 
 
 ## Import a MetaGraph
diff --git a/tensorflow/docs_src/api_guides/python/nn.md b/tensorflow/docs_src/api_guides/python/nn.md
index 8d8daaa..40dda39 100644
--- a/tensorflow/docs_src/api_guides/python/nn.md
+++ b/tensorflow/docs_src/api_guides/python/nn.md
@@ -1,7 +1,7 @@
 # Neural Network
 
 Note: Functions taking `Tensor` arguments can also take anything accepted by
-@{tf.convert_to_tensor}.
+`tf.convert_to_tensor`.
 
 [TOC]
 
@@ -16,17 +16,17 @@
 All activation ops apply componentwise, and produce a tensor of the same
 shape as the input tensor.
 
-*   @{tf.nn.relu}
-*   @{tf.nn.relu6}
-*   @{tf.nn.crelu}
-*   @{tf.nn.elu}
-*   @{tf.nn.selu}
-*   @{tf.nn.softplus}
-*   @{tf.nn.softsign}
-*   @{tf.nn.dropout}
-*   @{tf.nn.bias_add}
-*   @{tf.sigmoid}
-*   @{tf.tanh}
+*   `tf.nn.relu`
+*   `tf.nn.relu6`
+*   `tf.nn.crelu`
+*   `tf.nn.elu`
+*   `tf.nn.selu`
+*   `tf.nn.softplus`
+*   `tf.nn.softsign`
+*   `tf.nn.dropout`
+*   `tf.nn.bias_add`
+*   `tf.sigmoid`
+*   `tf.tanh`
 
 ## Convolution
 
@@ -112,22 +112,22 @@
 is multiplied by a vector `filter[di, dj, k]`, and all the vectors are
 concatenated.
 
-*   @{tf.nn.convolution}
-*   @{tf.nn.conv2d}
-*   @{tf.nn.depthwise_conv2d}
-*   @{tf.nn.depthwise_conv2d_native}
-*   @{tf.nn.separable_conv2d}
-*   @{tf.nn.atrous_conv2d}
-*   @{tf.nn.atrous_conv2d_transpose}
-*   @{tf.nn.conv2d_transpose}
-*   @{tf.nn.conv1d}
-*   @{tf.nn.conv3d}
-*   @{tf.nn.conv3d_transpose}
-*   @{tf.nn.conv2d_backprop_filter}
-*   @{tf.nn.conv2d_backprop_input}
-*   @{tf.nn.conv3d_backprop_filter_v2}
-*   @{tf.nn.depthwise_conv2d_native_backprop_filter}
-*   @{tf.nn.depthwise_conv2d_native_backprop_input}
+*   `tf.nn.convolution`
+*   `tf.nn.conv2d`
+*   `tf.nn.depthwise_conv2d`
+*   `tf.nn.depthwise_conv2d_native`
+*   `tf.nn.separable_conv2d`
+*   `tf.nn.atrous_conv2d`
+*   `tf.nn.atrous_conv2d_transpose`
+*   `tf.nn.conv2d_transpose`
+*   `tf.nn.conv1d`
+*   `tf.nn.conv3d`
+*   `tf.nn.conv3d_transpose`
+*   `tf.nn.conv2d_backprop_filter`
+*   `tf.nn.conv2d_backprop_input`
+*   `tf.nn.conv3d_backprop_filter_v2`
+*   `tf.nn.depthwise_conv2d_native_backprop_filter`
+*   `tf.nn.depthwise_conv2d_native_backprop_input`
 
 ## Pooling
 
@@ -144,14 +144,14 @@
 where the indices also take into consideration the padding values. Please refer
 to the `Convolution` section for details about the padding calculation.
 
-*   @{tf.nn.avg_pool}
-*   @{tf.nn.max_pool}
-*   @{tf.nn.max_pool_with_argmax}
-*   @{tf.nn.avg_pool3d}
-*   @{tf.nn.max_pool3d}
-*   @{tf.nn.fractional_avg_pool}
-*   @{tf.nn.fractional_max_pool}
-*   @{tf.nn.pool}
+*   `tf.nn.avg_pool`
+*   `tf.nn.max_pool`
+*   `tf.nn.max_pool_with_argmax`
+*   `tf.nn.avg_pool3d`
+*   `tf.nn.max_pool3d`
+*   `tf.nn.fractional_avg_pool`
+*   `tf.nn.fractional_max_pool`
+*   `tf.nn.pool`
 
 ## Morphological filtering
 
@@ -190,24 +190,24 @@
 Striding and padding is carried out in exactly the same way as in standard
 convolution. Please refer to the `Convolution` section for details.
 
-*   @{tf.nn.dilation2d}
-*   @{tf.nn.erosion2d}
-*   @{tf.nn.with_space_to_batch}
+*   `tf.nn.dilation2d`
+*   `tf.nn.erosion2d`
+*   `tf.nn.with_space_to_batch`
 
 ## Normalization
 
 Normalization is useful to prevent neurons from saturating when inputs may
 have varying scale, and to aid generalization.
 
-*   @{tf.nn.l2_normalize}
-*   @{tf.nn.local_response_normalization}
-*   @{tf.nn.sufficient_statistics}
-*   @{tf.nn.normalize_moments}
-*   @{tf.nn.moments}
-*   @{tf.nn.weighted_moments}
-*   @{tf.nn.fused_batch_norm}
-*   @{tf.nn.batch_normalization}
-*   @{tf.nn.batch_norm_with_global_normalization}
+*   `tf.nn.l2_normalize`
+*   `tf.nn.local_response_normalization`
+*   `tf.nn.sufficient_statistics`
+*   `tf.nn.normalize_moments`
+*   `tf.nn.moments`
+*   `tf.nn.weighted_moments`
+*   `tf.nn.fused_batch_norm`
+*   `tf.nn.batch_normalization`
+*   `tf.nn.batch_norm_with_global_normalization`
 
 ## Losses
 
@@ -215,29 +215,29 @@
 These can be used for measuring accuracy of a network in a regression task
 or for regularization purposes (weight decay).
 
-*   @{tf.nn.l2_loss}
-*   @{tf.nn.log_poisson_loss}
+*   `tf.nn.l2_loss`
+*   `tf.nn.log_poisson_loss`
 
 ## Classification
 
 TensorFlow provides several operations that help you perform classification.
 
-*   @{tf.nn.sigmoid_cross_entropy_with_logits}
-*   @{tf.nn.softmax}
-*   @{tf.nn.log_softmax}
-*   @{tf.nn.softmax_cross_entropy_with_logits}
-*   @{tf.nn.softmax_cross_entropy_with_logits_v2} - identical to the base
+*   `tf.nn.sigmoid_cross_entropy_with_logits`
+*   `tf.nn.softmax`
+*   `tf.nn.log_softmax`
+*   `tf.nn.softmax_cross_entropy_with_logits`
+*   `tf.nn.softmax_cross_entropy_with_logits_v2` - identical to the base
     version, except it allows gradient propagation into the labels.
-*   @{tf.nn.sparse_softmax_cross_entropy_with_logits}
-*   @{tf.nn.weighted_cross_entropy_with_logits}
+*   `tf.nn.sparse_softmax_cross_entropy_with_logits`
+*   `tf.nn.weighted_cross_entropy_with_logits`
 
 ## Embeddings
 
 TensorFlow provides library support for looking up values in embedding
 tensors.
 
-*   @{tf.nn.embedding_lookup}
-*   @{tf.nn.embedding_lookup_sparse}
+*   `tf.nn.embedding_lookup`
+*   `tf.nn.embedding_lookup_sparse`
 
 ## Recurrent Neural Networks
 
@@ -245,23 +245,23 @@
 Neural Networks.  Most accept an `RNNCell`-subclassed object
 (see the documentation for `tf.contrib.rnn`).
 
-*   @{tf.nn.dynamic_rnn}
-*   @{tf.nn.bidirectional_dynamic_rnn}
-*   @{tf.nn.raw_rnn}
+*   `tf.nn.dynamic_rnn`
+*   `tf.nn.bidirectional_dynamic_rnn`
+*   `tf.nn.raw_rnn`
 
 ## Connectionist Temporal Classification (CTC)
 
-*   @{tf.nn.ctc_loss}
-*   @{tf.nn.ctc_greedy_decoder}
-*   @{tf.nn.ctc_beam_search_decoder}
+*   `tf.nn.ctc_loss`
+*   `tf.nn.ctc_greedy_decoder`
+*   `tf.nn.ctc_beam_search_decoder`
 
 ## Evaluation
 
 The evaluation ops are useful for measuring the performance of a network.
 They are typically used at evaluation time.
 
-*   @{tf.nn.top_k}
-*   @{tf.nn.in_top_k}
+*   `tf.nn.top_k`
+*   `tf.nn.in_top_k`
 
 ## Candidate Sampling
 
@@ -281,29 +281,29 @@
 
 TensorFlow provides the following sampled loss functions for faster training.
 
-*   @{tf.nn.nce_loss}
-*   @{tf.nn.sampled_softmax_loss}
+*   `tf.nn.nce_loss`
+*   `tf.nn.sampled_softmax_loss`
 
 ### Candidate Samplers
 
 TensorFlow provides the following samplers for randomly sampling candidate
 classes when using one of the sampled loss functions above.
 
-*   @{tf.nn.uniform_candidate_sampler}
-*   @{tf.nn.log_uniform_candidate_sampler}
-*   @{tf.nn.learned_unigram_candidate_sampler}
-*   @{tf.nn.fixed_unigram_candidate_sampler}
+*   `tf.nn.uniform_candidate_sampler`
+*   `tf.nn.log_uniform_candidate_sampler`
+*   `tf.nn.learned_unigram_candidate_sampler`
+*   `tf.nn.fixed_unigram_candidate_sampler`
 
 ### Miscellaneous candidate sampling utilities
 
-*   @{tf.nn.compute_accidental_hits}
+*   `tf.nn.compute_accidental_hits`
 
 ### Quantization ops
 
-*   @{tf.nn.quantized_conv2d}
-*   @{tf.nn.quantized_relu_x}
-*   @{tf.nn.quantized_max_pool}
-*   @{tf.nn.quantized_avg_pool}
+*   `tf.nn.quantized_conv2d`
+*   `tf.nn.quantized_relu_x`
+*   `tf.nn.quantized_max_pool`
+*   `tf.nn.quantized_avg_pool`
 
 ## Notes on SAME Convolution Padding
 
diff --git a/tensorflow/docs_src/api_guides/python/python_io.md b/tensorflow/docs_src/api_guides/python/python_io.md
index 06282e4..e7e82a8 100644
--- a/tensorflow/docs_src/api_guides/python/python_io.md
+++ b/tensorflow/docs_src/api_guides/python/python_io.md
@@ -5,10 +5,10 @@
 random access, so it is suitable for streaming large amounts of data but not
 suitable if fast sharding or other non-sequential access is desired.
 
-*   @{tf.python_io.TFRecordWriter}
-*   @{tf.python_io.tf_record_iterator}
-*   @{tf.python_io.TFRecordCompressionType}
-*   @{tf.python_io.TFRecordOptions}
+*   `tf.python_io.TFRecordWriter`
+*   `tf.python_io.tf_record_iterator`
+*   `tf.python_io.TFRecordCompressionType`
+*   `tf.python_io.TFRecordOptions`
 
 - - -
 
diff --git a/tensorflow/docs_src/api_guides/python/reading_data.md b/tensorflow/docs_src/api_guides/python/reading_data.md
index d7d0904..9f555ee 100644
--- a/tensorflow/docs_src/api_guides/python/reading_data.md
+++ b/tensorflow/docs_src/api_guides/python/reading_data.md
@@ -1,7 +1,7 @@
 # Reading data
 
 Note: The preferred way to feed data into a tensorflow program is using the
-@{$datasets$`tf.data` API}.
+[`tf.data` API](../../guide/datasets.md).
 
 There are four methods of getting data into a TensorFlow program:
 
@@ -16,7 +16,7 @@
 
 ## `tf.data` API
 
-See the @{$guide/datasets} for an in-depth explanation of @{tf.data.Dataset}.
+See the [Importing Data](../../guide/datasets.md) guide for an in-depth explanation of `tf.data.Dataset`.
 The `tf.data` API enables you to extract and preprocess data
 from different input/file formats, and apply transformations such as batching,
 shuffling, and mapping functions over the dataset. This is an improved version
@@ -44,7 +44,7 @@
 
 While you can replace any Tensor with feed data, including variables and
 constants, the best practice is to use a
-@{tf.placeholder} node. A
+`tf.placeholder` node. A
 `placeholder` exists solely to serve as the target of feeds. It is not
 initialized and contains no data. A placeholder generates an error if
 it is executed without a feed, so you won't forget to feed it.
@@ -56,8 +56,8 @@
 ## `QueueRunner`
 
 Warning: This section discusses implementing input pipelines using the
-queue-based APIs which can be cleanly replaced by the @{$datasets$`tf.data`
-API}.
+queue-based APIs which can be cleanly replaced by the [`tf.data`
+API](../../guide/datasets.md).
 
 A typical queue-based pipeline for reading records from files has the following stages:
 
@@ -74,9 +74,9 @@
 
 For the list of filenames, use either a constant string Tensor (like
 `["file0", "file1"]` or `[("file%d" % i) for i in range(2)]`) or the
-@{tf.train.match_filenames_once} function.
+`tf.train.match_filenames_once` function.
 
-Pass the list of filenames to the @{tf.train.string_input_producer} function.
+Pass the list of filenames to the `tf.train.string_input_producer` function.
 `string_input_producer` creates a FIFO queue for holding the filenames until
 the reader needs them.
 
@@ -102,8 +102,8 @@
 
 To read text files in [comma-separated value (CSV)
 format](https://tools.ietf.org/html/rfc4180), use a
-@{tf.TextLineReader} with the
-@{tf.decode_csv} operation. For example:
+`tf.TextLineReader` with the
+`tf.decode_csv` operation. For example:
 
 ```python
 filename_queue = tf.train.string_input_producer(["file0.csv", "file1.csv"])
@@ -143,8 +143,8 @@
 #### Fixed length records
 
 To read binary files in which each record is a fixed number of bytes, use
-@{tf.FixedLengthRecordReader}
-with the @{tf.decode_raw} operation.
+`tf.FixedLengthRecordReader`
+with the `tf.decode_raw` operation.
 The `decode_raw` op converts from a string to a uint8 tensor.
 
 For example, [the CIFAR-10 dataset](http://www.cs.toronto.edu/~kriz/cifar.html)
@@ -154,14 +154,14 @@
 needed. For CIFAR-10, you can see how to do the reading and decoding in
 [`tensorflow_models/tutorials/image/cifar10/cifar10_input.py`](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10/cifar10_input.py)
 and described in
-@{$deep_cnn#prepare-the-data$this tutorial}.
+[this tutorial](../../tutorials/images/deep_cnn.md#prepare-the-data).
 
 #### Standard TensorFlow format
 
 Another approach is to convert whatever data you have into a supported format.
 This approach makes it easier to mix and match data sets and network
 architectures. The recommended format for TensorFlow is a
-@{$python/python_io#tfrecords_format_details$TFRecords file}
+[TFRecords file](../../api_guides/python/python_io.md#tfrecords_format_details)
 containing
 [`tf.train.Example` protocol buffers](https://www.tensorflow.org/code/tensorflow/core/example/example.proto)
 (which contain
@@ -169,12 +169,12 @@
 as a field).  You write a little program that gets your data, stuffs it in an
 `Example` protocol buffer, serializes the protocol buffer to a string, and then
 writes the string to a TFRecords file using the
-@{tf.python_io.TFRecordWriter}.
+`tf.python_io.TFRecordWriter`.
 For example,
 [`tensorflow/examples/how_tos/reading_data/convert_to_records.py`](https://www.tensorflow.org/code/tensorflow/examples/how_tos/reading_data/convert_to_records.py)
 converts MNIST data to this format.
 
-The recommended way to read a TFRecord file is with a @{tf.data.TFRecordDataset}, [as in this example](https://www.tensorflow.org/code/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py):
+The recommended way to read a TFRecord file is with a `tf.data.TFRecordDataset`, [as in this example](https://www.tensorflow.org/code/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py):
 
 ``` python
     dataset = tf.data.TFRecordDataset(filename)
@@ -208,7 +208,7 @@
 At the end of the pipeline we use another queue to batch together examples for
 training, evaluation, or inference.  For this we use a queue that randomizes the
 order of examples, using the
-@{tf.train.shuffle_batch}.
+`tf.train.shuffle_batch`.
 
 Example:
 
@@ -240,7 +240,7 @@
 
 If you need more parallelism or shuffling of examples between files, use
 multiple reader instances using the
-@{tf.train.shuffle_batch_join}.
+`tf.train.shuffle_batch_join`.
 For example:
 
 ```
@@ -266,7 +266,7 @@
 sufficient to have a single thread filling the filename queue.)
 
 An alternative is to use a single reader via the
-@{tf.train.shuffle_batch}
+`tf.train.shuffle_batch`
 with `num_threads` bigger than 1.  This will make it read from a single file at
 the same time (but faster than with 1 thread), instead of N files at once.
 This can be important:
@@ -279,18 +279,18 @@
 How many threads do you need? the `tf.train.shuffle_batch*` functions add a
 summary to the graph that indicates how full the example queue is. If you have
 enough reading threads, that summary will stay above zero.  You can
-@{$summaries_and_tensorboard$view your summaries as training progresses using TensorBoard}.
+[view your summaries as training progresses using TensorBoard](../../guide/summaries_and_tensorboard.md).
 
 ### Creating threads to prefetch using `QueueRunner` objects
 
 The short version: many of the `tf.train` functions listed above add
-@{tf.train.QueueRunner} objects to your
+`tf.train.QueueRunner` objects to your
 graph.  These require that you call
-@{tf.train.start_queue_runners}
+`tf.train.start_queue_runners`
 before running any training or inference steps, or it will hang forever. This
 will start threads that run the input pipeline, filling the example queue so
 that the dequeue to get the examples will succeed.  This is best combined with a
-@{tf.train.Coordinator} to cleanly
+`tf.train.Coordinator` to cleanly
 shut down these threads when there are errors. If you set a limit on the number
 of epochs, that will use an epoch counter that will need to be initialized. The
 recommended code pattern combining these is:
@@ -343,32 +343,32 @@
 </div>
 
 The helpers in `tf.train` that create these queues and enqueuing operations add
-a @{tf.train.QueueRunner} to the
+a `tf.train.QueueRunner` to the
 graph using the
-@{tf.train.add_queue_runner}
+`tf.train.add_queue_runner`
 function. Each `QueueRunner` is responsible for one stage, and holds the list of
 enqueue operations that need to be run in threads. Once the graph is
 constructed, the
-@{tf.train.start_queue_runners}
+`tf.train.start_queue_runners`
 function asks each QueueRunner in the graph to start its threads running the
 enqueuing operations.
 
 If all goes well, you can now run your training steps and the queues will be
 filled by the background threads. If you have set an epoch limit, at some point
 an attempt to dequeue examples will get an
-@{tf.errors.OutOfRangeError}. This
+`tf.errors.OutOfRangeError`. This
 is the TensorFlow equivalent of "end of file" (EOF) -- this means the epoch
 limit has been reached and no more examples are available.
 
 The last ingredient is the
-@{tf.train.Coordinator}. This is responsible
+`tf.train.Coordinator`. This is responsible
 for letting all the threads know if anything has signaled a shut down. Most
 commonly this would be because an exception was raised, for example one of the
 threads got an error when running some operation (or an ordinary Python
 exception).
 
 For more about threading, queues, QueueRunners, and Coordinators
-@{$threading_and_queues$see here}.
+[see here](../../api_guides/python/threading_and_queues.md).
 
 #### Aside: How clean shut-down when limiting epochs works
 
@@ -396,21 +396,21 @@
 QueueRunner, the `OutOfRange` error just causes the one thread to exit.  This
 allows the other threads, which are still finishing up their last file, to
 proceed until they finish as well.  (Assuming you are using a
-@{tf.train.Coordinator},
+`tf.train.Coordinator`,
 other types of errors will cause all the threads to stop.)  Once all the reader
 threads hit the `OutOfRange` error, only then does the next queue, the example
 queue, gets closed.
 
 Again, the example queue will have some elements queued, so training will
 continue until those are exhausted.  If the example queue is a
-@{tf.RandomShuffleQueue}, say
+`tf.RandomShuffleQueue`, say
 because you are using `shuffle_batch` or `shuffle_batch_join`, it normally will
 avoid ever having fewer than its `min_after_dequeue` attr elements buffered.
 However, once the queue is closed that restriction will be lifted and the queue
 will eventually empty.  At that point the actual training threads, when they
 try and dequeue from example queue, will start getting `OutOfRange` errors and
 exiting.  Once all the training threads are done,
-@{tf.train.Coordinator.join}
+`tf.train.Coordinator.join`
 will return and you can exit cleanly.
 
 ### Filtering records or producing multiple examples per record
@@ -426,7 +426,7 @@
 
 SparseTensors don't play well with queues. If you use SparseTensors you have
 to decode the string records using
-@{tf.parse_example} **after**
+`tf.parse_example` **after**
 batching (instead of using `tf.parse_single_example` before batching).
 
 ## Preloaded data
@@ -475,11 +475,11 @@
 `GraphKeys.GLOBAL_VARIABLES` collection used for saving and restoring checkpoints.
 
 Either way,
-@{tf.train.slice_input_producer}
+`tf.train.slice_input_producer`
 can be used to produce a slice at a time.  This shuffles the examples across an
 entire epoch, so further shuffling when batching is undesirable.  So instead of
 using the `shuffle_batch` functions, we use the plain
-@{tf.train.batch} function.  To use
+`tf.train.batch` function.  To use
 multiple preprocessing threads, set the `num_threads` parameter to a number
 bigger than 1.
 
@@ -500,23 +500,23 @@
 * The evaluation process restores the checkpoint files into an inference
   model that reads validation input data.
 
-This is what is done @{tf.estimator$estimators} and manually in
-@{$deep_cnn#save-and-restore-checkpoints$the example CIFAR-10 model}.
+This is what is done by `tf.estimator` and manually in
+[the example CIFAR-10 model](../../tutorials/images/deep_cnn.md#save-and-restore-checkpoints).
 This has a couple of benefits:
 
 * The eval is performed on a single snapshot of the trained variables.
 * You can perform the eval even after training has completed and exited.
 
 You can have the train and eval in the same graph in the same process, and share
-their trained variables or layers. See @{$variables$the shared variables tutorial}.
+their trained variables or layers. See [the shared variables tutorial](../../guide/variables.md).
 
 To support the single-graph approach
-@{$guide/datasets$`tf.data`} also supplies
-@{$guide/datasets#creating_an_iterator$advanced iterator types} that
+[`tf.data`](../../guide/datasets.md) also supplies
+[advanced iterator types](../../guide/datasets.md#creating_an_iterator)
 that allow the user to change the input pipeline without rebuilding the graph or
 session.
 
 Note: Regardless of the implementation, many
-operations (like @{tf.layers.batch_normalization}, and @{tf.layers.dropout})
+operations (like `tf.layers.batch_normalization`, and `tf.layers.dropout`)
 need to know if they are in training or evaluation mode, and you must be
 careful to set this appropriately if you change the data source.
diff --git a/tensorflow/docs_src/api_guides/python/regression_examples.md b/tensorflow/docs_src/api_guides/python/regression_examples.md
index 7de2be0..d67f38f 100644
--- a/tensorflow/docs_src/api_guides/python/regression_examples.md
+++ b/tensorflow/docs_src/api_guides/python/regression_examples.md
@@ -8,25 +8,25 @@
 
   <tr>
     <td><a href="https://www.tensorflow.org/code/tensorflow/examples/get_started/regression/linear_regression.py">linear_regression.py</a></td>
-    <td>Use the @{tf.estimator.LinearRegressor} Estimator to train a
+    <td>Use the `tf.estimator.LinearRegressor` Estimator to train a
         regression model on numeric data.</td>
   </tr>
 
   <tr>
     <td><a href="https://www.tensorflow.org/code/tensorflow/examples/get_started/regression/linear_regression_categorical.py">linear_regression_categorical.py</a></td>
-    <td>Use the @{tf.estimator.LinearRegressor} Estimator to train a
+    <td>Use the `tf.estimator.LinearRegressor` Estimator to train a
         regression model on categorical data.</td>
   </tr>
 
   <tr>
     <td><a href="https://www.tensorflow.org/code/tensorflow/examples/get_started/regression/dnn_regression.py">dnn_regression.py</a></td>
-    <td>Use the @{tf.estimator.DNNRegressor} Estimator to train a
+    <td>Use the `tf.estimator.DNNRegressor` Estimator to train a
         regression model on discrete data with a deep neural network.</td>
   </tr>
 
   <tr>
     <td><a href="https://www.tensorflow.org/code/tensorflow/examples/get_started/regression/custom_regression.py">custom_regression.py</a></td>
-    <td>Use @{tf.estimator.Estimator} to train a customized dnn
+    <td>Use `tf.estimator.Estimator` to train a customized dnn
         regression model.</td>
   </tr>
 
@@ -66,7 +66,7 @@
 <a name="running"></a>
 ## Running the examples
 
-You must @{$install$install TensorFlow} prior to running these examples.
+You must [install TensorFlow](../../install/index.md) prior to running these examples.
 Depending on the way you've installed TensorFlow, you might also
 need to activate your TensorFlow environment.  Then, do the following:
 
@@ -219,7 +219,7 @@
 of a car based on mixed real-valued and categorical input features, described by
 feature_columns. Unlike `linear_regression_categorical.py`, and
 `dnn_regression.py` this example does not use a pre-made estimator, but defines
-a custom model using the base @{tf.estimator.Estimator$`Estimator`} class. The
+a custom model using the base `tf.estimator.Estimator` class. The
 custom model is quite similar to the model defined by `dnn_regression.py`.
 
 The custom model is defined by the `model_fn` argument to the constructor. The
@@ -227,6 +227,6 @@
 passed through to the `model_fn` when the `model_fn` is called.
 
 The `model_fn` returns an
-@{tf.estimator.EstimatorSpec$`EstimatorSpec`} which is a simple structure
+`tf.estimator.EstimatorSpec` which is a simple structure
 indicating to the `Estimator` which operations should be run to accomplish
 various tasks.
diff --git a/tensorflow/docs_src/api_guides/python/session_ops.md b/tensorflow/docs_src/api_guides/python/session_ops.md
index 5176e35..5f41bcf 100644
--- a/tensorflow/docs_src/api_guides/python/session_ops.md
+++ b/tensorflow/docs_src/api_guides/python/session_ops.md
@@ -1,7 +1,7 @@
 # Tensor Handle Operations
 
 Note: Functions taking `Tensor` arguments can also take anything accepted by
-@{tf.convert_to_tensor}.
+`tf.convert_to_tensor`.
 
 [TOC]
 
@@ -10,6 +10,6 @@
 TensorFlow provides several operators that allows the user to keep tensors
 "in-place" across run calls.
 
-*   @{tf.get_session_handle}
-*   @{tf.get_session_tensor}
-*   @{tf.delete_session_tensor}
+*   `tf.get_session_handle`
+*   `tf.get_session_tensor`
+*   `tf.delete_session_tensor`
diff --git a/tensorflow/docs_src/api_guides/python/sparse_ops.md b/tensorflow/docs_src/api_guides/python/sparse_ops.md
index 19d5fab..b360055 100644
--- a/tensorflow/docs_src/api_guides/python/sparse_ops.md
+++ b/tensorflow/docs_src/api_guides/python/sparse_ops.md
@@ -1,7 +1,7 @@
 # Sparse Tensors
 
 Note: Functions taking `Tensor` arguments can also take anything accepted by
-@{tf.convert_to_tensor}.
+`tf.convert_to_tensor`.
 
 [TOC]
 
@@ -12,34 +12,34 @@
 which is efficient for representing tensors that are sparse in their first
 dimension, and dense along all other dimensions.
 
-*   @{tf.SparseTensor}
-*   @{tf.SparseTensorValue}
+*   `tf.SparseTensor`
+*   `tf.SparseTensorValue`
 
 ## Conversion
 
-*   @{tf.sparse_to_dense}
-*   @{tf.sparse_tensor_to_dense}
-*   @{tf.sparse_to_indicator}
-*   @{tf.sparse_merge}
+*   `tf.sparse_to_dense`
+*   `tf.sparse_tensor_to_dense`
+*   `tf.sparse_to_indicator`
+*   `tf.sparse_merge`
 
 ## Manipulation
 
-*   @{tf.sparse_concat}
-*   @{tf.sparse_reorder}
-*   @{tf.sparse_reshape}
-*   @{tf.sparse_split}
-*   @{tf.sparse_retain}
-*   @{tf.sparse_reset_shape}
-*   @{tf.sparse_fill_empty_rows}
-*   @{tf.sparse_transpose}
+*   `tf.sparse_concat`
+*   `tf.sparse_reorder`
+*   `tf.sparse_reshape`
+*   `tf.sparse_split`
+*   `tf.sparse_retain`
+*   `tf.sparse_reset_shape`
+*   `tf.sparse_fill_empty_rows`
+*   `tf.sparse_transpose`
 
 ## Reduction
-*   @{tf.sparse_reduce_sum}
-*   @{tf.sparse_reduce_sum_sparse}
+*   `tf.sparse_reduce_sum`
+*   `tf.sparse_reduce_sum_sparse`
 
 ## Math Operations
-*   @{tf.sparse_add}
-*   @{tf.sparse_softmax}
-*   @{tf.sparse_tensor_dense_matmul}
-*   @{tf.sparse_maximum}
-*   @{tf.sparse_minimum}
+*   `tf.sparse_add`
+*   `tf.sparse_softmax`
+*   `tf.sparse_tensor_dense_matmul`
+*   `tf.sparse_maximum`
+*   `tf.sparse_minimum`
diff --git a/tensorflow/docs_src/api_guides/python/spectral_ops.md b/tensorflow/docs_src/api_guides/python/spectral_ops.md
index dd13802..f6d109a 100644
--- a/tensorflow/docs_src/api_guides/python/spectral_ops.md
+++ b/tensorflow/docs_src/api_guides/python/spectral_ops.md
@@ -2,25 +2,25 @@
 
 [TOC]
 
-The @{tf.spectral} module supports several spectral decomposition operations
+The `tf.spectral` module supports several spectral decomposition operations
 that you can use to transform Tensors of real and complex signals.
 
 ## Discrete Fourier Transforms
 
-*   @{tf.spectral.fft}
-*   @{tf.spectral.ifft}
-*   @{tf.spectral.fft2d}
-*   @{tf.spectral.ifft2d}
-*   @{tf.spectral.fft3d}
-*   @{tf.spectral.ifft3d}
-*   @{tf.spectral.rfft}
-*   @{tf.spectral.irfft}
-*   @{tf.spectral.rfft2d}
-*   @{tf.spectral.irfft2d}
-*   @{tf.spectral.rfft3d}
-*   @{tf.spectral.irfft3d}
+*   `tf.spectral.fft`
+*   `tf.spectral.ifft`
+*   `tf.spectral.fft2d`
+*   `tf.spectral.ifft2d`
+*   `tf.spectral.fft3d`
+*   `tf.spectral.ifft3d`
+*   `tf.spectral.rfft`
+*   `tf.spectral.irfft`
+*   `tf.spectral.rfft2d`
+*   `tf.spectral.irfft2d`
+*   `tf.spectral.rfft3d`
+*   `tf.spectral.irfft3d`
 
 ## Discrete Cosine Transforms
 
-*   @{tf.spectral.dct}
-*   @{tf.spectral.idct}
+*   `tf.spectral.dct`
+*   `tf.spectral.idct`
diff --git a/tensorflow/docs_src/api_guides/python/state_ops.md b/tensorflow/docs_src/api_guides/python/state_ops.md
index ec2d877..fc55ea1 100644
--- a/tensorflow/docs_src/api_guides/python/state_ops.md
+++ b/tensorflow/docs_src/api_guides/python/state_ops.md
@@ -1,68 +1,68 @@
 # Variables
 
 Note: Functions taking `Tensor` arguments can also take anything accepted by
-@{tf.convert_to_tensor}.
+`tf.convert_to_tensor`.
 
 [TOC]
 
 ## Variables
 
-*   @{tf.Variable}
+*   `tf.Variable`
 
 ## Variable helper functions
 
 TensorFlow provides a set of functions to help manage the set of variables
 collected in the graph.
 
-*   @{tf.global_variables}
-*   @{tf.local_variables}
-*   @{tf.model_variables}
-*   @{tf.trainable_variables}
-*   @{tf.moving_average_variables}
-*   @{tf.global_variables_initializer}
-*   @{tf.local_variables_initializer}
-*   @{tf.variables_initializer}
-*   @{tf.is_variable_initialized}
-*   @{tf.report_uninitialized_variables}
-*   @{tf.assert_variables_initialized}
-*   @{tf.assign}
-*   @{tf.assign_add}
-*   @{tf.assign_sub}
+*   `tf.global_variables`
+*   `tf.local_variables`
+*   `tf.model_variables`
+*   `tf.trainable_variables`
+*   `tf.moving_average_variables`
+*   `tf.global_variables_initializer`
+*   `tf.local_variables_initializer`
+*   `tf.variables_initializer`
+*   `tf.is_variable_initialized`
+*   `tf.report_uninitialized_variables`
+*   `tf.assert_variables_initialized`
+*   `tf.assign`
+*   `tf.assign_add`
+*   `tf.assign_sub`
 
 ## Saving and Restoring Variables
 
-*   @{tf.train.Saver}
-*   @{tf.train.latest_checkpoint}
-*   @{tf.train.get_checkpoint_state}
-*   @{tf.train.update_checkpoint_state}
+*   `tf.train.Saver`
+*   `tf.train.latest_checkpoint`
+*   `tf.train.get_checkpoint_state`
+*   `tf.train.update_checkpoint_state`
 
 ## Sharing Variables
 
 TensorFlow provides several classes and operations that you can use to
 create variables contingent on certain conditions.
 
-*   @{tf.get_variable}
-*   @{tf.get_local_variable}
-*   @{tf.VariableScope}
-*   @{tf.variable_scope}
-*   @{tf.variable_op_scope}
-*   @{tf.get_variable_scope}
-*   @{tf.make_template}
-*   @{tf.no_regularizer}
-*   @{tf.constant_initializer}
-*   @{tf.random_normal_initializer}
-*   @{tf.truncated_normal_initializer}
-*   @{tf.random_uniform_initializer}
-*   @{tf.uniform_unit_scaling_initializer}
-*   @{tf.zeros_initializer}
-*   @{tf.ones_initializer}
-*   @{tf.orthogonal_initializer}
+*   `tf.get_variable`
+*   `tf.get_local_variable`
+*   `tf.VariableScope`
+*   `tf.variable_scope`
+*   `tf.variable_op_scope`
+*   `tf.get_variable_scope`
+*   `tf.make_template`
+*   `tf.no_regularizer`
+*   `tf.constant_initializer`
+*   `tf.random_normal_initializer`
+*   `tf.truncated_normal_initializer`
+*   `tf.random_uniform_initializer`
+*   `tf.uniform_unit_scaling_initializer`
+*   `tf.zeros_initializer`
+*   `tf.ones_initializer`
+*   `tf.orthogonal_initializer`
 
 ## Variable Partitioners for Sharding
 
-*   @{tf.fixed_size_partitioner}
-*   @{tf.variable_axis_size_partitioner}
-*   @{tf.min_max_variable_partitioner}
+*   `tf.fixed_size_partitioner`
+*   `tf.variable_axis_size_partitioner`
+*   `tf.min_max_variable_partitioner`
 
 ## Sparse Variable Updates
 
@@ -73,38 +73,38 @@
 
 Since a sparse update of a large tensor may be generated automatically during
 gradient computation (as in the gradient of
-@{tf.gather}),
-an @{tf.IndexedSlices} class is provided that encapsulates a set
+`tf.gather`),
+an `tf.IndexedSlices` class is provided that encapsulates a set
 of sparse indices and values.  `IndexedSlices` objects are detected and handled
 automatically by the optimizers in most cases.
 
-*   @{tf.scatter_update}
-*   @{tf.scatter_add}
-*   @{tf.scatter_sub}
-*   @{tf.scatter_mul}
-*   @{tf.scatter_div}
-*   @{tf.scatter_min}
-*   @{tf.scatter_max}
-*   @{tf.scatter_nd_update}
-*   @{tf.scatter_nd_add}
-*   @{tf.scatter_nd_sub}
-*   @{tf.sparse_mask}
-*   @{tf.IndexedSlices}
+*   `tf.scatter_update`
+*   `tf.scatter_add`
+*   `tf.scatter_sub`
+*   `tf.scatter_mul`
+*   `tf.scatter_div`
+*   `tf.scatter_min`
+*   `tf.scatter_max`
+*   `tf.scatter_nd_update`
+*   `tf.scatter_nd_add`
+*   `tf.scatter_nd_sub`
+*   `tf.sparse_mask`
+*   `tf.IndexedSlices`
 
 ### Read-only Lookup Tables
 
-*   @{tf.initialize_all_tables}
-*   @{tf.tables_initializer}
+*   `tf.initialize_all_tables`
+*   `tf.tables_initializer`
 
 
 ## Exporting and Importing Meta Graphs
 
-*   @{tf.train.export_meta_graph}
-*   @{tf.train.import_meta_graph}
+*   `tf.train.export_meta_graph`
+*   `tf.train.import_meta_graph`
 
 # Deprecated functions (removed after 2017-03-02). Please don't use them.
 
-*   @{tf.all_variables}
-*   @{tf.initialize_all_variables}
-*   @{tf.initialize_local_variables}
-*   @{tf.initialize_variables}
+*   `tf.all_variables`
+*   `tf.initialize_all_variables`
+*   `tf.initialize_local_variables`
+*   `tf.initialize_variables`
diff --git a/tensorflow/docs_src/api_guides/python/string_ops.md b/tensorflow/docs_src/api_guides/python/string_ops.md
index e9be4f1..24a3aad 100644
--- a/tensorflow/docs_src/api_guides/python/string_ops.md
+++ b/tensorflow/docs_src/api_guides/python/string_ops.md
@@ -1,7 +1,7 @@
 # Strings
 
 Note: Functions taking `Tensor` arguments can also take anything accepted by
-@{tf.convert_to_tensor}.
+`tf.convert_to_tensor`.
 
 [TOC]
 
@@ -10,30 +10,30 @@
 String hashing ops take a string input tensor and map each element to an
 integer.
 
-*   @{tf.string_to_hash_bucket_fast}
-*   @{tf.string_to_hash_bucket_strong}
-*   @{tf.string_to_hash_bucket}
+*   `tf.string_to_hash_bucket_fast`
+*   `tf.string_to_hash_bucket_strong`
+*   `tf.string_to_hash_bucket`
 
 ## Joining
 
 String joining ops concatenate elements of input string tensors to produce a new
 string tensor.
 
-*   @{tf.reduce_join}
-*   @{tf.string_join}
+*   `tf.reduce_join`
+*   `tf.string_join`
 
 ## Splitting
 
-*   @{tf.string_split}
-*   @{tf.substr}
+*   `tf.string_split`
+*   `tf.substr`
 
 ## Conversion
 
-*   @{tf.as_string}
-*   @{tf.string_to_number}
+*   `tf.as_string`
+*   `tf.string_to_number`
 
-*   @{tf.decode_raw}
-*   @{tf.decode_csv}
+*   `tf.decode_raw`
+*   `tf.decode_csv`
 
-*   @{tf.encode_base64}
-*   @{tf.decode_base64}
+*   `tf.encode_base64`
+*   `tf.decode_base64`
diff --git a/tensorflow/docs_src/api_guides/python/summary.md b/tensorflow/docs_src/api_guides/python/summary.md
index eda119a..fc45e7b 100644
--- a/tensorflow/docs_src/api_guides/python/summary.md
+++ b/tensorflow/docs_src/api_guides/python/summary.md
@@ -2,22 +2,22 @@
 [TOC]
 
 Summaries provide a way to export condensed information about a model, which is
-then accessible in tools such as @{$summaries_and_tensorboard$TensorBoard}.
+then accessible in tools such as [TensorBoard](../../guide/summaries_and_tensorboard.md).
 
 ## Generation of Summaries
 
 ### Class for writing Summaries
-*   @{tf.summary.FileWriter}
-*   @{tf.summary.FileWriterCache}
+*   `tf.summary.FileWriter`
+*   `tf.summary.FileWriterCache`
 
 ### Summary Ops
-*   @{tf.summary.tensor_summary}
-*   @{tf.summary.scalar}
-*   @{tf.summary.histogram}
-*   @{tf.summary.audio}
-*   @{tf.summary.image}
-*   @{tf.summary.merge}
-*   @{tf.summary.merge_all}
+*   `tf.summary.tensor_summary`
+*   `tf.summary.scalar`
+*   `tf.summary.histogram`
+*   `tf.summary.audio`
+*   `tf.summary.image`
+*   `tf.summary.merge`
+*   `tf.summary.merge_all`
 
 ## Utilities
-*   @{tf.summary.get_summary_description}
+*   `tf.summary.get_summary_description`
diff --git a/tensorflow/docs_src/api_guides/python/test.md b/tensorflow/docs_src/api_guides/python/test.md
index 5dc8812..b6e0a33 100644
--- a/tensorflow/docs_src/api_guides/python/test.md
+++ b/tensorflow/docs_src/api_guides/python/test.md
@@ -23,25 +23,25 @@
 ```
 
 `tf.test.TestCase` inherits from `unittest.TestCase` but adds a few additional
-methods.  See @{tf.test.TestCase} for details.
+methods.  See `tf.test.TestCase` for details.
 
-*   @{tf.test.main}
-*   @{tf.test.TestCase}
-*   @{tf.test.test_src_dir_path}
+*   `tf.test.main`
+*   `tf.test.TestCase`
+*   `tf.test.test_src_dir_path`
 
 ## Utilities
 
 Note: `tf.test.mock` is an alias to the python `mock` or `unittest.mock`
 depending on the python version.
 
-*   @{tf.test.assert_equal_graph_def}
-*   @{tf.test.get_temp_dir}
-*   @{tf.test.is_built_with_cuda}
-*   @{tf.test.is_gpu_available}
-*   @{tf.test.gpu_device_name}
+*   `tf.test.assert_equal_graph_def`
+*   `tf.test.get_temp_dir`
+*   `tf.test.is_built_with_cuda`
+*   `tf.test.is_gpu_available`
+*   `tf.test.gpu_device_name`
 
 ## Gradient checking
 
-@{tf.test.compute_gradient} and @{tf.test.compute_gradient_error} perform
+`tf.test.compute_gradient` and `tf.test.compute_gradient_error` perform
 numerical differentiation of graphs for comparison against registered analytic
 gradients.
diff --git a/tensorflow/docs_src/api_guides/python/tfdbg.md b/tensorflow/docs_src/api_guides/python/tfdbg.md
index 2212a2d..9778cdc 100644
--- a/tensorflow/docs_src/api_guides/python/tfdbg.md
+++ b/tensorflow/docs_src/api_guides/python/tfdbg.md
@@ -8,9 +8,9 @@
 These functions help you modify `RunOptions` to specify which `Tensor`s are to
 be watched when the TensorFlow graph is executed at runtime.
 
-*   @{tfdbg.add_debug_tensor_watch}
-*   @{tfdbg.watch_graph}
-*   @{tfdbg.watch_graph_with_blacklists}
+*   `tfdbg.add_debug_tensor_watch`
+*   `tfdbg.watch_graph`
+*   `tfdbg.watch_graph_with_blacklists`
 
 
 ## Classes for debug-dump data and directories
@@ -18,13 +18,13 @@
 These classes allow you to load and inspect tensor values dumped from
 TensorFlow graphs during runtime.
 
-*   @{tfdbg.DebugTensorDatum}
-*   @{tfdbg.DebugDumpDir}
+*   `tfdbg.DebugTensorDatum`
+*   `tfdbg.DebugDumpDir`
 
 
 ## Functions for loading debug-dump data
 
-*   @{tfdbg.load_tensor_from_event_file}
+*   `tfdbg.load_tensor_from_event_file`
 
 
 ## Tensor-value predicates
@@ -32,7 +32,7 @@
 Built-in tensor-filter predicates to support conditional breakpoint between
 runs. See `DebugDumpDir.find()` for more details.
 
-*   @{tfdbg.has_inf_or_nan}
+*   `tfdbg.has_inf_or_nan`
 
 
 ## Session wrapper class and `SessionRunHook` implementations
@@ -44,7 +44,7 @@
 * generate `SessionRunHook` objects to debug `tf.contrib.learn` models (see
   `DumpingDebugHook` and `LocalCLIDebugHook`).
 
-*   @{tfdbg.DumpingDebugHook}
-*   @{tfdbg.DumpingDebugWrapperSession}
-*   @{tfdbg.LocalCLIDebugHook}
-*   @{tfdbg.LocalCLIDebugWrapperSession}
+*   `tfdbg.DumpingDebugHook`
+*   `tfdbg.DumpingDebugWrapperSession`
+*   `tfdbg.LocalCLIDebugHook`
+*   `tfdbg.LocalCLIDebugWrapperSession`
diff --git a/tensorflow/docs_src/api_guides/python/threading_and_queues.md b/tensorflow/docs_src/api_guides/python/threading_and_queues.md
index 8ad4c4c..e00f17f 100644
--- a/tensorflow/docs_src/api_guides/python/threading_and_queues.md
+++ b/tensorflow/docs_src/api_guides/python/threading_and_queues.md
@@ -3,7 +3,7 @@
 Note: In versions of TensorFlow before 1.2, we recommended using multi-threaded,
 queue-based input pipelines for performance. Beginning with TensorFlow 1.4,
 however, we recommend using the `tf.data` module instead. (See
-@{$datasets$Datasets} for details. In TensorFlow 1.2 and 1.3, the module was
+[Datasets](../../guide/datasets.md) for details. In TensorFlow 1.2 and 1.3, the module was
 called `tf.contrib.data`.) The `tf.data` module offers an easier-to-use
 interface for constructing efficient input pipelines. Furthermore, we've stopped
 developing the old multi-threaded, queue-based input pipelines.  We've retained
@@ -25,7 +25,7 @@
 TensorFlow implements several classes of queue. The principal difference between
 these classes is the order that items are removed from the queue.  To get a feel
 for queues, let's consider a simple example. We will create a "first in, first
-out" queue (@{tf.FIFOQueue}) and fill it with zeros.  Then we'll construct a
+out" queue (`tf.FIFOQueue`) and fill it with zeros.  Then we'll construct a
 graph that takes an item off the queue, adds one to that item, and puts it back
 on the end of the queue. Slowly, the numbers on the queue increase.
 
@@ -47,8 +47,8 @@
 
 ## Queue usage overview
 
-Queues, such as @{tf.FIFOQueue}
-and @{tf.RandomShuffleQueue},
+Queues, such as `tf.FIFOQueue`
+and `tf.RandomShuffleQueue`,
 are important TensorFlow objects that aid in computing tensors asynchronously
 in a graph.
 
@@ -59,11 +59,11 @@
 * A training thread executes a training op that dequeues mini-batches from the
   queue
 
-We recommend using the @{tf.data.Dataset.shuffle$`shuffle`}
-and @{tf.data.Dataset.batch$`batch`} methods of a
-@{tf.data.Dataset$`Dataset`} to accomplish this. However, if you'd prefer
+We recommend using the `tf.data.Dataset.shuffle`
+and `tf.data.Dataset.batch` methods of a
+`tf.data.Dataset` to accomplish this. However, if you'd prefer
 to use a queue-based version instead, you can find a full implementation in the
-@{tf.train.shuffle_batch} function.
+`tf.train.shuffle_batch` function.
 
 For demonstration purposes a simplified implementation is given below.
 
@@ -93,8 +93,8 @@
   return queue.dequeue_many(batch_size)
 ```
 
-Once started by @{tf.train.start_queue_runners}, or indirectly through
-@{tf.train.MonitoredSession}, the `QueueRunner` will launch the
+Once started by `tf.train.start_queue_runners`, or indirectly through
+`tf.train.MonitoredSession`, the `QueueRunner` will launch the
 threads in the background to fill the queue. Meanwhile the main thread will
 execute the `dequeue_many` op to pull data from it. Note how these ops do not
 depend on each other, except indirectly through the internal state of the queue.
@@ -126,7 +126,7 @@
 ```
 
 For most use cases, the automatic thread startup and management provided
-by @{tf.train.MonitoredSession} is sufficient. In the rare case that it is not,
+by `tf.train.MonitoredSession` is sufficient. In the rare case that it is not,
 TensorFlow provides tools for manually managing your threads and queues.
 
 ## Manual Thread Management
@@ -139,8 +139,8 @@
 reported, and queues must be properly closed when stopping.
 
 TensorFlow provides two classes to help:
-@{tf.train.Coordinator} and
-@{tf.train.QueueRunner}. These two classes
+`tf.train.Coordinator` and
+`tf.train.QueueRunner`. These two classes
 are designed to be used together. The `Coordinator` class helps multiple threads
 stop together and report exceptions to a program that waits for them to stop.
 The `QueueRunner` class is used to create a number of threads cooperating to
@@ -148,14 +148,14 @@
 
 ### Coordinator
 
-The @{tf.train.Coordinator} class manages background threads in a TensorFlow
+The `tf.train.Coordinator` class manages background threads in a TensorFlow
 program and helps multiple threads stop together.
 
 Its key methods are:
 
-* @{tf.train.Coordinator.should_stop}: returns `True` if the threads should stop.
-* @{tf.train.Coordinator.request_stop}: requests that threads should stop.
-* @{tf.train.Coordinator.join}: waits until the specified threads have stopped.
+* `tf.train.Coordinator.should_stop`: returns `True` if the threads should stop.
+* `tf.train.Coordinator.request_stop`: requests that threads should stop.
+* `tf.train.Coordinator.join`: waits until the specified threads have stopped.
 
 You first create a `Coordinator` object, and then create a number of threads
 that use the coordinator.  The threads typically run loops that stop when
@@ -191,11 +191,11 @@
 
 Obviously, the coordinator can manage threads doing very different things.
 They don't have to be all the same as in the example above.  The coordinator
-also has support to capture and report exceptions.  See the @{tf.train.Coordinator} documentation for more details.
+also has support to capture and report exceptions.  See the `tf.train.Coordinator` documentation for more details.
 
 ### QueueRunner
 
-The @{tf.train.QueueRunner} class creates a number of threads that repeatedly
+The `tf.train.QueueRunner` class creates a number of threads that repeatedly
 run an enqueue op.  These threads can use a coordinator to stop together.  In
 addition, a queue runner will run a *closer operation* that closes the queue if
 an exception is reported to the coordinator.
diff --git a/tensorflow/docs_src/api_guides/python/train.md b/tensorflow/docs_src/api_guides/python/train.md
index cbc5052..4b4c6a4 100644
--- a/tensorflow/docs_src/api_guides/python/train.md
+++ b/tensorflow/docs_src/api_guides/python/train.md
@@ -1,7 +1,7 @@
 # Training
 [TOC]
 
-@{tf.train} provides a set of classes and functions that help train models.
+`tf.train` provides a set of classes and functions that help train models.
 
 ## Optimizers
 
@@ -12,19 +12,19 @@
 You never instantiate the Optimizer class itself, but instead instantiate one
 of the subclasses.
 
-*   @{tf.train.Optimizer}
-*   @{tf.train.GradientDescentOptimizer}
-*   @{tf.train.AdadeltaOptimizer}
-*   @{tf.train.AdagradOptimizer}
-*   @{tf.train.AdagradDAOptimizer}
-*   @{tf.train.MomentumOptimizer}
-*   @{tf.train.AdamOptimizer}
-*   @{tf.train.FtrlOptimizer}
-*   @{tf.train.ProximalGradientDescentOptimizer}
-*   @{tf.train.ProximalAdagradOptimizer}
-*   @{tf.train.RMSPropOptimizer}
+*   `tf.train.Optimizer`
+*   `tf.train.GradientDescentOptimizer`
+*   `tf.train.AdadeltaOptimizer`
+*   `tf.train.AdagradOptimizer`
+*   `tf.train.AdagradDAOptimizer`
+*   `tf.train.MomentumOptimizer`
+*   `tf.train.AdamOptimizer`
+*   `tf.train.FtrlOptimizer`
+*   `tf.train.ProximalGradientDescentOptimizer`
+*   `tf.train.ProximalAdagradOptimizer`
+*   `tf.train.RMSPropOptimizer`
 
-See @{tf.contrib.opt} for more optimizers.
+See `tf.contrib.opt` for more optimizers.
 
 ## Gradient Computation
 
@@ -34,10 +34,10 @@
 creators of new Optimizers or expert users can call the lower-level
 functions below.
 
-*   @{tf.gradients}
-*   @{tf.AggregationMethod}
-*   @{tf.stop_gradient}
-*   @{tf.hessians}
+*   `tf.gradients`
+*   `tf.AggregationMethod`
+*   `tf.stop_gradient`
+*   `tf.hessians`
 
 
 ## Gradient Clipping
@@ -47,22 +47,22 @@
 clipping, but they're particularly useful for handling exploding or vanishing
 gradients.
 
-*   @{tf.clip_by_value}
-*   @{tf.clip_by_norm}
-*   @{tf.clip_by_average_norm}
-*   @{tf.clip_by_global_norm}
-*   @{tf.global_norm}
+*   `tf.clip_by_value`
+*   `tf.clip_by_norm`
+*   `tf.clip_by_average_norm`
+*   `tf.clip_by_global_norm`
+*   `tf.global_norm`
 
 ## Decaying the learning rate
 
-*   @{tf.train.exponential_decay}
-*   @{tf.train.inverse_time_decay}
-*   @{tf.train.natural_exp_decay}
-*   @{tf.train.piecewise_constant}
-*   @{tf.train.polynomial_decay}
-*   @{tf.train.cosine_decay}
-*   @{tf.train.linear_cosine_decay}
-*   @{tf.train.noisy_linear_cosine_decay}
+*   `tf.train.exponential_decay`
+*   `tf.train.inverse_time_decay`
+*   `tf.train.natural_exp_decay`
+*   `tf.train.piecewise_constant`
+*   `tf.train.polynomial_decay`
+*   `tf.train.cosine_decay`
+*   `tf.train.linear_cosine_decay`
+*   `tf.train.noisy_linear_cosine_decay`
 
 ## Moving Averages
 
@@ -70,70 +70,70 @@
 from maintaining a moving average of variables during optimization.  Using the
 moving averages for evaluations often improve results significantly.
 
-*   @{tf.train.ExponentialMovingAverage}
+*   `tf.train.ExponentialMovingAverage`
 
 ## Coordinator and QueueRunner
 
-See @{$threading_and_queues$Threading and Queues}
+See [Threading and Queues](../../api_guides/python/threading_and_queues.md)
 for how to use threads and queues.  For documentation on the Queue API,
-see @{$python/io_ops#queues$Queues}.
+see [Queues](../../api_guides/python/io_ops.md#queues).
 
 
-*   @{tf.train.Coordinator}
-*   @{tf.train.QueueRunner}
-*   @{tf.train.LooperThread}
-*   @{tf.train.add_queue_runner}
-*   @{tf.train.start_queue_runners}
+*   `tf.train.Coordinator`
+*   `tf.train.QueueRunner`
+*   `tf.train.LooperThread`
+*   `tf.train.add_queue_runner`
+*   `tf.train.start_queue_runners`
 
 ## Distributed execution
 
-See @{$distributed$Distributed TensorFlow} for
+See [Distributed TensorFlow](../../deploy/distributed.md) for
 more information about how to configure a distributed TensorFlow program.
 
-*   @{tf.train.Server}
-*   @{tf.train.Supervisor}
-*   @{tf.train.SessionManager}
-*   @{tf.train.ClusterSpec}
-*   @{tf.train.replica_device_setter}
-*   @{tf.train.MonitoredTrainingSession}
-*   @{tf.train.MonitoredSession}
-*   @{tf.train.SingularMonitoredSession}
-*   @{tf.train.Scaffold}
-*   @{tf.train.SessionCreator}
-*   @{tf.train.ChiefSessionCreator}
-*   @{tf.train.WorkerSessionCreator}
+*   `tf.train.Server`
+*   `tf.train.Supervisor`
+*   `tf.train.SessionManager`
+*   `tf.train.ClusterSpec`
+*   `tf.train.replica_device_setter`
+*   `tf.train.MonitoredTrainingSession`
+*   `tf.train.MonitoredSession`
+*   `tf.train.SingularMonitoredSession`
+*   `tf.train.Scaffold`
+*   `tf.train.SessionCreator`
+*   `tf.train.ChiefSessionCreator`
+*   `tf.train.WorkerSessionCreator`
 
 ## Reading Summaries from Event Files
 
-See @{$summaries_and_tensorboard$Summaries and TensorBoard} for an
+See [Summaries and TensorBoard](../../guide/summaries_and_tensorboard.md) for an
 overview of summaries, event files, and visualization in TensorBoard.
 
-*   @{tf.train.summary_iterator}
+*   `tf.train.summary_iterator`
 
 ## Training Hooks
 
 Hooks are tools that run in the process of training/evaluation of the model.
 
-*   @{tf.train.SessionRunHook}
-*   @{tf.train.SessionRunArgs}
-*   @{tf.train.SessionRunContext}
-*   @{tf.train.SessionRunValues}
-*   @{tf.train.LoggingTensorHook}
-*   @{tf.train.StopAtStepHook}
-*   @{tf.train.CheckpointSaverHook}
-*   @{tf.train.NewCheckpointReader}
-*   @{tf.train.StepCounterHook}
-*   @{tf.train.NanLossDuringTrainingError}
-*   @{tf.train.NanTensorHook}
-*   @{tf.train.SummarySaverHook}
-*   @{tf.train.GlobalStepWaiterHook}
-*   @{tf.train.FinalOpsHook}
-*   @{tf.train.FeedFnHook}
+*   `tf.train.SessionRunHook`
+*   `tf.train.SessionRunArgs`
+*   `tf.train.SessionRunContext`
+*   `tf.train.SessionRunValues`
+*   `tf.train.LoggingTensorHook`
+*   `tf.train.StopAtStepHook`
+*   `tf.train.CheckpointSaverHook`
+*   `tf.train.NewCheckpointReader`
+*   `tf.train.StepCounterHook`
+*   `tf.train.NanLossDuringTrainingError`
+*   `tf.train.NanTensorHook`
+*   `tf.train.SummarySaverHook`
+*   `tf.train.GlobalStepWaiterHook`
+*   `tf.train.FinalOpsHook`
+*   `tf.train.FeedFnHook`
 
 ## Training Utilities
 
-*   @{tf.train.global_step}
-*   @{tf.train.basic_train_loop}
-*   @{tf.train.get_global_step}
-*   @{tf.train.assert_global_step}
-*   @{tf.train.write_graph}
+*   `tf.train.global_step`
+*   `tf.train.basic_train_loop`
+*   `tf.train.get_global_step`
+*   `tf.train.assert_global_step`
+*   `tf.train.write_graph`
diff --git a/tensorflow/docs_src/community/contributing.md b/tensorflow/docs_src/community/contributing.md
index afbb8bb..ece4a7c 100644
--- a/tensorflow/docs_src/community/contributing.md
+++ b/tensorflow/docs_src/community/contributing.md
@@ -25,12 +25,12 @@
 [developers@tensorflow.org](https://groups.google.com/a/tensorflow.org/d/forum/developers)
 mailing list, to coordinate and discuss with others contributing to TensorFlow.
 
-* For coding style conventions, read the @{$style_guide$TensorFlow Style Guide}.
+* For coding style conventions, read the [TensorFlow Style Guide](../community/style_guide.md).
 
-* Finally, review @{$documentation$Writing TensorFlow Documentation}, which
+* Finally, review [Writing TensorFlow Documentation](../community/documentation.md), which
   explains documentation conventions.
 
-You may also wish to review our guide to @{$benchmarks$defining and running benchmarks}.
+You may also wish to review our guide to [defining and running benchmarks](../community/benchmarks.md).
 
 ## Special Interest Groups
 
diff --git a/tensorflow/docs_src/community/index.md b/tensorflow/docs_src/community/index.md
index eec2e51..1a30be3 100644
--- a/tensorflow/docs_src/community/index.md
+++ b/tensorflow/docs_src/community/index.md
@@ -25,10 +25,10 @@
   
 ### Security
 
-Before using TensorFlow, please take a look at our security model, list of
-recent security announcements, and ways you can report security issues to the
-TensorFlow team at the
-[Using TensorFlow Securely](https://github.com/tensorflow/tensorflow/blob/master/SECURITY.md) page on GitHub.
+Before using TensorFlow, please take a look at our [security model](https://github.com/tensorflow/tensorflow/blob/master/SECURITY.md#tensorflow-models-are-programs),
+[list of recent security advisories and announcements](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/security/index.md),
+and [ways you can report security issues](https://github.com/tensorflow/tensorflow/blob/master/SECURITY.md#reporting-vulnerabilities)
+to the TensorFlow team at the [Using TensorFlow Securely](https://github.com/tensorflow/tensorflow/blob/master/SECURITY.md) page on GitHub.
 
 ## Stay Informed
 
@@ -40,7 +40,7 @@
 
 ### Development Roadmap
 
-The @{$roadmap$Roadmap} summarizes plans for upcoming additions to TensorFlow.
+The [Roadmap](../community/roadmap.md) summarizes plans for upcoming additions to TensorFlow.
 
 ### Social Media
 
@@ -54,7 +54,7 @@
 
 ### YouTube
 
-Our [YouTube Channel](http://youtube.com/tensorflow/) focuses on machine learing
+Our [YouTube Channel](http://youtube.com/tensorflow/) focuses on machine learning
 and AI with TensorFlow. On it we have a number of new shows, including:
 
 - TensorFlow Meets: meet with community contributors to learn and share what they're doing
@@ -70,12 +70,12 @@
 list](https://groups.google.com/a/tensorflow.org/d/forum/discuss).
 
 A number of other mailing lists exist, focused on different project areas, which
-can be found at @{$lists$TensorFlow Mailing Lists}.
+can be found at [TensorFlow Mailing Lists](../community/lists.md).
 
 ### User Groups
 
 To meet with like-minded people local to you, check out the many
-@{$groups$TensorFlow user groups} around the world.
+[TensorFlow user groups](../community/groups.md) around the world.
 
 
 ## Contributing To TensorFlow
diff --git a/tensorflow/docs_src/community/lists.md b/tensorflow/docs_src/community/lists.md
index 7450ab3..bc2f573 100644
--- a/tensorflow/docs_src/community/lists.md
+++ b/tensorflow/docs_src/community/lists.md
@@ -32,6 +32,8 @@
   and peer support for TensorFlow.js.
 * [tflite](https://groups.google.com/a/tensorflow.org/d/forum/tflite) - Discussion and
   peer support for TensorFlow Lite.
+* [tfprobability](https://groups.google.com/a/tensorflow.org/d/forum/tfprobability) - Discussion and
+  peer support for TensorFlow Probability.
 * [tpu-users](https://groups.google.com/a/tensorflow.org/d/forum/tpu-users) - Community discussion
   and support for TPU users.
 
diff --git a/tensorflow/docs_src/community/style_guide.md b/tensorflow/docs_src/community/style_guide.md
index c926879..c78da20 100644
--- a/tensorflow/docs_src/community/style_guide.md
+++ b/tensorflow/docs_src/community/style_guide.md
@@ -47,27 +47,7 @@
 exports_files(["LICENSE"])
 ```
 
-* At the end of every BUILD file, should contain:
 
-```
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-```
-
-* When adding new BUILD file, add this line to `tensorflow/BUILD` file into `all_opensource_files` target.
-
-```
-"//tensorflow/<directory>:all_files",
-```
 
 * For all Python BUILD targets (libraries and tests) add next line:
 
@@ -80,6 +60,9 @@
 
 * Operations that deal with batches may assume that the first dimension of a Tensor is the batch dimension.
 
+* In most models the *last dimension* is the number of channels.
+
+* Dimensions excluding the first and last usually make up the "space" dimensions: sequence length or image size.
 
 ## Python operations
 
@@ -105,7 +88,7 @@
 * Operations should contain an extensive Python comment with Args and Returns
  declarations that explain both the type and meaning of each value. Possible
  shapes, dtypes, or ranks should be specified in the description.
- @{$documentation$See documentation details}
+ [See documentation details](../community/documentation.md)
 
 * For increased usability include an example of usage with inputs / outputs
  of the op in Example section.
@@ -148,37 +131,6 @@
 
 ## Layers
 
-A *Layer* is a Python operation that combines variable creation and/or one or many
-other graph operations. Follow the same requirements as for regular Python
-operation.
+Use `tf.keras.layers`, not `tf.layers`.
 
-* If a layer creates one or more variables, the layer function
- should take next arguments also following order:
-  - `initializers`: Optionally allow to specify initializers for the variables.
-  - `regularizers`: Optionally allow to specify regularizers for the variables.
-  - `trainable`: which control if their variables are trainable or not.
-  - `scope`: `VariableScope` object that variable will be put under.
-  - `reuse`: `bool` indicator if the variable should be reused if
-             it's present in the scope.
-
-* Layers that behave differently during training should take:
-  - `is_training`: `bool` indicator to conditionally choose different
-                   computation paths (e.g. using `tf.cond`) during execution.
-
-Example:
-
-    def conv2d(inputs,
-               num_filters_out,
-               kernel_size,
-               stride=1,
-               padding='SAME',
-               activation_fn=tf.nn.relu,
-               normalization_fn=add_bias,
-               normalization_params=None,
-               initializers=None,
-               regularizers=None,
-               trainable=True,
-               scope=None,
-               reuse=None):
-      ... see implementation at tensorflow/contrib/layers/python/layers/layers.py ...
-
+See `tf.keras.layers` and [the Keras guide](../guide/keras.md#custom_layers) for details on how to subclass layers.
diff --git a/tensorflow/docs_src/deploy/distributed.md b/tensorflow/docs_src/deploy/distributed.md
index fc3a606..2fba36c 100644
--- a/tensorflow/docs_src/deploy/distributed.md
+++ b/tensorflow/docs_src/deploy/distributed.md
@@ -2,7 +2,7 @@
 
 This document shows how to create a cluster of TensorFlow servers, and how to
 distribute a computation graph across that cluster. We assume that you are
-familiar with the @{$guide/low_level_intro$basic concepts} of
+familiar with the [basic concepts](../guide/low_level_intro.md) of
 writing low level TensorFlow programs.
 
 ## Hello distributed TensorFlow!
@@ -21,7 +21,7 @@
 ```
 
 The
-@{tf.train.Server.create_local_server}
+`tf.train.Server.create_local_server`
 method creates a single-process cluster, with an in-process server.
 
 ## Create a cluster
@@ -55,7 +55,7 @@
 
 The cluster specification dictionary maps job names to lists of network
 addresses. Pass this dictionary to
-the @{tf.train.ClusterSpec}
+the `tf.train.ClusterSpec`
 constructor.  For example:
 
 <table>
@@ -84,10 +84,10 @@
 
 ### Create a `tf.train.Server` instance in each task
 
-A @{tf.train.Server} object contains a
+A `tf.train.Server` object contains a
 set of local devices, a set of connections to other tasks in its
 `tf.train.ClusterSpec`, and a
-@{tf.Session} that can use these
+`tf.Session` that can use these
 to perform a distributed computation. Each server is a member of a specific
 named job and has a task index within that job.  A server can communicate with
 any other server in the cluster.
@@ -117,7 +117,7 @@
 ## Specifying distributed devices in your model
 
 To place operations on a particular process, you can use the same
-@{tf.device}
+`tf.device`
 function that is used to specify whether ops run on the CPU or GPU. For example:
 
 ```python
@@ -165,7 +165,7 @@
   for each `/job:worker` task, typically in the same process as the worker
   task. Each client builds a similar graph containing the parameters (pinned to
   `/job:ps` as before using
-  @{tf.train.replica_device_setter}
+  `tf.train.replica_device_setter`
   to map them deterministically to the same tasks); and a single copy of the
   compute-intensive part of the model, pinned to the local task in
   `/job:worker`.
@@ -180,7 +180,7 @@
   gradient averaging as in the
   [CIFAR-10 multi-GPU trainer](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10/cifar10_multi_gpu_train.py)),
   and between-graph replication (e.g. using the
-  @{tf.train.SyncReplicasOptimizer}).
+  `tf.train.SyncReplicasOptimizer`).
 
 ### Putting it all together: example trainer program
 
@@ -318,7 +318,7 @@
 one or more "tasks". A cluster is typically dedicated to a particular high-level
 objective, such as training a neural network, using many machines in parallel. A
 cluster is defined by
-a @{tf.train.ClusterSpec} object.
+a `tf.train.ClusterSpec` object.
 
 **Job**
 
@@ -344,7 +344,7 @@
 its index within that job's list of tasks.
 
 **TensorFlow server** A process running
-a @{tf.train.Server} instance, which is
+a `tf.train.Server` instance, which is
 a member of a cluster, and exports a "master service" and "worker service".
 
 **Worker service**
diff --git a/tensorflow/docs_src/deploy/hadoop.md b/tensorflow/docs_src/deploy/hadoop.md
index c447156..b0d416d 100644
--- a/tensorflow/docs_src/deploy/hadoop.md
+++ b/tensorflow/docs_src/deploy/hadoop.md
@@ -6,7 +6,7 @@
 
 ## HDFS
 
-We assume that you are familiar with @{$reading_data$reading data}.
+We assume that you are familiar with [reading data](../api_guides/python/reading_data.md).
 
 To use HDFS with TensorFlow, change the file paths you use to read and write
 data to an HDFS path. For example:
@@ -61,5 +61,5 @@
     export KRB5CCNAME=/tmp/krb5cc_10002
     ```
 
-If you are running @{$distributed$Distributed TensorFlow}, then all
+If you are running [Distributed TensorFlow](../deploy/distributed.md), then all
 workers must have the environment variables set and Hadoop installed.
diff --git a/tensorflow/docs_src/deploy/index.md b/tensorflow/docs_src/deploy/index.md
index 3322004..08b28de 100644
--- a/tensorflow/docs_src/deploy/index.md
+++ b/tensorflow/docs_src/deploy/index.md
@@ -3,11 +3,11 @@
 This section focuses on deploying real-world models.  It contains
 the following documents:
 
-  * @{$distributed$Distributed TensorFlow}, which explains how to create
+  * [Distributed TensorFlow](../deploy/distributed.md), which explains how to create
     a cluster of TensorFlow servers.
-  * @{$hadoop$How to run TensorFlow on Hadoop}, which has a highly
+  * [How to run TensorFlow on Hadoop](../deploy/hadoop.md), which has a highly
     self-explanatory title.
-  * @{$s3$How to run TensorFlow with the S3 filesystem}, which explains how
+  * [How to run TensorFlow with the S3 filesystem](../deploy/s3.md), which explains how
     to run TensorFlow with the S3 file system.
   * The entire document set for [TensorFlow serving](/serving), an open-source,
     flexible, high-performance serving system for machine-learned models
diff --git a/tensorflow/docs_src/deploy/s3.md b/tensorflow/docs_src/deploy/s3.md
index 7028249..b4a759d 100644
--- a/tensorflow/docs_src/deploy/s3.md
+++ b/tensorflow/docs_src/deploy/s3.md
@@ -40,7 +40,7 @@
 AWS_REGION=us-east-1                    # Region for the S3 bucket, this is not always needed. Default is us-east-1.
 S3_ENDPOINT=s3.us-east-1.amazonaws.com  # The S3 API Endpoint to connect to. This is specified in a HOST:PORT format.
 S3_USE_HTTPS=1                          # Whether or not to use HTTPS. Disable with 0.
-S3_VERIFY_SSL=1                         # If HTTPS is used, conterols if SSL should be enabled. Disable with 0.
+S3_VERIFY_SSL=1                         # If HTTPS is used, controls if SSL should be enabled. Disable with 0.
 ```
 
 ## Usage
@@ -64,7 +64,7 @@
 
 ### Reading Data
 
-When @{$reading_data$reading data}, change the file paths you use to read and write
+When [reading data](../api_guides/python/reading_data.md), change the file paths you use to read and write
 data to an S3 path. For example:
 
 ```python
diff --git a/tensorflow/docs_src/extend/add_filesys.md b/tensorflow/docs_src/extend/add_filesys.md
index bc0f662..5f8ac64 100644
--- a/tensorflow/docs_src/extend/add_filesys.md
+++ b/tensorflow/docs_src/extend/add_filesys.md
@@ -225,7 +225,7 @@
 Next, you must build a shared object containing this implementation. An example
 of doing so using bazel's `cc_binary` rule can be found
 [here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/BUILD#L244),
-but you may use any build system to do so. See the section on @{$adding_an_op#build_the_op_library$building the op library} for similar
+but you may use any build system to do so. See the section on [building the op library](../extend/adding_an_op.md#build_the_op_library) for similar
 instructions.
 
 The result of building this target is a `.so` shared object file.
diff --git a/tensorflow/docs_src/extend/adding_an_op.md b/tensorflow/docs_src/extend/adding_an_op.md
index 1b028be..cc25ab9 100644
--- a/tensorflow/docs_src/extend/adding_an_op.md
+++ b/tensorflow/docs_src/extend/adding_an_op.md
@@ -46,7 +46,7 @@
 4.  Write a function to compute gradients for the op (optional).
 5.  Test the op. We usually do this in Python for convenience, but you can also
     test the op in C++. If you define gradients, you can verify them with the
-    Python @{tf.test.compute_gradient_error$gradient checker}.
+    Python `tf.test.compute_gradient_error` gradient checker.
     See
     [`relu_op_test.py`](https://www.tensorflow.org/code/tensorflow/python/kernel_tests/relu_op_test.py) as
     an example that tests the forward functions of Relu-like operators and
@@ -56,8 +56,8 @@
 
 *   Some familiarity with C++.
 *   Must have installed the
-    @{$install$TensorFlow binary}, or must have
-    @{$install_sources$downloaded TensorFlow source},
+    [TensorFlow binary](../install/index.md), or must have
+    [downloaded TensorFlow source](../install/install_sources.md),
     and be able to build it.
 
 [TOC]
@@ -388,7 +388,7 @@
 ## Use the op in Python
 
 TensorFlow Python API provides the
-@{tf.load_op_library} function to
+`tf.load_op_library` function to
 load the dynamic library and register the op with the TensorFlow
 framework. `load_op_library` returns a Python module that contains the Python
 wrappers for the op and the kernel. Thus, once you have built the op, you can
@@ -538,7 +538,7 @@
 ```
 
 (Note that the set of [attribute types](#attr_types) is different from the
-@{tf.DType$tensor types} used for inputs and outputs.)
+`tf.DType` tensor types used for inputs and outputs.)
 
 Your kernel can then access this attr in its constructor via the `context`
 parameter:
@@ -615,7 +615,7 @@
 
 * `{<type1>, <type2>}`: The value is of type `type`, and must be one of
   `<type1>` or `<type2>`, where `<type1>` and `<type2>` are supported
-  @{tf.DType$tensor types}.  You don't specify
+  `tf.DType` values.  You don't specify
   that the type of the attr is `type`. This is implied when you have a list of
   types in `{...}`.  For example, in this case the attr `t` is a type that must
   be an `int32`, a `float`, or a `bool`:
@@ -649,7 +649,7 @@
     ```
 
     Lists can be combined with other lists and single types.  The following
-    op allows attr `t` to be any of the numberic types, or the bool type:
+    op allows attr `t` to be any of the numeric types, or the bool type:
 
     ```c++
     REGISTER_OP("NumberOrBooleanType")
@@ -714,7 +714,7 @@
 ```
 
 Note in particular that the values of type `type`
-use @{tf.DType$the `DT_*` names for the types}.
+use the `DT_*` names from `tf.DType`.
 
 #### Polymorphism
 
@@ -1056,7 +1056,7 @@
   `string`). This specifies a single tensor of the given type.
 
   See
-  @{tf.DType$the list of supported Tensor types}.
+  `tf.DType`.
 
   ```c++
   REGISTER_OP("BuiltInTypesExample")
@@ -1098,8 +1098,7 @@
 
 * For a sequence of tensors with the same type: `<number> * <type>`, where
   `<number>` is the name of an [Attr](#attrs) with type `int`.  The `<type>` can
-  either be
-  @{tf.DType$a specific type like `int32` or `float`},
+  either be a `tf.DType`,
   or the name of an attr with type `type`.  As an example of the first, this
   op accepts a list of `int32` tensors:
 
@@ -1141,7 +1140,7 @@
 backwards-compatible: changing the specification of an op must not break prior
 serialized `GraphDef` protocol buffers constructed from older specifications.
 The details of `GraphDef` compatibility are
-@{$version_compat#compatibility_of_graphs_and_checkpoints$described here}.
+[described here](../guide/version_compat.md#compatibility_of_graphs_and_checkpoints).
 
 There are several ways to preserve backwards-compatibility.
 
@@ -1191,7 +1190,7 @@
 hand-written Python wrapper, by keeping the old signature except possibly adding
 new optional arguments to the end.  Generally incompatible changes may only be
 made when TensorFlow's changes major versions, and must conform to the
-@{$version_compat#compatibility_of_graphs_and_checkpoints$`GraphDef` version semantics}.
+[`GraphDef` version semantics](../guide/version_compat.md#compatibility_of_graphs_and_checkpoints).
 
 ### GPU Support
 
@@ -1202,7 +1201,7 @@
 Notice some kernels have a CPU version in a `.cc` file, a GPU version in a file
 ending in `_gpu.cu.cc`, and some code shared in common in a `.h` file.
 
-For example, the @{tf.pad} has
+For example, the `tf.pad` has
 everything but the GPU kernel in [`tensorflow/core/kernels/pad_op.cc`][pad_op].
 The GPU kernel is in
 [`tensorflow/core/kernels/pad_op_gpu.cu.cc`](https://www.tensorflow.org/code/tensorflow/core/kernels/pad_op_gpu.cu.cc),
@@ -1263,7 +1262,7 @@
 Given a graph of ops, TensorFlow uses automatic differentiation
 (backpropagation) to add new ops representing gradients with respect to the
 existing ops (see
-@{$python/train#gradient_computation$Gradient Computation}).
+[Gradient Computation](../api_guides/python/train.md#gradient_computation)).
 To make automatic differentiation work for new ops, you must register a gradient
 function which computes gradients with respect to the ops' inputs given
 gradients with respect to the ops' outputs.
@@ -1307,16 +1306,16 @@
 ```
 
 Details about registering gradient functions with
-@{tf.RegisterGradient}:
+`tf.RegisterGradient`:
 
 * For an op with one output, the gradient function will take an
-  @{tf.Operation} `op` and a
-  @{tf.Tensor} `grad` and build new ops
+  `tf.Operation` `op` and a
+  `tf.Tensor` `grad` and build new ops
   out of the tensors
   [`op.inputs[i]`](../../api_docs/python/framework.md#Operation.inputs),
   [`op.outputs[i]`](../../api_docs/python/framework.md#Operation.outputs), and `grad`.  Information
   about any attrs can be found via
-  @{tf.Operation.get_attr}.
+  `tf.Operation.get_attr`.
 
 * If the op has multiple outputs, the gradient function will take `op` and
   `grads`, where `grads` is a list of gradients with respect to each output.
diff --git a/tensorflow/docs_src/extend/architecture.md b/tensorflow/docs_src/extend/architecture.md
index 84435a5..eb33336 100644
--- a/tensorflow/docs_src/extend/architecture.md
+++ b/tensorflow/docs_src/extend/architecture.md
@@ -7,8 +7,8 @@
 This document describes the system architecture that makes this
 combination of scale and flexibility possible. It assumes that you have basic familiarity
 with TensorFlow programming concepts such as the computation graph, operations,
-and sessions. See @{$guide/low_level_intro$this document} for an introduction to
-these topics. Some familiarity with @{$distributed$distributed TensorFlow}
+and sessions. See [this document](../guide/low_level_intro.md) for an introduction to
+these topics. Some familiarity with [distributed TensorFlow](../deploy/distributed.md)
 will also be helpful.
 
 This document is for developers who want to extend TensorFlow in some way not
@@ -81,7 +81,7 @@
 still Python-only, but C++ does have support for efficient inference.
 
 The client creates a session, which sends the graph definition to the
-distributed master as a @{tf.GraphDef}
+distributed master as a `tf.GraphDef`
 protocol buffer. When the client evaluates a node or nodes in the
 graph, the evaluation triggers a call to the distributed master to initiate
 computation.
@@ -96,7 +96,7 @@
 
 ### Code
 
-*  @{tf.Session}
+*  `tf.Session`
 
 ## Distributed master
 
@@ -199,7 +199,7 @@
 C++ templates to generate efficient parallel code for multicore CPUs and GPUs;
 however, we liberally use libraries like cuDNN where a more efficient kernel
 implementation is possible. We have also implemented
-@{$quantization$quantization}, which enables
+[quantization](../performance/quantization.md), which enables
 faster inference in environments such as mobile devices and high-throughput
 datacenter applications, and use the
 [gemmlowp](https://github.com/google/gemmlowp) low-precision matrix library to
@@ -209,7 +209,7 @@
 of operations, users can register additional kernels that provide an efficient
 implementation written in C++. For example, we recommend registering your own
 fused kernels for some performance critical operations, such as the ReLU and
-Sigmoid activation functions and their corresponding gradients. The @{$xla$XLA Compiler} has an
+Sigmoid activation functions and their corresponding gradients. The [XLA Compiler](../performance/xla/index.md) has an
 experimental implementation of automatic kernel fusion.
 
 ### Code
diff --git a/tensorflow/docs_src/extend/index.md b/tensorflow/docs_src/extend/index.md
index d48340a..bbf4a81 100644
--- a/tensorflow/docs_src/extend/index.md
+++ b/tensorflow/docs_src/extend/index.md
@@ -3,32 +3,32 @@
 This section explains how developers can add functionality to TensorFlow's
 capabilities. Begin by reading the following architectural overview:
 
-  * @{$architecture$TensorFlow Architecture}
+  * [TensorFlow Architecture](../extend/architecture.md)
 
 The following guides explain how to extend particular aspects of
 TensorFlow:
 
-  * @{$adding_an_op$Adding a New Op}, which explains how to create your own
+  * [Adding a New Op](../extend/adding_an_op.md), which explains how to create your own
     operations.
-  * @{$add_filesys$Adding a Custom Filesystem Plugin}, which explains how to
+  * [Adding a Custom Filesystem Plugin](../extend/add_filesys.md), which explains how to
     add support for your own shared or distributed filesystem.
-  * @{$new_data_formats$Custom Data Readers}, which details how to add support
+  * [Custom Data Readers](../extend/new_data_formats.md), which details how to add support
     for your own file and record formats.
 
 Python is currently the only language supported by TensorFlow's API stability
 promises. However, TensorFlow also provides functionality in C++, Go, Java and
-[JavaScript](https://js.tensorflow.org) (incuding
+[JavaScript](https://js.tensorflow.org) (including
 [Node.js](https://github.com/tensorflow/tfjs-node)),
 plus community support for [Haskell](https://github.com/tensorflow/haskell) and
 [Rust](https://github.com/tensorflow/rust). If you'd like to create or
 develop TensorFlow features in a language other than these languages, read the
 following guide:
 
-  * @{$language_bindings$TensorFlow in Other Languages}
+  * [TensorFlow in Other Languages](../extend/language_bindings.md)
 
 To create tools compatible with TensorFlow's model format, read the following
 guide:
 
-  * @{$tool_developers$A Tool Developer's Guide to TensorFlow Model Files}
+  * [A Tool Developer's Guide to TensorFlow Model Files](../extend/tool_developers/index.md)
 
 
diff --git a/tensorflow/docs_src/extend/language_bindings.md b/tensorflow/docs_src/extend/language_bindings.md
index 9a968d3..4727eab 100644
--- a/tensorflow/docs_src/extend/language_bindings.md
+++ b/tensorflow/docs_src/extend/language_bindings.md
@@ -125,7 +125,7 @@
     instead of CamelCase for the op's function name.
 -   A list of inputs and outputs. The types for these may be polymorphic by
     referencing attributes, as described in the inputs and outputs section of
-    @{$adding_an_op$Adding an     op}.
+    [Adding an op](../extend/adding_an_op.md).
 -   A list of attributes, along with their default values (if any). Note that
     some of these will be inferred (if they are determined by an input), some
     will be optional (if they have a default), and some will be required (no
diff --git a/tensorflow/docs_src/extend/new_data_formats.md b/tensorflow/docs_src/extend/new_data_formats.md
index abbf479..7ca50c9 100644
--- a/tensorflow/docs_src/extend/new_data_formats.md
+++ b/tensorflow/docs_src/extend/new_data_formats.md
@@ -4,7 +4,7 @@
 
 *   Some familiarity with C++.
 *   Must have
-    @{$install_sources$downloaded TensorFlow source}, and be
+    [downloaded TensorFlow source](../install/install_sources.md), and be
     able to build it.
 
 We divide the task of supporting a file format into two pieces:
@@ -15,25 +15,24 @@
 *   Record formats: We use decoder or parsing ops to turn a string record
     into tensors usable by TensorFlow.
 
-For example, to read a
-[CSV file](https://en.wikipedia.org/wiki/Comma-separated_values), we use
-@{tf.data.TextLineDataset$a dataset for reading text files line-by-line}
-and then @{tf.data.Dataset.map$map} an
-@{tf.decode_csv$op} that parses CSV data from each line of text in the dataset.
+For example, to re-implement the `tf.contrib.data.make_csv_dataset` function, we
+could use `tf.data.TextLineDataset` to extract the records, and then
+use `tf.data.Dataset.map` and `tf.decode_csv` to parse the CSV records from
+each line of text in the dataset.
 
 [TOC]
 
 ## Writing a `Dataset` for a file format
 
-A @{tf.data.Dataset} represents a sequence of *elements*, which can be the
+A `tf.data.Dataset` represents a sequence of *elements*, which can be the
 individual records in a file. There are several examples of "reader" datasets
 that are already built into TensorFlow:
 
-*   @{tf.data.TFRecordDataset}
+*   `tf.data.TFRecordDataset`
     ([source in `kernels/data/reader_dataset_ops.cc`](https://www.tensorflow.org/code/tensorflow/core/kernels/data/reader_dataset_ops.cc))
-*   @{tf.data.FixedLengthRecordDataset}
+*   `tf.data.FixedLengthRecordDataset`
     ([source in `kernels/data/reader_dataset_ops.cc`](https://www.tensorflow.org/code/tensorflow/core/kernels/data/reader_dataset_ops.cc))
-*   @{tf.data.TextLineDataset}
+*   `tf.data.TextLineDataset`
     ([source in `kernels/data/reader_dataset_ops.cc`](https://www.tensorflow.org/code/tensorflow/core/kernels/data/reader_dataset_ops.cc))
 
 Each of these implementations comprises three related classes:
@@ -64,11 +63,11 @@
    that implement the reading logic.
 2. In C++, register a new reader op and kernel with the name
    `"MyReaderDataset"`.
-3. In Python, define a subclass of @{tf.data.Dataset} called `MyReaderDataset`.
+3. In Python, define a subclass of `tf.data.Dataset` called `MyReaderDataset`.
 
 You can put all the C++ code in a single file, such as
 `my_reader_dataset_op.cc`. It will help if you are
-familiar with @{$adding_an_op$the adding an op how-to}. The following skeleton
+familiar with [the adding an op how-to](../extend/adding_an_op.md). The following skeleton
 can be used as a starting point for your implementation:
 
 ```c++
@@ -228,9 +227,9 @@
 ```
 
 The last step is to build the C++ code and add a Python wrapper. The easiest way
-to do this is by @{$adding_an_op#build_the_op_library$compiling a dynamic
-library} (e.g. called `"my_reader_dataset_op.so"`), and adding a Python class
-that subclasses @{tf.data.Dataset} to wrap it. An example Python program is
+to do this is by [compiling a dynamic
+library](../extend/adding_an_op.md#build_the_op_library) (e.g. called `"my_reader_dataset_op.so"`), and adding a Python class
+that subclasses `tf.data.Dataset` to wrap it. An example Python program is
 given here:
 
 ```python
@@ -286,21 +285,21 @@
 ## Writing an Op for a record format
 
 Generally this is an ordinary op that takes a scalar string record as input, and
-so follow @{$adding_an_op$the instructions to add an Op}.
+so follow [the instructions to add an Op](../extend/adding_an_op.md).
 You may optionally take a scalar string key as input, and include that in error
 messages reporting improperly formatted data.  That way users can more easily
 track down where the bad data came from.
 
 Examples of Ops useful for decoding records:
 
-*   @{tf.parse_single_example} (and @{tf.parse_example})
-*   @{tf.decode_csv}
-*   @{tf.decode_raw}
+*   `tf.parse_single_example` (and `tf.parse_example`)
+*   `tf.decode_csv`
+*   `tf.decode_raw`
 
 Note that it can be useful to use multiple Ops to decode a particular record
 format.  For example, you may have an image saved as a string in
 [a `tf.train.Example` protocol buffer](https://www.tensorflow.org/code/tensorflow/core/example/example.proto).
 Depending on the format of that image, you might take the corresponding output
-from a @{tf.parse_single_example} op and call @{tf.image.decode_jpeg},
-@{tf.image.decode_png}, or @{tf.decode_raw}.  It is common to take the output
-of `tf.decode_raw` and use @{tf.slice} and @{tf.reshape} to extract pieces.
+from a `tf.parse_single_example` op and call `tf.image.decode_jpeg`,
+`tf.image.decode_png`, or `tf.decode_raw`.  It is common to take the output
+of `tf.decode_raw` and use `tf.slice` and `tf.reshape` to extract pieces.
diff --git a/tensorflow/docs_src/guide/checkpoints.md b/tensorflow/docs_src/guide/checkpoints.md
index dfb2626..3c92cbb 100644
--- a/tensorflow/docs_src/guide/checkpoints.md
+++ b/tensorflow/docs_src/guide/checkpoints.md
@@ -9,13 +9,13 @@
     the model.
 
 This document focuses on checkpoints. For details on `SavedModel`, see the
-@{$saved_model$Saving and Restoring} guide.
+[Saving and Restoring](../guide/saved_model.md) guide.
 
 
 ## Sample code
 
 This document relies on the same
-[Iris classification example](https://github.com/tensorflow/models/blob/master/samples/core/get_started/premade_estimator.py) detailed in @{$premade_estimators$Getting Started with TensorFlow}.
+[Iris classification example](https://github.com/tensorflow/models/blob/master/samples/core/get_started/premade_estimator.py) detailed in [Getting Started with TensorFlow](../guide/premade_estimators.md).
 To download and access the example, invoke the following two commands:
 
 ```shell
@@ -129,7 +129,7 @@
 
 You may alter the default schedule by taking the following steps:
 
-1.  Create a @{tf.estimator.RunConfig$`RunConfig`} object that defines the
+1.  Create a `tf.estimator.RunConfig` object that defines the
     desired schedule.
 2.  When instantiating the Estimator, pass that `RunConfig` object to the
     Estimator's `config` argument.
@@ -160,7 +160,7 @@
 1.  The Estimator builds the model's
     [graph](https://developers.google.com/machine-learning/glossary/#graph)
     by running the `model_fn()`.  (For details on the `model_fn()`, see
-    @{$custom_estimators$Creating Custom Estimators.})
+    [Creating Custom Estimators](../guide/custom_estimators.md).)
 2.  The Estimator initializes the weights of the new model from the data
     stored in the most recent checkpoint.
 
@@ -231,7 +231,7 @@
 Checkpoints provide an easy automatic mechanism for saving and restoring
 models created by Estimators.
 
-See the @{$saved_model$Saving and Restoring} guide for details about:
+See the [Saving and Restoring](../guide/saved_model.md) guide for details about:
 
 *   Saving and restoring models using low-level TensorFlow APIs.
 *   Exporting and importing models in the SavedModel format, which is a
diff --git a/tensorflow/docs_src/guide/custom_estimators.md b/tensorflow/docs_src/guide/custom_estimators.md
index 6e4ef2e..913a359 100644
--- a/tensorflow/docs_src/guide/custom_estimators.md
+++ b/tensorflow/docs_src/guide/custom_estimators.md
@@ -2,10 +2,10 @@
 # Creating Custom Estimators
 
 This document introduces custom Estimators. In particular, this document
-demonstrates how to create a custom @{tf.estimator.Estimator$Estimator} that
+demonstrates how to create a custom `tf.estimator.Estimator` that
 mimics the behavior of the pre-made Estimator
-@{tf.estimator.DNNClassifier$`DNNClassifier`} in solving the Iris problem. See
-the @{$premade_estimators$Pre-Made Estimators chapter} for details
+`tf.estimator.DNNClassifier` in solving the Iris problem. See
+the [Pre-Made Estimators chapter](../guide/premade_estimators.md) for details
 on the Iris problem.
 
 To download and access the example code invoke the following two commands:
@@ -34,7 +34,7 @@
 ## Pre-made vs. custom
 
 As the following figure shows, pre-made Estimators are subclasses of the
-@{tf.estimator.Estimator} base class, while custom Estimators are an instance
+`tf.estimator.Estimator` base class, while custom Estimators are an instance
 of tf.estimator.Estimator:
 
 <div style="width:100%; margin:auto; margin-bottom:10px; margin-top:20px;">
@@ -84,7 +84,7 @@
 ## Write an Input function
 
 Our custom Estimator implementation uses the same input function as our
-@{$premade_estimators$pre-made Estimator implementation}, from
+[pre-made Estimator implementation](../guide/premade_estimators.md), from
 [`iris_data.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/iris_data.py).
 Namely:
 
@@ -106,8 +106,8 @@
 
 ## Create feature columns
 
-As detailed in the @{$premade_estimators$Premade Estimators} and
-@{$feature_columns$Feature Columns} chapters, you must define
+As detailed in the [Premade Estimators](../guide/premade_estimators.md) and
+[Feature Columns](../guide/feature_columns.md) chapters, you must define
 your model's feature columns to specify how the model should use each feature.
 Whether working with pre-made Estimators or custom Estimators, you define
 feature columns in the same fashion.
@@ -144,8 +144,8 @@
 to the constructor are in turn passed on to the `model_fn`. In
 [`custom_estimator.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/custom_estimator.py)
 the following lines create the estimator and set the params to configure the
-model. This configuration step is similar to how we configured the @{tf.estimator.DNNClassifier} in
-@{$premade_estimators}.
+model. This configuration step is similar to how we configured the `tf.estimator.DNNClassifier` in
+[Premade Estimators](../guide/premade_estimators.md).
 
 ```python
 classifier = tf.estimator.Estimator(
@@ -178,7 +178,7 @@
 
 ### Define the input layer
 
-The first line of the `model_fn` calls @{tf.feature_column.input_layer} to
+The first line of the `model_fn` calls `tf.feature_column.input_layer` to
 convert the feature dictionary and `feature_columns` into input for your model,
 as follows:
 
@@ -202,7 +202,7 @@
 If you are creating a deep neural network, you must define one or more hidden
 layers. The Layers API provides a rich set of functions to define all types of
 hidden layers, including convolutional, pooling, and dropout layers. For Iris,
-we're simply going to call @{tf.layers.dense} to create hidden layers, with
+we're simply going to call `tf.layers.dense` to create hidden layers, with
 dimensions defined by `params['hidden_layers']`. In a `dense` layer each node
 is connected to every node in the preceding layer.  Here's the relevant code:
 
@@ -231,14 +231,14 @@
   src="../images/custom_estimators/add_hidden_layer.png">
 </div>
 
-Note that @{tf.layers.dense} provides many additional capabilities, including
+Note that `tf.layers.dense` provides many additional capabilities, including
 the ability to set a multitude of regularization parameters. For the sake of
 simplicity, though, we're going to simply accept the default values of the
 other parameters.
 
 ### Output Layer
 
-We'll define the output layer by calling @{tf.layers.dense} yet again, this
+We'll define the output layer by calling `tf.layers.dense` yet again, this
 time without an activation function:
 
 ```python
@@ -265,7 +265,7 @@
 Versicolor, or Virginica, respectively.
 
 Later on, these logits will be transformed into probabilities by the
-@{tf.nn.softmax} function.
+`tf.nn.softmax` function.
 
 ## Implement training, evaluation, and prediction {#modes}
 
@@ -290,9 +290,9 @@
 
 | Estimator method                 |    Estimator Mode |
 |:---------------------------------|:------------------|
-|@{tf.estimator.Estimator.train$`train()`} |@{tf.estimator.ModeKeys.TRAIN$`ModeKeys.TRAIN`} |
-|@{tf.estimator.Estimator.evaluate$`evaluate()`}  |@{tf.estimator.ModeKeys.EVAL$`ModeKeys.EVAL`}      |
-|@{tf.estimator.Estimator.predict$`predict()`}|@{tf.estimator.ModeKeys.PREDICT$`ModeKeys.PREDICT`} |
+|`tf.estimator.Estimator.train` |`tf.estimator.ModeKeys.TRAIN` |
+|`tf.estimator.Estimator.evaluate`  |`tf.estimator.ModeKeys.EVAL`      |
+|`tf.estimator.Estimator.predict`|`tf.estimator.ModeKeys.PREDICT` |
 
 For example, suppose you instantiate a custom Estimator to generate an object
 named `classifier`. Then, you make the following call:
@@ -350,8 +350,8 @@
 *   `logit` holds the raw logit values (in this example, -1.3, 2.6, and -0.9)
 
 We return that dictionary to the caller via the `predictions` parameter of the
-@{tf.estimator.EstimatorSpec}. The Estimator's
-@{tf.estimator.Estimator.predict$`predict`} method will yield these
+`tf.estimator.EstimatorSpec`. The Estimator's
+`tf.estimator.Estimator.predict` method will yield these
 dictionaries.
 
 ### Calculate the loss
@@ -361,7 +361,7 @@
 [objective](https://developers.google.com/machine-learning/glossary/#objective)
 that will be optimized.
 
-We can calculate the loss by calling @{tf.losses.sparse_softmax_cross_entropy}.
+We can calculate the loss by calling `tf.losses.sparse_softmax_cross_entropy`.
 The value returned by this function will be approximately 0 at lowest,
 when the probability of the correct class (at index `label`) is near 1.0.
 The loss value returned is progressively larger as the probability of the
@@ -382,12 +382,12 @@
 or more metrics.
 
 Although returning metrics is optional, most custom Estimators do return at
-least one metric. TensorFlow provides a Metrics module @{tf.metrics} to
+least one metric. TensorFlow provides a Metrics module `tf.metrics` to
 calculate common metrics.  For brevity's sake, we'll only return accuracy. The
-@{tf.metrics.accuracy} function compares our predictions against the
+`tf.metrics.accuracy` function compares our predictions against the
 true values, that is, against the labels provided by the input function. The
-@{tf.metrics.accuracy} function requires the labels and predictions to have the
-same shape. Here's the call to @{tf.metrics.accuracy}:
+`tf.metrics.accuracy` function requires the labels and predictions to have the
+same shape. Here's the call to `tf.metrics.accuracy`:
 
 ``` python
 # Compute evaluation metrics.
@@ -396,7 +396,7 @@
                                name='acc_op')
 ```
 
-The @{tf.estimator.EstimatorSpec$`EstimatorSpec`} returned for evaluation
+The `tf.estimator.EstimatorSpec` returned for evaluation
 typically contains the following information:
 
 * `loss`, which is the model's loss
@@ -416,7 +416,7 @@
         mode, loss=loss, eval_metric_ops=metrics)
 ```
 
-The @{tf.summary.scalar} will make accuracy available to TensorBoard
+The `tf.summary.scalar` will make accuracy available to TensorBoard
 in both `TRAIN` and `EVAL` modes. (More on this later).
 
 ### Train
@@ -426,7 +426,7 @@
 `EstimatorSpec` that contains the loss and a training operation.
 
 Building the training operation will require an optimizer. We will use
-@{tf.train.AdagradOptimizer} because we're mimicking the `DNNClassifier`, which
+`tf.train.AdagradOptimizer` because we're mimicking the `DNNClassifier`, which
 also uses `Adagrad` by default. The `tf.train` package provides many other
 optimizers—feel free to experiment with them.
 
@@ -437,14 +437,14 @@
 ```
 
 Next, we build the training operation using the optimizer's
-@{tf.train.Optimizer.minimize$`minimize`} method on the loss we calculated
+`tf.train.Optimizer.minimize` method on the loss we calculated
 earlier.
 
 The `minimize` method also takes a `global_step` parameter. TensorFlow uses this
 parameter to count the number of training steps that have been processed
 (to know when to end a training run). Furthermore, the `global_step` is
 essential for TensorBoard graphs to work correctly. Simply call
-@{tf.train.get_global_step} and pass the result to the `global_step`
+`tf.train.get_global_step` and pass the result to the `global_step`
 argument of `minimize`.
 
 Here's the code to train the model:
@@ -453,7 +453,7 @@
 train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
 ```
 
-The @{tf.estimator.EstimatorSpec$`EstimatorSpec`} returned for training
+The `tf.estimator.EstimatorSpec` returned for training
 must have the following fields set:
 
 * `loss`, which contains the value of the loss function.
@@ -489,7 +489,7 @@
 
 The rest of the code to train, evaluate, and generate predictions using our
 Estimator is the same as in the
-@{$premade_estimators$Premade Estimators} chapter. For
+[Premade Estimators](../guide/premade_estimators.md) chapter. For
 example, the following line will train the model:
 
 ```python
@@ -597,6 +597,6 @@
   which contains more curated examples using custom estimators.
 * This [TensorBoard video](https://youtu.be/eBbEDRsCmv4), which introduces
   TensorBoard.
-* The @{$low_level_intro$Low Level Introduction}, which demonstrates
+* The [Low Level Introduction](../guide/low_level_intro.md), which demonstrates
   how to experiment directly with TensorFlow's low level APIs, making debugging
   easier.
diff --git a/tensorflow/docs_src/guide/datasets.md b/tensorflow/docs_src/guide/datasets.md
index 8b69860..bf77550 100644
--- a/tensorflow/docs_src/guide/datasets.md
+++ b/tensorflow/docs_src/guide/datasets.md
@@ -1,6 +1,6 @@
 # Importing Data
 
-The @{tf.data} API enables you to build complex input pipelines from
+The `tf.data` API enables you to build complex input pipelines from
 simple, reusable pieces. For example, the pipeline for an image model might
 aggregate data from files in a distributed file system, apply random
 perturbations to each image, and merge randomly selected images into a batch
@@ -51,7 +51,7 @@
 chaining method calls on the `tf.data.Dataset` object. For example, you
 can apply per-element transformations such as `Dataset.map()` (to apply a
 function to each element), and multi-element transformations such as
-`Dataset.batch()`. See the documentation for @{tf.data.Dataset}
+`Dataset.batch()`. See the documentation for `tf.data.Dataset`
 for a complete list of transformations.
 
 The most common way to consume values from a `Dataset` is to make an
@@ -211,13 +211,13 @@
     sess.run(next_element)
 ```
 
-A **feedable** iterator can be used together with @{tf.placeholder} to select
-what `Iterator` to use in each call to @{tf.Session.run}, via the familiar
+A **feedable** iterator can be used together with `tf.placeholder` to select
+what `Iterator` to use in each call to `tf.Session.run`, via the familiar
 `feed_dict` mechanism. It offers the same functionality as a reinitializable
 iterator, but it does not require you to initialize the iterator from the start
 of a dataset when you switch between iterators. For example, using the same
 training and validation example from above, you can use
-@{tf.data.Iterator.from_string_handle} to define a feedable iterator
+`tf.data.Iterator.from_string_handle` to define a feedable iterator
 that allows you to switch between the two datasets:
 
 ```python
@@ -329,13 +329,13 @@
 
 ### Saving iterator state
 
-The @{tf.contrib.data.make_saveable_from_iterator} function creates a
+The `tf.contrib.data.make_saveable_from_iterator` function creates a
 `SaveableObject` from an iterator, which can be used to save and
 restore the current state of the iterator (and, effectively, the whole input
-pipeline). A saveable object thus created can be added to @{tf.train.Saver}
+pipeline). A saveable object thus created can be added to `tf.train.Saver`
 variables list or the `tf.GraphKeys.SAVEABLE_OBJECTS` collection for saving and
-restoring in the same manner as a @{tf.Variable}. Refer to
-@{$saved_model$Saving and Restoring} for details on how to save and restore
+restoring in the same manner as a `tf.Variable`. Refer to
+[Saving and Restoring](../guide/saved_model.md) for details on how to save and restore
 variables.
 
 ```python
@@ -488,7 +488,7 @@
 ### Consuming CSV data
 
 The CSV file format is a popular format for storing tabular data in plain text.
-The @{tf.contrib.data.CsvDataset} class provides a way to extract records from
+The `tf.contrib.data.CsvDataset` class provides a way to extract records from
 one or more CSV files that comply with [RFC 4180](https://tools.ietf.org/html/rfc4180).
 Given one or more filenames and a list of defaults, a `CsvDataset` will produce
 a tuple of elements whose types correspond to the types of the defaults
@@ -757,9 +757,9 @@
 
 ### Using high-level APIs
 
-The @{tf.train.MonitoredTrainingSession} API simplifies many aspects of running
+The `tf.train.MonitoredTrainingSession` API simplifies many aspects of running
 TensorFlow in a distributed setting. `MonitoredTrainingSession` uses the
-@{tf.errors.OutOfRangeError} to signal that training has completed, so to use it
+`tf.errors.OutOfRangeError` to signal that training has completed, so to use it
 with the `tf.data` API, we recommend using
 `Dataset.make_one_shot_iterator()`. For example:
 
@@ -782,7 +782,7 @@
     sess.run(training_op)
 ```
 
-To use a `Dataset` in the `input_fn` of a @{tf.estimator.Estimator}, we also
+To use a `Dataset` in the `input_fn` of a `tf.estimator.Estimator`, we also
 recommend using `Dataset.make_one_shot_iterator()`. For example:
 
 ```python
diff --git a/tensorflow/docs_src/guide/datasets_for_estimators.md b/tensorflow/docs_src/guide/datasets_for_estimators.md
index b55a573..09a3830 100644
--- a/tensorflow/docs_src/guide/datasets_for_estimators.md
+++ b/tensorflow/docs_src/guide/datasets_for_estimators.md
@@ -1,6 +1,6 @@
 # Datasets for Estimators
 
-The @{tf.data} module contains a collection of classes that allows you to
+The `tf.data` module contains a collection of classes that allows you to
 easily load data, manipulate it, and pipe it into your model. This document
 introduces the API by walking through two simple examples:
 
@@ -14,7 +14,7 @@
 
 Taking slices from an array is the simplest way to get started with `tf.data`.
 
-The @{$premade_estimators$Premade Estimators} chapter describes
+The [Premade Estimators](../guide/premade_estimators.md) chapter describes
 the following `train_input_fn`, from
 [`iris_data.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/iris_data.py),
 to pipe the data into the Estimator:
@@ -73,8 +73,8 @@
 
 ### Slices
 
-The function starts by using the @{tf.data.Dataset.from_tensor_slices} function
-to create a @{tf.data.Dataset} representing slices of the array. The array is
+The function starts by using the `tf.data.Dataset.from_tensor_slices` function
+to create a `tf.data.Dataset` representing slices of the array. The array is
 sliced across the first dimension. For example, an array containing the
 MNIST training data has a shape of `(60000, 28, 28)`. Passing this to
 `from_tensor_slices` returns a `Dataset` object containing 60000 slices, each one
@@ -91,8 +91,8 @@
 ```
 
 This will print the following line, showing the
-@{$guide/tensors#shapes$shapes} and
-@{$guide/tensors#data_types$types} of the items in
+[shapes](../guide/tensors.md#shapes) and
+[types](../guide/tensors.md#data_types) of the items in
 the dataset. Note that a `Dataset` does not know how many items it contains.
 
 ``` None
@@ -128,7 +128,7 @@
 
 Here we see that when a `Dataset` contains structured elements, the `shapes`
 and `types` of the `Dataset` take on the same structure. This dataset contains
-dictionaries of @{$guide/tensors#rank$scalars}, all of type
+dictionaries of [scalars](../guide/tensors.md#rank), all of type
 `tf.float64`.
 
 The first line of the iris `train_input_fn` uses the same functionality, but
@@ -170,15 +170,15 @@
 dataset = dataset.shuffle(1000).repeat().batch(batch_size)
 ```
 
-The @{tf.data.Dataset.shuffle$`shuffle`} method uses a fixed-size buffer to
+The `tf.data.Dataset.shuffle` method uses a fixed-size buffer to
 shuffle the items as they pass through. In this case the `buffer_size` is
 greater than the number of examples in the `Dataset`, ensuring that the data is
 completely shuffled (The Iris data set only contains 150 examples).
 
-The @{tf.data.Dataset.repeat$`repeat`} method restarts the `Dataset` when
+The `tf.data.Dataset.repeat` method restarts the `Dataset` when
 it reaches the end. To limit the number of epochs, set the `count` argument.
 
-The @{tf.data.Dataset.batch$`batch`} method collects a number of examples and
+The `tf.data.Dataset.batch` method collects a number of examples and
 stacks them, to create batches. This adds a dimension to their shape. The new
 dimension is added as the first dimension. The following code uses
 the `batch` method on the MNIST `Dataset`, from earlier. This results in a
@@ -234,7 +234,7 @@
 ## Reading a CSV File
 
 The most common real-world use case for the `Dataset` class is to stream data
-from files on disk. The @{tf.data} module includes a variety of
+from files on disk. The `tf.data` module includes a variety of
 file readers. Let's see how parsing the Iris dataset from the csv file looks
 using a `Dataset`.
 
@@ -255,9 +255,9 @@
 
 ### Build the `Dataset`
 
-We start by building a @{tf.data.TextLineDataset$`TextLineDataset`} object to
+We start by building a `tf.data.TextLineDataset` object to
 read the file one line at a time. Then, we call the
-@{tf.data.Dataset.skip$`skip`} method to skip over the first line of the file, which contains a header, not an example:
+`tf.data.Dataset.skip` method to skip over the first line of the file, which contains a header, not an example:
 
 ``` python
 ds = tf.data.TextLineDataset(train_path).skip(1)
@@ -268,11 +268,11 @@
 We will start by building a function to parse a single line.
 
 The following `iris_data.parse_line` function accomplishes this task using the
-@{tf.decode_csv} function, and some simple python code:
+`tf.decode_csv` function, and some simple python code:
 
 We must parse each of the lines in the dataset in order to generate the
 necessary `(features, label)` pairs. The following `_parse_line` function
-calls @{tf.decode_csv} to parse a single line into its features
+calls `tf.decode_csv` to parse a single line into its features
 and the label. Since Estimators require that features be represented as a
 dictionary, we rely on Python's built-in `dict` and `zip` functions to build
 that dictionary.  The feature names are the keys of that dictionary.
@@ -301,7 +301,7 @@
 ### Parse the lines
 
 Datasets have many methods for manipulating the data while it is being piped
-to a model. The most heavily-used method is @{tf.data.Dataset.map$`map`}, which
+to a model. The most heavily-used method is `tf.data.Dataset.map`, which
 applies a transformation to each element of the `Dataset`.
 
 The `map` method takes a `map_func` argument that describes how each item in the
@@ -311,7 +311,7 @@
 <img style="width:100%" src="../images/datasets/map.png">
 </div>
 <div style="text-align: center">
-The @{tf.data.Dataset.map$`map`} method applies the `map_func` to
+The `tf.data.Dataset.map` method applies the `map_func` to
 transform each item in the <code>Dataset</code>.
 </div>
 
@@ -377,11 +377,11 @@
 Estimator. Consider the following documents next:
 
 
-* @{$custom_estimators}, which demonstrates how to build your own
+* [Creating Custom Estimators](../guide/custom_estimators.md), which demonstrates how to build your own
   custom `Estimator` model.
-* The @{$low_level_intro#datasets$Low Level Introduction}, which demonstrates
+* The [Low Level Introduction](../guide/low_level_intro.md#datasets), which demonstrates
   how to experiment directly with `tf.data.Datasets` using TensorFlow's low
   level APIs.
-* @{$guide/datasets} which goes into great detail about additional
+* [Importing Data](../guide/datasets.md) which goes into great detail about additional
   functionality of `Datasets`.
 
diff --git a/tensorflow/docs_src/guide/debugger.md b/tensorflow/docs_src/guide/debugger.md
index f0e4652..5af2747 100644
--- a/tensorflow/docs_src/guide/debugger.md
+++ b/tensorflow/docs_src/guide/debugger.md
@@ -89,22 +89,20 @@
 the diagnosis of issues.
 
 In this example, we have already registered a tensor filter called
-@{tfdbg.has_inf_or_nan},
+`tfdbg.has_inf_or_nan`,
 which simply determines if there are any `nan` or `inf` values in any
 intermediate tensors (tensors that are neither inputs nor outputs of the
 `Session.run()` call, but are in the path leading from the inputs to the
 outputs). This filter for `nan`s and `inf`s is a common enough use case that
 we ship it with the
-@{$python/tfdbg#Classes_for_debug_dump_data_and_directories$`debug_data`}
+[`debug_data`](../api_guides/python/tfdbg.md#Classes_for_debug_dump_data_and_directories)
 module.
 
-Note: You can also write your own custom filters. See
-the @{tfdbg.DebugDumpDir.find$API documentation}
-of `DebugDumpDir.find()` for additional information.
+Note: You can also write your own custom filters. See `tfdbg.DebugDumpDir.find`
+for additional information.
 
 ## Debugging Model Training with tfdbg
 
-
 Let's try training the model again, but with the `--debug` flag added this time:
 
 ```none
@@ -429,9 +427,9 @@
 debug them by using special `hook`s provided by `tfdbg`.
 
 `tfdbg` can debug the
-@{tf.estimator.Estimator.train$`train()`},
-@{tf.estimator.Estimator.evaluate$`evaluate()`} and
-@{tf.estimator.Estimator.predict$`predict()`}
+`tf.estimator.Estimator.train`,
+`tf.estimator.Estimator.evaluate` and
+`tf.estimator.Estimator.predict`
 methods of tf-learn `Estimator`s. To debug `Estimator.train()`,
 create a `LocalCLIDebugHook` and supply it in the `hooks` argument. For example:
 
@@ -473,7 +471,7 @@
 The `LocalCLIDebugHook` also allows you to configure a `watch_fn` that can be
 used to flexibly specify what `Tensor`s to watch on different `Session.run()`
 calls, as a function of the `fetches` and `feed_dict` and other states. See
-@{tfdbg.DumpingDebugWrapperSession.__init__$this API doc}
+`tfdbg.DumpingDebugWrapperSession.__init__`
 for more details.
 
 ## Debugging Keras Models with TFDBG
@@ -556,7 +554,7 @@
 
 If you interact directly with the `tf.Session` API in `python`, you can
 configure the `RunOptions` proto that you call your `Session.run()` method
-with, by using the method @{tfdbg.watch_graph}.
+with, by using the method `tfdbg.watch_graph`.
 This will cause the intermediate tensors and runtime graphs to be dumped to a
 shared storage location of your choice when the `Session.run()` call occurs
 (at the cost of slower performance). For example:
@@ -629,7 +627,7 @@
 
 Then this `hook` can be used in the same way as the `LocalCLIDebugHook` examples
 described earlier in this document.
-As the training, evalution or prediction happens with `Estimator`,
+As the training, evaluation or prediction happens with `Estimator`,
 tfdbg creates directories having the following name pattern:
 `/shared/storage/location/tfdbg_dumps_1/run_<epoch_timestamp_microsec>_<uuid>`.
 Each directory corresponds to a `Session.run()` call that underlies
@@ -715,7 +713,7 @@
 
 *   models with many intermediate tensors
 *   very large intermediate tensors
-*   many @{tf.while_loop} iterations
+*   many `tf.while_loop` iterations
 
 There are three possible workarounds or solutions:
 
@@ -770,12 +768,12 @@
 
 **A**: The reason why you see no data dumped is because every node in the
        executed TensorFlow graph is constant-folded by the TensorFlow runtime.
-       In this exapmle, `a` is a constant tensor; therefore, the fetched
+       In this example, `a` is a constant tensor; therefore, the fetched
        tensor `b` is effectively also a constant tensor. TensorFlow's graph
        optimization folds the graph that contains `a` and `b` into a single
        node to speed up future runs of the graph, which is why `tfdbg` does
        not generate any intermediate tensor dumps. However, if `a` were a
-       @{tf.Variable}, as in the following example:
+       `tf.Variable`, as in the following example:
 
 ``` python
 import numpy as np
diff --git a/tensorflow/docs_src/guide/eager.md b/tensorflow/docs_src/guide/eager.md
index 24f6e4e..3b5797a 100644
--- a/tensorflow/docs_src/guide/eager.md
+++ b/tensorflow/docs_src/guide/eager.md
@@ -558,7 +558,7 @@
 
 #### Summaries and TensorBoard
 
-@{$summaries_and_tensorboard$TensorBoard} is a visualization tool for
+[TensorBoard](../guide/summaries_and_tensorboard.md) is a visualization tool for
 understanding, debugging and optimizing the model training process. It uses
 summary events that are written while executing the program.
 
@@ -568,9 +568,8 @@
 100 global steps:
 
 ```py
+global_step = tf.train.get_or_create_global_step()
 writer = tf.contrib.summary.create_file_writer(logdir)
-global_step=tf.train.get_or_create_global_step()  # return global step var
-
 writer.set_as_default()
 
 for _ in range(iterations):
diff --git a/tensorflow/docs_src/guide/embedding.md b/tensorflow/docs_src/guide/embedding.md
index 8a98367..6007e68 100644
--- a/tensorflow/docs_src/guide/embedding.md
+++ b/tensorflow/docs_src/guide/embedding.md
@@ -78,7 +78,7 @@
 functions and data sets. For example, one could use a recurrent neural network
 to predict the next word from the previous one given a large corpus of
 sentences, or one could train two networks to do multi-lingual translation.
-These methods are described in the @{$word2vec$Vector Representations of Words}
+These methods are described in the [Vector Representations of Words](../tutorials/representation/word2vec.md)
 tutorial.
 
 ## Visualizing Embeddings
diff --git a/tensorflow/docs_src/guide/estimators.md b/tensorflow/docs_src/guide/estimators.md
index 78b30c3..3903bfd 100644
--- a/tensorflow/docs_src/guide/estimators.md
+++ b/tensorflow/docs_src/guide/estimators.md
@@ -1,6 +1,6 @@
 # Estimators
 
-This document introduces @{tf.estimator$**Estimators**}--a high-level TensorFlow
+This document introduces `tf.estimator`--a high-level TensorFlow
 API that greatly simplifies machine learning programming. Estimators encapsulate
 the following actions:
 
@@ -11,10 +11,13 @@
 
 You may either use the pre-made Estimators we provide or write your
 own custom Estimators.  All Estimators--whether pre-made or custom--are
-classes based on the @{tf.estimator.Estimator} class.
+classes based on the `tf.estimator.Estimator` class.
+
+For a quick example try the [Estimator tutorials](../tutorials/estimators/linear.md).
+To see each sub-topic in depth, see the [Estimator guides](premade_estimators.md).
 
 Note: TensorFlow also includes a deprecated `Estimator` class at
-@{tf.contrib.learn.Estimator}, which you should not use.
+`tf.contrib.learn.Estimator`, which you should not use.
 
 
 ## Advantages of Estimators
@@ -29,14 +32,14 @@
 *   You can develop a state of the art model with high-level intuitive code.
     In short, it is generally much easier to create models with Estimators
     than with the low-level TensorFlow APIs.
-*   Estimators are themselves built on @{tf.layers}, which
+*   Estimators are themselves built on `tf.keras.layers`, which
     simplifies customization.
 *   Estimators build the graph for you.
 *   Estimators provide a safe distributed training loop that controls how and
     when to:
     *   build the graph
     *   initialize variables
-    *   start queues
+    *   load data
     *   handle exceptions
     *   create checkpoint files and recover from failures
     *   save summaries for TensorBoard
@@ -52,9 +55,9 @@
 than the base TensorFlow APIs. You no longer have to worry about creating
 the computational graph or sessions since Estimators handle all
 the "plumbing" for you.  That is, pre-made Estimators create and manage
-@{tf.Graph$`Graph`} and @{tf.Session$`Session`} objects for you.  Furthermore,
+`tf.Graph` and `tf.Session` objects for you.  Furthermore,
 pre-made Estimators let you experiment with different model architectures by
-making only minimal code changes.  @{tf.estimator.DNNClassifier$`DNNClassifier`},
+making only minimal code changes.  `tf.estimator.DNNClassifier`,
 for example, is a pre-made Estimator class that trains classification models
 based on dense, feed-forward neural networks.
 
@@ -81,9 +84,9 @@
            ...  # manipulate dataset, extracting the feature dict and the label
            return feature_dict, label
 
-    (See @{$guide/datasets} for full details.)
+    (See [Importing Data](../guide/datasets.md) for full details.)
 
-2.  **Define the feature columns.** Each @{tf.feature_column}
+2.  **Define the feature columns.** Each `tf.feature_column`
     identifies a feature name, its type, and any input pre-processing.
     For example, the following snippet creates three feature
     columns that hold integer or floating-point data.  The first two
@@ -133,7 +136,7 @@
 evaluation, and prediction. When you are using a pre-made Estimator,
 someone else has already implemented the model function. When relying
 on a custom Estimator, you must write the model function yourself. A
-@{$custom_estimators$companion document}
+[companion document](../guide/custom_estimators.md)
 explains how to write the model function.
 
 
@@ -155,7 +158,7 @@
 
 You can convert existing Keras models to Estimators. Doing so enables your Keras
 model to access Estimator's strengths, such as distributed training. Call
-@{tf.keras.estimator.model_to_estimator} as in the
+`tf.keras.estimator.model_to_estimator` as in the
 following sample:
 
 ```python
@@ -190,4 +193,4 @@
 `keras_inception_v3.output_names`.
 
 For more details, please refer to the documentation for
-@{tf.keras.estimator.model_to_estimator}.
+`tf.keras.estimator.model_to_estimator`.
diff --git a/tensorflow/docs_src/guide/faq.md b/tensorflow/docs_src/guide/faq.md
index b6291a9..a02635e 100644
--- a/tensorflow/docs_src/guide/faq.md
+++ b/tensorflow/docs_src/guide/faq.md
@@ -2,7 +2,7 @@
 
 This document provides answers to some of the frequently asked questions about
 TensorFlow. If you have a question that is not covered here, you might find an
-answer on one of the TensorFlow @{$about$community resources}.
+answer on one of the TensorFlow [community resources](../about/index.md).
 
 [TOC]
 
@@ -11,7 +11,7 @@
 #### Can I run distributed training on multiple computers?
 
 Yes! TensorFlow gained
-@{$distributed$support for distributed computation} in
+[support for distributed computation](../deploy/distributed.md) in
 version 0.8. TensorFlow now supports multiple devices (CPUs and GPUs) in one or
 more computers.
 
@@ -23,18 +23,18 @@
 ## Building a TensorFlow graph
 
 See also the
-@{$python/framework$API documentation on building graphs}.
+[API documentation on building graphs](../api_guides/python/framework.md).
 
 #### Why does `c = tf.matmul(a, b)` not execute the matrix multiplication immediately?
 
 In the TensorFlow Python API, `a`, `b`, and `c` are
-@{tf.Tensor} objects. A `Tensor` object is
+`tf.Tensor` objects. A `Tensor` object is
 a symbolic handle to the result of an operation, but does not actually hold the
 values of the operation's output. Instead, TensorFlow encourages users to build
 up complicated expressions (such as entire neural networks and its gradients) as
 a dataflow graph. You then offload the computation of the entire dataflow graph
 (or a subgraph of it) to a TensorFlow
-@{tf.Session}, which is able to execute the
+`tf.Session`, which is able to execute the
 whole computation much more efficiently than executing the operations
 one-by-one.
 
@@ -46,34 +46,34 @@
 #### How do I place operations on a particular device?
 
 To place a group of operations on a device, create them within a
-@{tf.device$`with tf.device(name):`} context.  See
+`tf.device` context.  See
 the how-to documentation on
-@{$using_gpu$using GPUs with TensorFlow} for details of how
+[using GPUs with TensorFlow](../guide/using_gpu.md) for details of how
 TensorFlow assigns operations to devices, and the
-@{$deep_cnn$CIFAR-10 tutorial} for an example model that
+[CIFAR-10 tutorial](../tutorials/images/deep_cnn.md) for an example model that
 uses multiple GPUs.
 
 
 ## Running a TensorFlow computation
 
 See also the
-@{$python/client$API documentation on running graphs}.
+[API documentation on running graphs](../api_guides/python/client.md).
 
 #### What's the deal with feeding and placeholders?
 
 Feeding is a mechanism in the TensorFlow Session API that allows you to
 substitute different values for one or more tensors at run time. The `feed_dict`
-argument to @{tf.Session.run} is a
-dictionary that maps @{tf.Tensor} objects to
+argument to `tf.Session.run` is a
+dictionary that maps `tf.Tensor` objects to
 numpy arrays (and some other types), which will be used as the values of those
 tensors in the execution of a step.
 
 #### What is the difference between `Session.run()` and `Tensor.eval()`?
 
-If `t` is a @{tf.Tensor} object,
-@{tf.Tensor.eval} is shorthand for
-@{tf.Session.run}, where `sess` is the
-current @{tf.get_default_session}. The
+If `t` is a `tf.Tensor` object,
+`tf.Tensor.eval` is shorthand for
+`tf.Session.run`, where `sess` is the
+current `tf.get_default_session`. The
 two following snippets of code are equivalent:
 
 ```python
@@ -99,14 +99,14 @@
 #### Do Sessions have a lifetime? What about intermediate tensors?
 
 Sessions can own resources, such as
-@{tf.Variable},
-@{tf.QueueBase}, and
-@{tf.ReaderBase}. These resources can sometimes use
+`tf.Variable`,
+`tf.QueueBase`, and
+`tf.ReaderBase`. These resources can sometimes use
 a significant amount of memory, and can be released when the session is closed by calling
-@{tf.Session.close}.
+`tf.Session.close`.
 
 The intermediate tensors that are created as part of a call to
-@{$python/client$`Session.run()`} will be freed at or before the
+[`Session.run()`](../api_guides/python/client.md) will be freed at or before the
 end of the call.
 
 #### Does the runtime parallelize parts of graph execution?
@@ -118,9 +118,9 @@
   CPU, or multiple threads in a GPU.
 * Independent nodes in a TensorFlow graph can run in parallel on multiple
   devices, which makes it possible to speed up
-  @{$deep_cnn$CIFAR-10 training using multiple GPUs}.
+  [CIFAR-10 training using multiple GPUs](../tutorials/images/deep_cnn.md).
 * The Session API allows multiple concurrent steps (i.e. calls to
-  @{tf.Session.run} in parallel). This
+  `tf.Session.run` in parallel). This
   enables the runtime to get higher throughput, if a single step does not use
   all of the resources in your computer.
 
@@ -141,9 +141,9 @@
 #### Does TensorFlow make use of all the devices (GPUs and CPUs) available on my machine?
 
 TensorFlow supports multiple GPUs and CPUs. See the how-to documentation on
-@{$using_gpu$using GPUs with TensorFlow} for details of how
+[using GPUs with TensorFlow](../guide/using_gpu.md) for details of how
 TensorFlow assigns operations to devices, and the
-@{$deep_cnn$CIFAR-10 tutorial} for an example model that
+[CIFAR-10 tutorial](../tutorials/images/deep_cnn.md) for an example model that
 uses multiple GPUs.
 
 Note that TensorFlow only uses GPU devices with a compute capability greater
@@ -151,27 +151,27 @@
 
 #### Why does `Session.run()` hang when using a reader or a queue?
 
-The @{tf.ReaderBase} and
-@{tf.QueueBase} classes provide special operations that
+The `tf.ReaderBase` and
+`tf.QueueBase` classes provide special operations that
 can *block* until input (or free space in a bounded queue) becomes
 available. These operations allow you to build sophisticated
-@{$reading_data$input pipelines}, at the cost of making the
+[input pipelines](../api_guides/python/reading_data.md), at the cost of making the
 TensorFlow computation somewhat more complicated. See the how-to documentation
 for
-@{$reading_data#creating_threads_to_prefetch_using_queuerunner_objects$using `QueueRunner` objects to drive queues and readers}
+[using `QueueRunner` objects to drive queues and readers](../api_guides/python/reading_data.md#creating_threads_to_prefetch_using_queuerunner_objects)
 for more information on how to use them.
 
 ## Variables
 
-See also the how-to documentation on @{$variables$variables} and
-@{$python/state_ops$the API documentation for variables}.
+See also the how-to documentation on [variables](../guide/variables.md) and
+[the API documentation for variables](../api_guides/python/state_ops.md).
 
 #### What is the lifetime of a variable?
 
 A variable is created when you first run the
-@{tf.Variable.initializer}
+`tf.Variable.initializer`
 operation for that variable in a session. It is destroyed when that
-@{tf.Session.close}.
+session is closed (`tf.Session.close`).
 
 #### How do variables behave when they are concurrently accessed?
 
@@ -179,32 +179,31 @@
 variable may change if it is concurrently updated. By default, concurrent
 assignment operations to a variable are allowed to run with no mutual exclusion.
 To acquire a lock when assigning to a variable, pass `use_locking=True` to
-@{tf.Variable.assign}.
+`tf.Variable.assign`.
 
 ## Tensor shapes
 
 See also the
-@{tf.TensorShape}.
+`tf.TensorShape`.
 
 #### How can I determine the shape of a tensor in Python?
 
 In TensorFlow, a tensor has both a static (inferred) shape and a dynamic (true)
 shape. The static shape can be read using the
-@{tf.Tensor.get_shape}
+`tf.Tensor.get_shape`
 method: this shape is inferred from the operations that were used to create the
-tensor, and may be
-@{tf.TensorShape$partially complete}. If the static
-shape is not fully defined, the dynamic shape of a `Tensor` `t` can be
-determined by evaluating @{tf.shape$`tf.shape(t)`}.
+tensor, and may be partially complete (the static shape may contain `None`). If
+the static shape is not fully defined, the dynamic shape of a `tf.Tensor`, `t`
+can be determined using `tf.shape(t)`.
 
 #### What is the difference between `x.set_shape()` and `x = tf.reshape(x)`?
 
-The @{tf.Tensor.set_shape} method updates
+The `tf.Tensor.set_shape` method updates
 the static shape of a `Tensor` object, and it is typically used to provide
 additional shape information when this cannot be inferred directly. It does not
 change the dynamic shape of the tensor.
 
-The @{tf.reshape} operation creates
+The `tf.reshape` operation creates
 a new tensor with a different dynamic shape.
 
 #### How do I build a graph that works with variable batch sizes?
@@ -212,9 +211,9 @@
 It is often useful to build a graph that works with variable batch sizes 
 so that the same code can be used for (mini-)batch training, and
 single-instance inference. The resulting graph can be
-@{tf.Graph.as_graph_def$saved as a protocol buffer}
+saved as a protocol buffer (`tf.Graph.as_graph_def`)
 and
-@{tf.import_graph_def$imported into another program}.
+imported into another program (`tf.import_graph_def`).
 
 When building a variable-size graph, the most important thing to remember is not
 to encode the batch size as a Python constant, but instead to use a symbolic
@@ -224,7 +223,7 @@
   to extract the batch dimension from a `Tensor` called `input`, and store it in
   a `Tensor` called `batch_size`.
 
-* Use @{tf.reduce_mean} instead
+* Use `tf.reduce_mean` instead
   of `tf.reduce_sum(...) / batch_size`.
 
 
@@ -232,7 +231,7 @@
 
 #### How can I visualize a TensorFlow graph?
 
-See the @{$graph_viz$graph visualization tutorial}.
+See the [graph visualization tutorial](../guide/graph_viz.md).
 
 #### What is the simplest way to send data to TensorBoard?
 
@@ -242,7 +241,7 @@
     python tensorflow/tensorboard/tensorboard.py --logdir=path/to/log-directory
 
 For more details, see the
-@{$summaries_and_tensorboard$Summaries and TensorBoard tutorial}.
+[Summaries and TensorBoard tutorial](../guide/summaries_and_tensorboard.md).
 
 #### Every time I launch TensorBoard, I get a network security popup!
 
@@ -252,30 +251,30 @@
 ## Extending TensorFlow
 
 See the how-to documentation for
-@{$adding_an_op$adding a new operation to TensorFlow}.
+[adding a new operation to TensorFlow](../extend/adding_an_op.md).
 
 #### My data is in a custom format. How do I read it using TensorFlow?
 
 There are three main options for dealing with data in a custom format.
 
 The easiest option is to write parsing code in Python that transforms the data
-into a numpy array. Then, use @{tf.data.Dataset.from_tensor_slices} to
+into a numpy array. Then, use `tf.data.Dataset.from_tensor_slices` to
 create an input pipeline from the in-memory data.
 
 If your data doesn't fit in memory, try doing the parsing in the Dataset
 pipeline. Start with an appropriate file reader, like
-@{tf.data.TextLineDataset}. Then convert the dataset by mapping
-@{tf.data.Dataset.map$mapping} appropriate operations over it.
-Prefer predefined TensorFlow operations such as @{tf.decode_raw},
-@{tf.decode_csv}, @{tf.parse_example}, or @{tf.image.decode_png}.
+`tf.data.TextLineDataset`. Then transform the dataset by using
+`tf.data.Dataset.map` to apply appropriate operations over it.
+Prefer predefined TensorFlow operations such as `tf.decode_raw`,
+`tf.decode_csv`, `tf.parse_example`, or `tf.image.decode_png`.
 
 If your data is not easily parsable with the built-in TensorFlow operations,
 consider converting it, offline, to a format that is easily parsable, such
-as @{tf.python_io.TFRecordWriter$`TFRecord`} format.
+as the `TFRecord` format (see `tf.python_io.TFRecordWriter`).
 
 The most efficient method to customize the parsing behavior is to
-@{$adding_an_op$add a new op written in C++} that parses your
-data format. The @{$new_data_formats$guide to handling new data formats} has
+[add a new op written in C++](../extend/adding_an_op.md) that parses your
+data format. The [guide to handling new data formats](../extend/new_data_formats.md) has
 more information about the steps for doing this.
 
 
diff --git a/tensorflow/docs_src/guide/feature_columns.md b/tensorflow/docs_src/guide/feature_columns.md
index 41080e0..3ad4185 100644
--- a/tensorflow/docs_src/guide/feature_columns.md
+++ b/tensorflow/docs_src/guide/feature_columns.md
@@ -5,11 +5,11 @@
 enabling you to transform a diverse range of raw data into formats that
 Estimators can use, allowing easy experimentation.
 
-In @{$premade_estimators$Premade Estimators}, we used the premade
-Estimator, @{tf.estimator.DNNClassifier$`DNNClassifier`} to train a model to
+In [Premade Estimators](../guide/premade_estimators.md), we used the premade
+Estimator, `tf.estimator.DNNClassifier` to train a model to
 predict different types of Iris flowers from four input features. That example
 created only numerical feature columns (of type
-@{tf.feature_column.numeric_column}). Although numerical feature columns model
+`tf.feature_column.numeric_column`). Although numerical feature columns model
 the lengths of petals and sepals effectively, real world data sets contain all
 kinds of features, many of which are non-numerical.
 
@@ -59,7 +59,7 @@
 </div>
 
 To create feature columns, call functions from the
-@{tf.feature_column} module. This document explains nine of the functions in
+`tf.feature_column` module. This document explains nine of the functions in
 that module. As the following figure shows, all nine functions return either a
 Categorical-Column or a Dense-Column object, except `bucketized_column`, which
 inherits from both classes:
@@ -75,7 +75,7 @@
 
 ### Numeric column
 
-The Iris classifier calls the @{tf.feature_column.numeric_column} function for
+The Iris classifier calls the `tf.feature_column.numeric_column` function for
 all input features:
 
   * `SepalLength`
@@ -119,7 +119,7 @@
 
 Often, you don't want to feed a number directly into the model, but instead
 split its value into different categories based on numerical ranges.  To do so,
-create a @{tf.feature_column.bucketized_column$bucketized column}. For
+create a `tf.feature_column.bucketized_column`. For
 example, consider raw data that represents the year a house was built. Instead
 of representing that year as a scalar numeric column, we could split the year
 into the following four buckets:
@@ -194,7 +194,7 @@
 * `1="electronics"`
 * `2="sport"`
 
-Call @{tf.feature_column.categorical_column_with_identity} to implement a
+Call `tf.feature_column.categorical_column_with_identity` to implement a
 categorical identity column. For example:
 
 ``` python
@@ -230,8 +230,8 @@
 categorical identity columns. TensorFlow provides two different functions to
 create categorical vocabulary columns:
 
-* @{tf.feature_column.categorical_column_with_vocabulary_list}
-* @{tf.feature_column.categorical_column_with_vocabulary_file}
+* `tf.feature_column.categorical_column_with_vocabulary_list`
+* `tf.feature_column.categorical_column_with_vocabulary_file`
 
 `categorical_column_with_vocabulary_list` maps each string to an integer based
 on an explicit vocabulary list. For example:
@@ -281,7 +281,7 @@
 for each vocabulary word or integer because that would consume too much memory.
 For these cases, we can instead turn the question around and ask, "How many
 categories am I willing to have for my input?"  In fact, the
-@{tf.feature_column.categorical_column_with_hash_bucket} function enables you
+`tf.feature_column.categorical_column_with_hash_bucket` function enables you
 to specify the number of categories. For this type of feature column the model
 calculates a hash value of the input, then puts it into one of
 the `hash_bucket_size` categories using the modulo operator, as in the following
@@ -289,7 +289,7 @@
 
 ```python
 # pseudocode
-feature_id = hash(raw_feature) % hash_buckets_size
+feature_id = hash(raw_feature) % hash_bucket_size
 ```
 
 The code to create the `feature_column` might look something like this:
@@ -298,7 +298,7 @@
 hashed_feature_column =
     tf.feature_column.categorical_column_with_hash_bucket(
         key = "some_feature",
-        hash_buckets_size = 100) # The number of categories
+        hash_bucket_size = 100) # The number of categories
 ```
 At this point, you might rightfully think: "This is crazy!" After all, we are
 forcing the different input values to a smaller set of categories. This means
@@ -349,7 +349,7 @@
 </div>
 
 For the solution, we used a combination of the `bucketized_column` we looked at
-earlier, with the @{tf.feature_column.crossed_column} function.
+earlier, with the `tf.feature_column.crossed_column` function.
 
 <!--TODO(markdaoust) link to full example-->
 
@@ -440,7 +440,7 @@
 </div>
 
 Here's how you create an indicator column by calling
-@{tf.feature_column.indicator_column}:
+`tf.feature_column.indicator_column`:
 
 ``` python
 categorical_column = ... # Create any type of categorical column.
@@ -521,7 +521,7 @@
 Note that this is just a general guideline; you can set the number of embedding
 dimensions as you please.
 
-Call @{tf.feature_column.embedding_column} to create an `embedding_column` as
+Call `tf.feature_column.embedding_column` to create an `embedding_column` as
 suggested by the following snippet:
 
 ``` python
@@ -534,7 +534,7 @@
     dimension=embedding_dimensions)
 ```
 
-@{$guide/embedding$Embeddings} is a significant topic within machine
+[Embeddings](../guide/embedding.md) is a significant topic within machine
 learning. This information was just to get you started using them as feature
 columns.
 
@@ -543,15 +543,15 @@
 As the following list indicates, not all Estimators permit all types of
 `feature_columns` argument(s):
 
-* @{tf.estimator.LinearClassifier$`LinearClassifier`} and
-  @{tf.estimator.LinearRegressor$`LinearRegressor`}: Accept all types of
+* `tf.estimator.LinearClassifier` and
+  `tf.estimator.LinearRegressor`: Accept all types of
   feature column.
-* @{tf.estimator.DNNClassifier$`DNNClassifier`} and
-  @{tf.estimator.DNNRegressor$`DNNRegressor`}: Only accept dense columns. Other
+* `tf.estimator.DNNClassifier` and
+  `tf.estimator.DNNRegressor`: Only accept dense columns. Other
   column types must be wrapped in either an `indicator_column` or
   `embedding_column`.
-* @{tf.estimator.DNNLinearCombinedClassifier$`DNNLinearCombinedClassifier`} and
-  @{tf.estimator.DNNLinearCombinedRegressor$`DNNLinearCombinedRegressor`}:
+* `tf.estimator.DNNLinearCombinedClassifier` and
+  `tf.estimator.DNNLinearCombinedRegressor`:
     * The `linear_feature_columns` argument accepts any feature column type.
     * The `dnn_feature_columns` argument only accepts dense columns.
 
@@ -559,7 +559,7 @@
 
 For more examples on feature columns, view the following:
 
-* The @{$low_level_intro#feature_columns$Low Level Introduction} demonstrates how
+* The [Low Level Introduction](../guide/low_level_intro.md#feature_columns) demonstrates how to
   experiment directly with `feature_columns` using TensorFlow's low level APIs.
 * The [Estimator wide and deep learning tutorial](https://github.com/tensorflow/models/tree/master/official/wide_deep)
   solves a binary classification problem using `feature_columns` on a variety of
diff --git a/tensorflow/docs_src/guide/graph_viz.md b/tensorflow/docs_src/guide/graph_viz.md
index a8876da..23f722b 100644
--- a/tensorflow/docs_src/guide/graph_viz.md
+++ b/tensorflow/docs_src/guide/graph_viz.md
@@ -5,7 +5,7 @@
 ![Visualization of a TensorFlow graph](https://www.tensorflow.org/images/graph_vis_animation.gif "Visualization of a TensorFlow graph")
 *Visualization of a TensorFlow graph.*
 
-To see your own graph, run TensorBoard pointing it to the log directory of the job, click on the graph tab on the top pane and select the appropriate run using the menu at the upper left corner. For in depth information on how to run TensorBoard and make sure you are logging all the necessary information, see @{$summaries_and_tensorboard$TensorBoard: Visualizing Learning}.
+To see your own graph, run TensorBoard pointing it to the log directory of the job, click on the graph tab on the top pane and select the appropriate run using the menu at the upper left corner. For in depth information on how to run TensorBoard and make sure you are logging all the necessary information, see [TensorBoard: Visualizing Learning](../guide/summaries_and_tensorboard.md).
 
 ## Name scoping and nodes
 
@@ -15,7 +15,7 @@
 define a hierarchy on the nodes in the graph.  By default, only the top of this
 hierarchy is shown. Here is an example that defines three operations under the
 `hidden` name scope using
-@{tf.name_scope}:
+`tf.name_scope`:
 
 ```python
 import tensorflow as tf
@@ -251,7 +251,7 @@
 [Estimators MNIST tutorial](../tutorials/estimators/cnn.md), in which we have
 recorded summaries and
 runtime statistics. See the
-@{$summaries_and_tensorboard#serializing-the-data$Summaries Tutorial}
+[Summaries Tutorial](../guide/summaries_and_tensorboard.md#serializing-the-data)
 for details on how to record summaries.
 Full source is [here](https://www.tensorflow.org/code/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py).
 
diff --git a/tensorflow/docs_src/guide/graphs.md b/tensorflow/docs_src/guide/graphs.md
index 492f97c..c70479d 100644
--- a/tensorflow/docs_src/guide/graphs.md
+++ b/tensorflow/docs_src/guide/graphs.md
@@ -7,7 +7,7 @@
 remote devices.
 
 This guide will be most useful if you intend to use the low-level programming
-model directly. Higher-level APIs such as @{tf.estimator.Estimator} and Keras
+model directly. Higher-level APIs such as `tf.estimator.Estimator` and Keras
 hide the details of graphs and sessions from the end user, but this guide may
 also be useful if you want to understand how these APIs are implemented.
 
@@ -18,12 +18,12 @@
 [Dataflow](https://en.wikipedia.org/wiki/Dataflow_programming) is a common
 programming model for parallel computing. In a dataflow graph, the nodes
 represent units of computation, and the edges represent the data consumed or
-produced by a computation. For example, in a TensorFlow graph, the @{tf.matmul}
+produced by a computation. For example, in a TensorFlow graph, the `tf.matmul`
 operation would correspond to a single node with two incoming edges (the
 matrices to be multiplied) and one outgoing edge (the result of the
 multiplication).
 
-<!-- TODO(barryr): Add a diagram to illustrate the @{tf.matmul} graph. -->
+<!-- TODO(barryr): Add a diagram to illustrate the `tf.matmul` graph. -->
 
 Dataflow has several advantages that TensorFlow leverages when executing your
 programs:
@@ -38,19 +38,19 @@
   machines. TensorFlow inserts the necessary communication and coordination
   between devices.
 
-* **Compilation.** TensorFlow's @{$performance/xla$XLA compiler} can
+* **Compilation.** TensorFlow's [XLA compiler](../performance/xla/index.md) can
   use the information in your dataflow graph to generate faster code, for
   example, by fusing together adjacent operations.
 
 * **Portability.** The dataflow graph is a language-independent representation
   of the code in your model. You can build a dataflow graph in Python, store it
-  in a @{$saved_model$SavedModel}, and restore it in a C++ program for
+  in a [SavedModel](../guide/saved_model.md), and restore it in a C++ program for
   low-latency inference.
 
 
-## What is a @{tf.Graph}?
+## What is a `tf.Graph`?
 
-A @{tf.Graph} contains two relevant kinds of information:
+A `tf.Graph` contains two relevant kinds of information:
 
 * **Graph structure.** The nodes and edges of the graph, indicating how
   individual operations are composed together, but not prescribing how they
@@ -59,78 +59,78 @@
   context that source code conveys.
 
 * **Graph collections.** TensorFlow provides a general mechanism for storing
-  collections of metadata in a @{tf.Graph}. The @{tf.add_to_collection} function
-  enables you to associate a list of objects with a key (where @{tf.GraphKeys}
-  defines some of the standard keys), and @{tf.get_collection} enables you to
+  collections of metadata in a `tf.Graph`. The `tf.add_to_collection` function
+  enables you to associate a list of objects with a key (where `tf.GraphKeys`
+  defines some of the standard keys), and `tf.get_collection` enables you to
   look up all objects associated with a key. Many parts of the TensorFlow
-  library use this facility: for example, when you create a @{tf.Variable}, it
+  library use this facility: for example, when you create a `tf.Variable`, it
   is added by default to collections representing "global variables" and
-  "trainable variables". When you later come to create a @{tf.train.Saver} or
-  @{tf.train.Optimizer}, the variables in these collections are used as the
+  "trainable variables". When you later come to create a `tf.train.Saver` or
+  `tf.train.Optimizer`, the variables in these collections are used as the
   default arguments.
 
 
-## Building a @{tf.Graph}
+## Building a `tf.Graph`
 
 Most TensorFlow programs start with a dataflow graph construction phase. In this
-phase, you invoke TensorFlow API functions that construct new @{tf.Operation}
-(node) and @{tf.Tensor} (edge) objects and add them to a @{tf.Graph}
+phase, you invoke TensorFlow API functions that construct new `tf.Operation`
+(node) and `tf.Tensor` (edge) objects and add them to a `tf.Graph`
 instance. TensorFlow provides a **default graph** that is an implicit argument
 to all API functions in the same context.  For example:
 
-* Calling `tf.constant(42.0)` creates a single @{tf.Operation} that produces the
-  value `42.0`, adds it to the default graph, and returns a @{tf.Tensor} that
+* Calling `tf.constant(42.0)` creates a single `tf.Operation` that produces the
+  value `42.0`, adds it to the default graph, and returns a `tf.Tensor` that
   represents the value of the constant.
 
-* Calling `tf.matmul(x, y)` creates a single @{tf.Operation} that multiplies
-  the values of @{tf.Tensor} objects `x` and `y`, adds it to the default graph,
-  and returns a @{tf.Tensor} that represents the result of the multiplication.
+* Calling `tf.matmul(x, y)` creates a single `tf.Operation` that multiplies
+  the values of `tf.Tensor` objects `x` and `y`, adds it to the default graph,
+  and returns a `tf.Tensor` that represents the result of the multiplication.
 
-* Executing `v = tf.Variable(0)` adds to the graph a @{tf.Operation} that will
-  store a writeable tensor value that persists between @{tf.Session.run} calls.
-  The @{tf.Variable} object wraps this operation, and can be used [like a
+* Executing `v = tf.Variable(0)` adds to the graph a `tf.Operation` that will
+  store a writeable tensor value that persists between `tf.Session.run` calls.
+  The `tf.Variable` object wraps this operation, and can be used [like a
   tensor](#tensor-like_objects), which will read the current value of the
-  stored value. The @{tf.Variable} object also has methods such as
-  @{tf.Variable.assign$`assign`} and @{tf.Variable.assign_add$`assign_add`} that
-  create @{tf.Operation} objects that, when executed, update the stored value.
-  (See @{$guide/variables} for more information about variables.)
+  stored value. The `tf.Variable` object also has methods such as
+  `tf.Variable.assign` and `tf.Variable.assign_add` that
+  create `tf.Operation` objects that, when executed, update the stored value.
+  (See [Variables](../guide/variables.md) for more information about variables.)
 
-* Calling @{tf.train.Optimizer.minimize} will add operations and tensors to the
-  default graph that calculates gradients, and return a @{tf.Operation} that,
+* Calling `tf.train.Optimizer.minimize` will add operations and tensors to the
+  default graph that calculates gradients, and return a `tf.Operation` that,
   when run, will apply those gradients to a set of variables.
 
 Most programs rely solely on the default graph. However,
 see [Dealing with multiple graphs](#programming_with_multiple_graphs) for more
-advanced use cases. High-level APIs such as the @{tf.estimator.Estimator} API
+advanced use cases. High-level APIs such as the `tf.estimator.Estimator` API
 manage the default graph on your behalf, and--for example--may create different
 graphs for training and evaluation.
 
 Note: Calling most functions in the TensorFlow API merely adds operations
 and tensors to the default graph, but **does not** perform the actual
-computation. Instead, you compose these functions until you have a @{tf.Tensor}
-or @{tf.Operation} that represents the overall computation--such as performing
-one step of gradient descent--and then pass that object to a @{tf.Session} to
-perform the computation. See the section "Executing a graph in a @{tf.Session}"
+computation. Instead, you compose these functions until you have a `tf.Tensor`
+or `tf.Operation` that represents the overall computation--such as performing
+one step of gradient descent--and then pass that object to a `tf.Session` to
+perform the computation. See the section "Executing a graph in a `tf.Session`"
 for more details.
 
 ## Naming operations
 
-A @{tf.Graph} object defines a **namespace** for the @{tf.Operation} objects it
+A `tf.Graph` object defines a **namespace** for the `tf.Operation` objects it
 contains. TensorFlow automatically chooses a unique name for each operation in
 your graph, but giving operations descriptive names can make your program easier
 to read and debug. The TensorFlow API provides two ways to override the name of
 an operation:
 
-* Each API function that creates a new @{tf.Operation} or returns a new
-  @{tf.Tensor} accepts an optional `name` argument. For example,
-  `tf.constant(42.0, name="answer")` creates a new @{tf.Operation} named
-  `"answer"` and returns a @{tf.Tensor} named `"answer:0"`. If the default graph
+* Each API function that creates a new `tf.Operation` or returns a new
+  `tf.Tensor` accepts an optional `name` argument. For example,
+  `tf.constant(42.0, name="answer")` creates a new `tf.Operation` named
+  `"answer"` and returns a `tf.Tensor` named `"answer:0"`. If the default graph
   already contains an operation named `"answer"`, then TensorFlow would append
   `"_1"`, `"_2"`, and so on to the name, in order to make it unique.
 
-* The @{tf.name_scope} function makes it possible to add a **name scope** prefix
+* The `tf.name_scope` function makes it possible to add a **name scope** prefix
   to all operations created in a particular context. The current name scope
-  prefix is a `"/"`-delimited list of the names of all active @{tf.name_scope}
+  prefix is a `"/"`-delimited list of the names of all active `tf.name_scope`
   context managers. If a name scope has already been used in the current
   context, TensorFlow appends `"_1"`, `"_2"`, and so on. For example:
 
@@ -160,7 +160,7 @@
 complexity of a graph. See [Visualizing your graph](#visualizing-your-graph) for
 more information.
 
-Note that @{tf.Tensor} objects are implicitly named after the @{tf.Operation}
+Note that `tf.Tensor` objects are implicitly named after the `tf.Operation`
 that produces the tensor as output. A tensor name has the form `"<OP_NAME>:<i>"`
 where:
 
@@ -171,7 +171,7 @@
 ## Placing operations on different devices
 
 If you want your TensorFlow program to use multiple different devices, the
-@{tf.device} function provides a convenient way to request that all operations
+`tf.device` function provides a convenient way to request that all operations
 created in a particular context are placed on the same device (or type of
 device).
 
@@ -186,7 +186,7 @@
 * `<JOB_NAME>` is an alpha-numeric string that does not start with a number.
 * `<DEVICE_TYPE>` is a registered device type (such as `GPU` or `CPU`).
 * `<TASK_INDEX>` is a non-negative integer representing the index of the task
-  in the job named `<JOB_NAME>`. See @{tf.train.ClusterSpec} for an explanation
+  in the job named `<JOB_NAME>`. See `tf.train.ClusterSpec` for an explanation
   of jobs and tasks.
 * `<DEVICE_INDEX>` is a non-negative integer representing the index of the
   device, for example, to distinguish between different GPU devices used in the
@@ -194,7 +194,7 @@
 
 You do not need to specify every part of a device specification. For example,
 if you are running in a single-machine configuration with a single GPU, you
-might use @{tf.device} to pin some operations to the CPU and GPU:
+might use `tf.device` to pin some operations to the CPU and GPU:
 
 ```python
 # Operations created outside either context will run on the "best possible"
@@ -210,7 +210,7 @@
   # Operations created in this context will be pinned to the GPU.
   result = tf.matmul(weights, img)
 ```
-If you are deploying TensorFlow in a @{$distributed$typical distributed configuration},
+If you are deploying TensorFlow in a [typical distributed configuration](../deploy/distributed.md),
 you might specify the job name and task ID to place variables on
 a task in the parameter server job (`"/job:ps"`), and the other operations on
 task in the worker job (`"/job:worker"`):
@@ -229,13 +229,13 @@
   layer_2 = tf.matmul(train_batch, weights_2) + biases_2
 ```
 
-@{tf.device} gives you a lot of flexibility to choose placements for individual
+`tf.device` gives you a lot of flexibility to choose placements for individual
 operations or broad regions of a TensorFlow graph. In many cases, there are
 simple heuristics that work well. For example, the
-@{tf.train.replica_device_setter} API can be used with @{tf.device} to place
+`tf.train.replica_device_setter` API can be used with `tf.device` to place
 operations for **data-parallel distributed training**. For example, the
-following code fragment shows how @{tf.train.replica_device_setter} applies
-different placement policies to @{tf.Variable} objects and other operations:
+following code fragment shows how `tf.train.replica_device_setter` applies
+different placement policies to `tf.Variable` objects and other operations:
 
 ```python
 with tf.device(tf.train.replica_device_setter(ps_tasks=3)):
@@ -253,41 +253,41 @@
 
 ## Tensor-like objects
 
-Many TensorFlow operations take one or more @{tf.Tensor} objects as arguments.
-For example, @{tf.matmul} takes two @{tf.Tensor} objects, and @{tf.add_n} takes
-a list of `n` @{tf.Tensor} objects. For convenience, these functions will accept
-a **tensor-like object** in place of a @{tf.Tensor}, and implicitly convert it
-to a @{tf.Tensor} using the @{tf.convert_to_tensor} method. Tensor-like objects
+Many TensorFlow operations take one or more `tf.Tensor` objects as arguments.
+For example, `tf.matmul` takes two `tf.Tensor` objects, and `tf.add_n` takes
+a list of `n` `tf.Tensor` objects. For convenience, these functions will accept
+a **tensor-like object** in place of a `tf.Tensor`, and implicitly convert it
+to a `tf.Tensor` using the `tf.convert_to_tensor` method. Tensor-like objects
 include elements of the following types:
 
-* @{tf.Tensor}
-* @{tf.Variable}
+* `tf.Tensor`
+* `tf.Variable`
 * [`numpy.ndarray`](https://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.html)
 * `list` (and lists of tensor-like objects)
 * Scalar Python types: `bool`, `float`, `int`, `str`
 
 You can register additional tensor-like types using
-@{tf.register_tensor_conversion_function}.
+`tf.register_tensor_conversion_function`.
 
-Note: By default, TensorFlow will create a new @{tf.Tensor} each time you use
+Note: By default, TensorFlow will create a new `tf.Tensor` each time you use
 the same tensor-like object. If the tensor-like object is large (e.g. a
 `numpy.ndarray` containing a set of training examples) and you use it multiple
 times, you may run out of memory. To avoid this, manually call
-@{tf.convert_to_tensor} on the tensor-like object once and use the returned
-@{tf.Tensor} instead.
+`tf.convert_to_tensor` on the tensor-like object once and use the returned
+`tf.Tensor` instead.
 
-## Executing a graph in a @{tf.Session}
+## Executing a graph in a `tf.Session`
 
-TensorFlow uses the @{tf.Session} class to represent a connection between the
+TensorFlow uses the `tf.Session` class to represent a connection between the
 client program---typically a Python program, although a similar interface is
-available in other languages---and the C++ runtime. A @{tf.Session} object
+available in other languages---and the C++ runtime. A `tf.Session` object
 provides access to devices in the local machine, and remote devices using the
 distributed TensorFlow runtime. It also caches information about your
-@{tf.Graph} so that you can efficiently run the same computation multiple times.
+`tf.Graph` so that you can efficiently run the same computation multiple times.
 
-### Creating a @{tf.Session}
+### Creating a `tf.Session`
 
-If you are using the low-level TensorFlow API, you can create a @{tf.Session}
+If you are using the low-level TensorFlow API, you can create a `tf.Session`
 for the current default graph as follows:
 
 ```python
@@ -300,50 +300,50 @@
   # ...
 ```
 
-Since a @{tf.Session} owns physical resources (such as GPUs and
+Since a `tf.Session` owns physical resources (such as GPUs and
 network connections), it is typically used as a context manager (in a `with`
 block) that automatically closes the session when you exit the block. It is
 also possible to create a session without using a `with` block, but you should
-explicitly call @{tf.Session.close} when you are finished with it to free the
+explicitly call `tf.Session.close` when you are finished with it to free the
 resources.
 
-Note: Higher-level APIs such as @{tf.train.MonitoredTrainingSession} or
-@{tf.estimator.Estimator} will create and manage a @{tf.Session} for you. These
+Note: Higher-level APIs such as `tf.train.MonitoredTrainingSession` or
+`tf.estimator.Estimator` will create and manage a `tf.Session` for you. These
 APIs accept optional `target` and `config` arguments (either directly, or as
-part of a @{tf.estimator.RunConfig} object), with the same meaning as
+part of a `tf.estimator.RunConfig` object), with the same meaning as
 described below.
 
-@{tf.Session.__init__} accepts three optional arguments:
+`tf.Session.__init__` accepts three optional arguments:
 
 * **`target`.** If this argument is left empty (the default), the session will
   only use devices in the local machine. However, you may also specify a
   `grpc://` URL to specify the address of a TensorFlow server, which gives the
   session access to all devices on machines that this server controls. See
-  @{tf.train.Server} for details of how to create a TensorFlow
+  `tf.train.Server` for details of how to create a TensorFlow
   server. For example, in the common **between-graph replication**
-  configuration, the @{tf.Session} connects to a @{tf.train.Server} in the same
+  configuration, the `tf.Session` connects to a `tf.train.Server` in the same
   process as the client. The [distributed TensorFlow](../deploy/distributed.md)
   deployment guide describes other common scenarios.
 
-* **`graph`.** By default, a new @{tf.Session} will be bound to---and only able
+* **`graph`.** By default, a new `tf.Session` will be bound to---and only able
   to run operations in---the current default graph. If you are using multiple
   graphs in your program (see [Programming with multiple
   graphs](#programming_with_multiple_graphs) for more details), you can specify
-  an explicit @{tf.Graph} when you construct the session.
+  an explicit `tf.Graph` when you construct the session.
 
-* **`config`.** This argument allows you to specify a @{tf.ConfigProto} that
+* **`config`.** This argument allows you to specify a `tf.ConfigProto` that
   controls the behavior of the session. For example, some of the configuration
   options include:
 
     * `allow_soft_placement`. Set this to `True` to enable a "soft" device
-    placement algorithm, which ignores @{tf.device} annotations that attempt
+    placement algorithm, which ignores `tf.device` annotations that attempt
     to place CPU-only operations on a GPU device, and places them on the CPU
     instead.
 
     * `cluster_def`. When using distributed TensorFlow, this option allows you
     to specify what machines to use in the computation, and provide a mapping
     between job names, task indices, and network addresses. See
-    @{tf.train.ClusterSpec.as_cluster_def} for details.
+    `tf.train.ClusterSpec.as_cluster_def` for details.
 
     * `graph_options.optimizer_options`. Provides control over the optimizations
     that TensorFlow performs on your graph before executing it.
@@ -353,21 +353,21 @@
     rather than allocating most of the memory at startup.
 
 
-### Using @{tf.Session.run} to execute operations
+### Using `tf.Session.run` to execute operations
 
-The @{tf.Session.run} method is the main mechanism for running a @{tf.Operation}
-or evaluating a @{tf.Tensor}. You can pass one or more @{tf.Operation} or
-@{tf.Tensor} objects to @{tf.Session.run}, and TensorFlow will execute the
+The `tf.Session.run` method is the main mechanism for running a `tf.Operation`
+or evaluating a `tf.Tensor`. You can pass one or more `tf.Operation` or
+`tf.Tensor` objects to `tf.Session.run`, and TensorFlow will execute the
 operations that are needed to compute the result.
 
-@{tf.Session.run} requires you to specify a list of **fetches**, which determine
-the return values, and may be a @{tf.Operation}, a @{tf.Tensor}, or
-a [tensor-like type](#tensor-like_objects) such as @{tf.Variable}. These fetches
-determine what **subgraph** of the overall @{tf.Graph} must be executed to
+`tf.Session.run` requires you to specify a list of **fetches**, which determine
+the return values, and may be a `tf.Operation`, a `tf.Tensor`, or
+a [tensor-like type](#tensor-like_objects) such as `tf.Variable`. These fetches
+determine what **subgraph** of the overall `tf.Graph` must be executed to
 produce the result: this is the subgraph that contains all operations named in
 the fetch list, plus all operations whose outputs are used to compute the value
 of the fetches. For example, the following code fragment shows how different
-arguments to @{tf.Session.run} cause different subgraphs to be executed:
+arguments to `tf.Session.run` cause different subgraphs to be executed:
 
 ```python
 x = tf.constant([[37.0, -23.0], [1.0, 4.0]])
@@ -390,8 +390,8 @@
   y_val, output_val = sess.run([y, output])
 ```
 
-@{tf.Session.run} also optionally takes a dictionary of **feeds**, which is a
-mapping from @{tf.Tensor} objects (typically @{tf.placeholder} tensors) to
+`tf.Session.run` also optionally takes a dictionary of **feeds**, which is a
+mapping from `tf.Tensor` objects (typically `tf.placeholder` tensors) to
 values (typically Python scalars, lists, or NumPy arrays) that will be
 substituted for those tensors in the execution. For example:
 
@@ -415,7 +415,7 @@
   sess.run(y, {x: 37.0})
 ```
 
-@{tf.Session.run} also accepts an optional `options` argument that enables you
+`tf.Session.run` also accepts an optional `options` argument that enables you
 to specify options about the call, and an optional `run_metadata` argument that
 enables you to collect metadata about the execution. For example, you can use
 these options together to collect tracing information about the execution:
@@ -447,8 +447,8 @@
 TensorFlow includes tools that can help you to understand the code in a graph.
 The **graph visualizer** is a component of TensorBoard that renders the
 structure of your graph visually in a browser. The easiest way to create a
-visualization is to pass a @{tf.Graph} when creating the
-@{tf.summary.FileWriter}:
+visualization is to pass a `tf.Graph` when creating the
+`tf.summary.FileWriter`:
 
 ```python
 # Build your graph.
@@ -471,7 +471,7 @@
   writer.close()
 ```
 
-Note: If you are using a @{tf.estimator.Estimator}, the graph (and any
+Note: If you are using a `tf.estimator.Estimator`, the graph (and any
 summaries) will be logged automatically to the `model_dir` that you specified
 when creating the estimator.
 
@@ -495,8 +495,8 @@
 inference with a trained model. In many cases, the inference graph will be
 different from the training graph: for example, techniques like dropout and
 batch normalization use different operations in each case. Furthermore, by
-default utilities like @{tf.train.Saver} use the names of @{tf.Variable} objects
-(which have names based on an underlying @{tf.Operation}) to identify each
+default utilities like `tf.train.Saver` use the names of `tf.Variable` objects
+(which have names based on an underlying `tf.Operation`) to identify each
 variable in a saved checkpoint. When programming this way, you can either use
 completely separate Python processes to build and execute the graphs, or you can
 use multiple graphs in the same process. This section describes how to use
@@ -507,21 +507,21 @@
 is sufficient. However, TensorFlow also provides methods for manipulating
 the default graph, which can be useful in more advanced use cases. For example:
 
-* A @{tf.Graph} defines the namespace for @{tf.Operation} objects: each
+* A `tf.Graph` defines the namespace for `tf.Operation` objects: each
   operation in a single graph must have a unique name. TensorFlow will
   "uniquify" the names of operations by appending `"_1"`, `"_2"`, and so on to
   their names if the requested name is already taken. Using multiple explicitly
   created graphs gives you more control over what name is given to each
   operation.
 
-* The default graph stores information about every @{tf.Operation} and
-  @{tf.Tensor} that was ever added to it. If your program creates a large number
+* The default graph stores information about every `tf.Operation` and
+  `tf.Tensor` that was ever added to it. If your program creates a large number
   of unconnected subgraphs, it may be more efficient to use a different
-  @{tf.Graph} to build each subgraph, so that unrelated state can be garbage
+  `tf.Graph` to build each subgraph, so that unrelated state can be garbage
   collected.
 
-You can install a different @{tf.Graph} as the default graph, using the
-@{tf.Graph.as_default} context manager:
+You can install a different `tf.Graph` as the default graph, using the
+`tf.Graph.as_default` context manager:
 
 ```python
 g_1 = tf.Graph()
@@ -548,8 +548,8 @@
 assert sess_2.graph is g_2
 ```
 
-To inspect the current default graph, call @{tf.get_default_graph}, which
-returns a @{tf.Graph} object:
+To inspect the current default graph, call `tf.get_default_graph`, which
+returns a `tf.Graph` object:
 
 ```python
 # Print all of the operations in the default graph.
diff --git a/tensorflow/docs_src/guide/index.md b/tensorflow/docs_src/guide/index.md
index f78dfc9..5049958 100644
--- a/tensorflow/docs_src/guide/index.md
+++ b/tensorflow/docs_src/guide/index.md
@@ -5,39 +5,38 @@
 
 ## High Level APIs
 
-  * @{$guide/keras}, TensorFlow's high-level API for building and
+  * [Keras](../guide/keras.md), TensorFlow's high-level API for building and
     training deep learning models.
-  * @{$guide/eager}, an API for writing TensorFlow code
+  * [Eager Execution](../guide/eager.md), an API for writing TensorFlow code
     imperatively, like you would use Numpy.
-  * @{$guide/estimators}, a high-level API that provides
-    fully-packaged models ready for large-scale training and production.
-  * @{$guide/datasets}, easy input pipelines to bring your data into
+  * [Importing Data](../guide/datasets.md), easy input pipelines to bring your data into
     your TensorFlow program.
+  * [Estimators](../guide/estimators.md), a high-level API that provides
+    fully-packaged models ready for large-scale training and production.
 
 ## Estimators
 
-* @{$estimators}, learn how to use Estimators for machine learning.
-* @{$premade_estimators}, the basics of premade Estimators.
-* @{$checkpoints}, save training progress and resume where you left off.
-* @{$feature_columns}, handle a variety of input data types without changes to the model.
-* @{$datasets_for_estimators}, use `tf.data` to input data.
-* @{$custom_estimators}, write your own Estimator.
+* [Premade Estimators](../guide/premade_estimators.md), the basics of premade Estimators.
+* [Checkpoints](../guide/checkpoints.md), save training progress and resume where you left off.
+* [Feature Columns](../guide/feature_columns.md), handle a variety of input data types without changes to the model.
+* [Datasets for Estimators](../guide/datasets_for_estimators.md), use `tf.data` to input data.
+* [Creating Custom Estimators](../guide/custom_estimators.md), write your own Estimator.
 
 ## Accelerators
 
-  * @{$using_gpu} explains how TensorFlow assigns operations to
+  * [Using GPUs](../guide/using_gpu.md) explains how TensorFlow assigns operations to
     devices and how you can change the arrangement manually.
-  * @{$using_tpu} explains how to modify `Estimator` programs to run on a TPU.
+  * [Using TPUs](../guide/using_tpu.md) explains how to modify `Estimator` programs to run on a TPU.
 
 ## Low Level APIs
 
-  * @{$guide/low_level_intro}, which introduces the
+  * [Introduction](../guide/low_level_intro.md), which introduces the
     basics of how you can use TensorFlow outside of the high Level APIs.
-  * @{$guide/tensors}, which explains how to create,
+  * [Tensors](../guide/tensors.md), which explains how to create,
     manipulate, and access Tensors--the fundamental object in TensorFlow.
-  * @{$guide/variables}, which details how
+  * [Variables](../guide/variables.md), which details how
     to represent shared, persistent state in your program.
-  * @{$guide/graphs}, which explains:
+  * [Graphs and Sessions](../guide/graphs.md), which explains:
       * dataflow graphs, which are TensorFlow's representation of computations
         as dependencies between operations.
       * sessions, which are TensorFlow's mechanism for running dataflow graphs
@@ -47,19 +46,19 @@
     such as Estimators or Keras, the high-level API creates and manages
     graphs and sessions for you, but understanding graphs and sessions
     can still be helpful.
-  * @{$guide/saved_model}, which
+  * [Save and Restore](../guide/saved_model.md), which
     explains how to save and restore variables and models.
 
 ## ML Concepts
 
-  * @{$guide/embedding}, which introduces the concept
+  * [Embeddings](../guide/embedding.md), which introduces the concept
     of embeddings, provides a simple example of training an embedding in
     TensorFlow, and explains how to view embeddings with the TensorBoard
     Embedding Projector.
 
 ## Debugging
 
-  * @{$guide/debugger}, which
+  * [TensorFlow Debugger](../guide/debugger.md), which
     explains how to use the TensorFlow debugger (tfdbg).
 
 ## TensorBoard
@@ -67,17 +66,17 @@
 TensorBoard is a utility to visualize different aspects of machine learning.
 The following guides explain how to use TensorBoard:
 
-  * @{$guide/summaries_and_tensorboard},
+  * [TensorBoard: Visualizing Learning](../guide/summaries_and_tensorboard.md),
     which introduces TensorBoard.
-  * @{$guide/graph_viz}, which
+  * [TensorBoard: Graph Visualization](../guide/graph_viz.md), which
     explains how to visualize the computational graph.
-  * @{$guide/tensorboard_histograms} which demonstrates the how to
+  * [TensorBoard Histogram Dashboard](../guide/tensorboard_histograms.md), which demonstrates how to
     use TensorBoard's histogram dashboard.
 
 
 ## Misc
 
-  * @{$guide/version_compat},
+  * [TensorFlow Version Compatibility](../guide/version_compat.md),
     which explains backward compatibility guarantees and non-guarantees.
-  * @{$guide/faq}, which contains frequently asked
+  * [Frequently Asked Questions](../guide/faq.md), which contains frequently asked
     questions about TensorFlow.
diff --git a/tensorflow/docs_src/guide/leftnav_files b/tensorflow/docs_src/guide/leftnav_files
index c4e235b..8e227e0 100644
--- a/tensorflow/docs_src/guide/leftnav_files
+++ b/tensorflow/docs_src/guide/leftnav_files
@@ -4,9 +4,9 @@
 keras.md
 eager.md
 datasets.md
+estimators.md: Introduction to Estimators
 
 ### Estimators
-estimators.md: Introduction to Estimators
 premade_estimators.md
 checkpoints.md
 feature_columns.md
diff --git a/tensorflow/docs_src/guide/low_level_intro.md b/tensorflow/docs_src/guide/low_level_intro.md
index 665a556..d002f8a 100644
--- a/tensorflow/docs_src/guide/low_level_intro.md
+++ b/tensorflow/docs_src/guide/low_level_intro.md
@@ -9,7 +9,7 @@
   * Use high level components ([datasets](#datasets), [layers](#layers), and
     [feature_columns](#feature_columns)) in this low level environment.
   * Build your own training loop, instead of using the one
-    @{$premade_estimators$provided by Estimators}.
+    [provided by Estimators](../guide/premade_estimators.md).
 
 We recommend using the higher level APIs to build models when possible.
 Knowing TensorFlow Core is valuable for the following reasons:
@@ -21,7 +21,7 @@
 
 ## Setup
 
-Before using this guide, @{$install$install TensorFlow}.
+Before using this guide, [install TensorFlow](../install/index.md).
 
 To get the most out of this guide, you should know the following:
 
@@ -63,17 +63,17 @@
 You might think of TensorFlow Core programs as consisting of two discrete
 sections:
 
-1.  Building the computational graph (a @{tf.Graph}).
-2.  Running the computational graph (using a @{tf.Session}).
+1.  Building the computational graph (a `tf.Graph`).
+2.  Running the computational graph (using a `tf.Session`).
 
 ### Graph
 
 A **computational graph** is a series of TensorFlow operations arranged into a
 graph. The graph is composed of two types of objects.
 
-  * @{tf.Operation$Operations} (or "ops"): The nodes of the graph.
+  * `tf.Operation` (or "ops"): The nodes of the graph.
     Operations describe calculations that consume and produce tensors.
-  * @{tf.Tensor$Tensors}: The edges in the graph. These represent the values
+  * `tf.Tensor`: The edges in the graph. These represent the values
     that will flow through the graph. Most TensorFlow functions return
     `tf.Tensors`.
 
@@ -145,11 +145,11 @@
 
 ![TensorBoard screenshot](https://www.tensorflow.org/images/getting_started_add.png)
 
-For more about TensorBoard's graph visualization tools see @{$graph_viz}.
+For more about TensorBoard's graph visualization tools see [TensorBoard: Graph Visualization](../guide/graph_viz.md).
 
 ### Session
 
-To evaluate tensors, instantiate a @{tf.Session} object, informally known as a
+To evaluate tensors, instantiate a `tf.Session` object, informally known as a
 **session**. A session encapsulates the state of the TensorFlow runtime, and
 runs TensorFlow operations. If a `tf.Graph` is like a `.py` file, a `tf.Session`
 is like the `python` executable.
@@ -232,7 +232,7 @@
 The preceding three lines are a bit like a function in which we
 define two input parameters (`x` and `y`) and then an operation on them. We can
 evaluate this graph with multiple inputs by using the `feed_dict` argument of
-the @{tf.Session.run$run method} to feed concrete values to the placeholders:
+the `tf.Session.run` method to feed concrete values to the placeholders:
 
 ```python
 print(sess.run(z, feed_dict={x: 3, y: 4.5}))
@@ -251,15 +251,15 @@
 
 ## Datasets
 
-Placeholders work for simple experiments, but @{tf.data$Datasets} are the
+Placeholders work for simple experiments, but `tf.data` is the
 preferred method of streaming data into a model.
 
 To get a runnable `tf.Tensor` from a Dataset you must first convert it to a
-@{tf.data.Iterator}, and then call the Iterator's
-@{tf.data.Iterator.get_next$`get_next`} method.
+`tf.data.Iterator`, and then call the Iterator's
+`tf.data.Iterator.get_next` method.
 
 The simplest way to create an Iterator is with the
-@{tf.data.Dataset.make_one_shot_iterator$`make_one_shot_iterator`} method.
+`tf.data.Dataset.make_one_shot_iterator` method.
 For example, in the following code the `next_item` tensor will return a row from
 the `my_data` array on each `run` call:
 
@@ -275,7 +275,7 @@
 ```
 
 Reaching the end of the data stream causes `Dataset` to throw an
-@{tf.errors.OutOfRangeError$`OutOfRangeError`}. For example, the following code
+`tf.errors.OutOfRangeError`. For example, the following code
 reads the `next_item` until there is no more data to read:
 
 ``` python
@@ -303,12 +303,12 @@
     break
 ```
 
-For more details on Datasets and Iterators see: @{$guide/datasets}.
+For more details on Datasets and Iterators see: [Importing Data](../guide/datasets.md).
 
 ## Layers
 
 A trainable model must modify the values in the graph to get new outputs with
-the same input.  @{tf.layers$Layers} are the preferred way to add trainable
+the same input.  Layers (`tf.layers`) are the preferred way to add trainable
 parameters to a graph.
 
 Layers package together both the variables and the operations that act
@@ -321,7 +321,7 @@
 
 ### Creating Layers
 
-The following code creates a @{tf.layers.Dense$`Dense`} layer that takes a
+The following code creates a `tf.layers.Dense` layer that takes a
 batch of input vectors, and produces a single output value for each. To apply a
 layer to an input, call the layer as if it were a function. For example:
 
@@ -375,8 +375,8 @@
 
 ### Layer Function shortcuts
 
-For each layer class (like @{tf.layers.Dense}) TensorFlow also supplies a
-shortcut function (like @{tf.layers.dense}). The only difference is that the
+For each layer class (like `tf.layers.Dense`) TensorFlow also supplies a
+shortcut function (like `tf.layers.dense`). The only difference is that the
 shortcut function versions create and run the layer in a single call. For
 example, the following code is equivalent to the earlier version:
 
@@ -390,17 +390,17 @@
 print(sess.run(y, {x: [[1, 2, 3], [4, 5, 6]]}))
 ```
 
-While convenient, this approach allows no access to the @{tf.layers.Layer}
+While convenient, this approach allows no access to the `tf.layers.Layer`
 object. This makes introspection and debugging more difficult,
 and layer reuse impossible.
 
 ## Feature columns
 
 The easiest way to experiment with feature columns is using the
-@{tf.feature_column.input_layer} function. This function only accepts
-@{$feature_columns$dense columns} as inputs, so to view the result
+`tf.feature_column.input_layer` function. This function only accepts
+[dense columns](../guide/feature_columns.md) as inputs, so to view the result
 of a categorical column you must wrap it in an
-@{tf.feature_column.indicator_column}. For example:
+`tf.feature_column.indicator_column`. For example:
 
 ``` python
 features = {
@@ -422,9 +422,9 @@
 Running the `inputs` tensor will parse the `features` into a batch of vectors.
 
 Feature columns can have internal state, like layers, so they often need to be
-initialized. Categorical columns use @{tf.contrib.lookup$lookup tables}
+initialized. Categorical columns use lookup tables (`tf.contrib.lookup`)
 internally and these require a separate initialization op,
-@{tf.tables_initializer}.
+`tf.tables_initializer`.
 
 ``` python
 var_init = tf.global_variables_initializer()
@@ -501,7 +501,7 @@
 square error, a standard loss for regression problems.
 
 While you could do this manually with lower level math operations,
-the @{tf.losses} module provides a set of common loss functions. You can use it
+the `tf.losses` module provides a set of common loss functions. You can use it
 to calculate the mean square error as follows:
 
 ``` python
@@ -520,10 +520,10 @@
 TensorFlow provides
 [**optimizers**](https://developers.google.com/machine-learning/glossary/#optimizer)
 implementing standard optimization algorithms. These are implemented as
-sub-classes of @{tf.train.Optimizer}. They incrementally change each
+sub-classes of `tf.train.Optimizer`. They incrementally change each
 variable in order to minimize the loss. The simplest optimization algorithm is
 [**gradient descent**](https://developers.google.com/machine-learning/glossary/#gradient_descent),
-implemented by @{tf.train.GradientDescentOptimizer}. It modifies each
+implemented by `tf.train.GradientDescentOptimizer`. It modifies each
 variable according to the magnitude of the derivative of loss with respect to
 that variable. For example:
 
@@ -589,7 +589,7 @@
 
 To learn more about building models with TensorFlow consider the following:
 
-* @{$custom_estimators$Custom Estimators}, to learn how to build
+* [Custom Estimators](../guide/custom_estimators.md), to learn how to build
   customized models with TensorFlow. Your knowledge of TensorFlow Core will
   help you understand and debug your own models.
 
@@ -597,8 +597,8 @@
 following documents, which go into more depth on many of the topics discussed
 here:
 
-* @{$graphs}
-* @{$tensors}
-* @{$variables}
+* [Graphs and Sessions](../guide/graphs.md)
+* [Tensors](../guide/tensors.md)
+* [Variables](../guide/variables.md)
 
 
diff --git a/tensorflow/docs_src/guide/premade_estimators.md b/tensorflow/docs_src/guide/premade_estimators.md
index 3e910c1..a170305 100644
--- a/tensorflow/docs_src/guide/premade_estimators.md
+++ b/tensorflow/docs_src/guide/premade_estimators.md
@@ -8,7 +8,7 @@
 Prior to using the sample code in this document, you'll need to do the
 following:
 
-* @{$install$Install TensorFlow}.
+* [Install TensorFlow](../install/index.md).
 * If you installed TensorFlow with virtualenv or Anaconda, activate your
   TensorFlow environment.
 * Install or upgrade pandas by issuing the following command:
@@ -78,10 +78,10 @@
 
 We strongly recommend writing TensorFlow programs with the following APIs:
 
-* @{$guide/estimators$Estimators}, which represent a complete model.
+* [Estimators](../guide/estimators.md), which represent a complete model.
   The Estimator API provides methods to train the model, to judge the model's
   accuracy, and to generate predictions.
-* @{$guide/datasets_for_estimators}, which build a data input
+* [Datasets for Estimators](../guide/datasets_for_estimators.md), which build a data input
   pipeline. The Dataset API has methods to load and manipulate data, and feed
   it into your model. The Dataset API meshes well with the Estimators API.
 
@@ -173,14 +173,14 @@
 An Estimator is TensorFlow's high-level representation of a complete model. It
 handles the details of initialization, logging, saving and restoring, and many
 other features so you can concentrate on your model. For more details see
-@{$guide/estimators}.
+[Estimators](../guide/estimators.md).
 
-An Estimator is any class derived from @{tf.estimator.Estimator}. TensorFlow
+An Estimator is any class derived from `tf.estimator.Estimator`. TensorFlow
 provides a collection of
-@{tf.estimator$pre-made Estimators}
+pre-made Estimators in the `tf.estimator` module
 (for example, `LinearRegressor`) to implement common ML algorithms. Beyond
 those, you may write your own
-@{$custom_estimators$custom Estimators}.
+[custom Estimators](../guide/custom_estimators.md).
 We recommend using pre-made Estimators when just getting started.
 
 To write a TensorFlow program based on pre-made Estimators, you must perform the
@@ -200,7 +200,7 @@
 You must create input functions to supply data for training,
 evaluating, and prediction.
 
-An **input function** is a function that returns a @{tf.data.Dataset} object
+An **input function** is a function that returns a `tf.data.Dataset` object
 which outputs the following two-element tuple:
 
 * [`features`](https://developers.google.com/machine-learning/glossary/#feature) - A Python dictionary in which:
@@ -271,7 +271,7 @@
 is an object describing how the model should use raw input data from the
 features dictionary. When you build an Estimator model, you pass it a list of
 feature columns that describes each of the features you want the model to use.
-The @{tf.feature_column} module provides many options for representing data
+The `tf.feature_column` module provides many options for representing data
 to the model.
 
 For Iris, the 4 raw features are numeric values, so we'll build a list of
@@ -287,7 +287,7 @@
 ```
 
 Feature columns can be far more sophisticated than those we're showing here.  We
-detail feature columns @{$feature_columns$later on} in our Getting
+detail feature columns [later on](../guide/feature_columns.md) in our Getting
 Started guide.
 
 Now that we have the description of how we want the model to represent the raw
@@ -299,10 +299,10 @@
 The Iris problem is a classic classification problem. Fortunately, TensorFlow
 provides several pre-made classifier Estimators, including:
 
-* @{tf.estimator.DNNClassifier} for deep models that perform multi-class
+* `tf.estimator.DNNClassifier` for deep models that perform multi-class
   classification.
-* @{tf.estimator.DNNLinearCombinedClassifier} for wide & deep models.
-* @{tf.estimator.LinearClassifier} for classifiers based on linear models.
+* `tf.estimator.DNNLinearCombinedClassifier` for wide & deep models.
+* `tf.estimator.LinearClassifier` for classifiers based on linear models.
 
 For the Iris problem, `tf.estimator.DNNClassifier` seems like the best choice.
 Here's how we instantiated this Estimator:
@@ -423,8 +423,8 @@
 Now that you've gotten started writing TensorFlow programs, consider the
 following material:
 
-* @{$checkpoints$Checkpoints} to learn how to save and restore models.
-* @{$guide/datasets_for_estimators} to learn more about importing
+* [Checkpoints](../guide/checkpoints.md) to learn how to save and restore models.
+* [Datasets for Estimators](../guide/datasets_for_estimators.md) to learn more about importing
   data into your model.
-* @{$custom_estimators$Creating Custom Estimators} to learn how to
+* [Creating Custom Estimators](../guide/custom_estimators.md) to learn how to
   write your own Estimator, customized for a particular problem.
diff --git a/tensorflow/docs_src/guide/saved_model.md b/tensorflow/docs_src/guide/saved_model.md
index 717488e..6c967fd 100644
--- a/tensorflow/docs_src/guide/saved_model.md
+++ b/tensorflow/docs_src/guide/saved_model.md
@@ -1,13 +1,13 @@
 # Save and Restore
 
-The @{tf.train.Saver} class provides methods to save and restore models. The
-@{tf.saved_model.simple_save} function is an easy way to build a
-@{tf.saved_model$saved model} suitable for serving. [Estimators](./estimators)
+The `tf.train.Saver` class provides methods to save and restore models. The
+`tf.saved_model.simple_save` function is an easy way to build a
+saved model (see `tf.saved_model`) suitable for serving. [Estimators](./estimators)
 automatically save and restore variables in the `model_dir`.
 
 ## Save and restore variables
 
-TensorFlow @{$variables} are the best way to represent shared, persistent state
+TensorFlow [Variables](../guide/variables.md) are the best way to represent shared, persistent state
 manipulated by your program. The `tf.train.Saver` constructor adds `save` and
 `restore` ops to the graph for all, or a specified list, of the variables in the
 graph.  The `Saver` object provides methods to run these ops, specifying paths
@@ -145,13 +145,13 @@
 
 *  If you only restore a subset of the model variables at the start of a
    session, you have to run an initialize op for the other variables.  See
-   @{tf.variables_initializer} for more information.
+   `tf.variables_initializer` for more information.
 
 *  To inspect the variables in a checkpoint, you can use the
    [`inspect_checkpoint`](https://www.tensorflow.org/code/tensorflow/python/tools/inspect_checkpoint.py)
    library, particularly the `print_tensors_in_checkpoint_file` function.
 
-*  By default, `Saver` uses the value of the @{tf.Variable.name} property
+*  By default, `Saver` uses the value of the `tf.Variable.name` property
    for each variable.  However, when you create a `Saver` object, you may
    optionally choose names for the variables in the checkpoint files.
 
@@ -196,15 +196,15 @@
 graph's metadata. This is a language-neutral, recoverable, hermetic
 serialization format that enables higher-level systems and tools to produce,
 consume, and transform TensorFlow models. TensorFlow provides several ways to
-interact with `SavedModel`, including the @{tf.saved_model} APIs,
-@{tf.estimator.Estimator}, and a command-line interface.
+interact with `SavedModel`, including the `tf.saved_model` APIs,
+`tf.estimator.Estimator`, and a command-line interface.
 
 
 ## Build and load a SavedModel
 
 ### Simple save
 
-The easiest way to create a `SavedModel` is to use the @{tf.saved_model.simple_save}
+The easiest way to create a `SavedModel` is to use the `tf.saved_model.simple_save`
 function:
 
 ```python
@@ -218,14 +218,14 @@
 [TensorFlow serving](/serving/serving_basic) and supports the
 [Predict API](https://github.com/tensorflow/serving/blob/master/tensorflow_serving/apis/predict.proto).
 To access the classify, regress, or multi-inference APIs, use the manual
-`SavedModel` builder APIs or an @{tf.estimator.Estimator}.
+`SavedModel` builder APIs or a `tf.estimator.Estimator`.
 
 ### Manually build a SavedModel
 
-If your use case isn't covered by @{tf.saved_model.simple_save}, use the manual
-@{tf.saved_model.builder$builder APIs} to create a `SavedModel`.
+If your use case isn't covered by `tf.saved_model.simple_save`, use the manual
+`tf.saved_model.builder` APIs to create a `SavedModel`.
 
-The @{tf.saved_model.builder.SavedModelBuilder} class provides functionality to
+The `tf.saved_model.builder.SavedModelBuilder` class provides functionality to
 save multiple `MetaGraphDef`s.  A **MetaGraph** is a dataflow graph, plus
 its associated variables, assets, and signatures.  A **`MetaGraphDef`**
 is the protocol buffer representation of a MetaGraph.  A **signature** is
@@ -272,16 +272,16 @@
 Following the guidance below gives you forward compatibility only if the set of
 Ops has not changed.
 
-The @{tf.saved_model.builder.SavedModelBuilder$`SavedModelBuilder`} class allows
+The `tf.saved_model.builder.SavedModelBuilder` class allows
 users to control whether default-valued attributes must be stripped from the
-@{$extend/tool_developers#nodes$`NodeDefs`}
+[`NodeDefs`](../extend/tool_developers/index.md#nodes)
 while adding a meta graph to the SavedModel bundle. Both
-@{tf.saved_model.builder.SavedModelBuilder.add_meta_graph_and_variables$`SavedModelBuilder.add_meta_graph_and_variables`}
-and @{tf.saved_model.builder.SavedModelBuilder.add_meta_graph$`SavedModelBuilder.add_meta_graph`}
+`tf.saved_model.builder.SavedModelBuilder.add_meta_graph_and_variables`
+and `tf.saved_model.builder.SavedModelBuilder.add_meta_graph`
 methods accept a Boolean flag `strip_default_attrs` that controls this behavior.
 
-If `strip_default_attrs` is `False`, the exported @{tf.MetaGraphDef} will have
-the default valued attributes in all its @{tf.NodeDef} instances.
+If `strip_default_attrs` is `False`, the exported `tf.MetaGraphDef` will have
+the default valued attributes in all its `tf.NodeDef` instances.
 This can break forward compatibility with a sequence of events such as the
 following:
 
@@ -304,7 +304,7 @@
 ### Loading a SavedModel in Python
 
 The Python version of the SavedModel
-@{tf.saved_model.loader$loader}
+loader, `tf.saved_model.loader`,
 provides load and restore capability for a SavedModel. The `load` operation
 requires the following information:
 
@@ -413,7 +413,7 @@
 
 ### Prepare serving inputs
 
-During training, an @{$premade_estimators#input_fn$`input_fn()`} ingests data
+During training, an [`input_fn()`](../guide/premade_estimators.md#input_fn) ingests data
 and prepares it for use by the model.  At serving time, similarly, a
 `serving_input_receiver_fn()` accepts inference requests and prepares them for
 the model.  This function has the following purposes:
@@ -423,20 +423,20 @@
 *  To add any additional ops needed to convert data from the input format
    into the feature `Tensor`s expected by the model.
 
-The function returns a @{tf.estimator.export.ServingInputReceiver} object,
+The function returns a `tf.estimator.export.ServingInputReceiver` object,
 which packages the placeholders and the resulting feature `Tensor`s together.
 
 A typical pattern is that inference requests arrive in the form of serialized
 `tf.Example`s, so the `serving_input_receiver_fn()` creates a single string
 placeholder to receive them.  The `serving_input_receiver_fn()` is then also
-responsible for parsing the `tf.Example`s by adding a @{tf.parse_example} op to
+responsible for parsing the `tf.Example`s by adding a `tf.parse_example` op to
 the graph.
 
 When writing such a `serving_input_receiver_fn()`, you must pass a parsing
-specification to @{tf.parse_example} to tell the parser what feature names to
+specification to `tf.parse_example` to tell the parser what feature names to
 expect and how to map them to `Tensor`s. A parsing specification takes the
-form of a dict from feature names to @{tf.FixedLenFeature}, @{tf.VarLenFeature},
-and @{tf.SparseFeature}.  Note this parsing specification should not include
+form of a dict from feature names to `tf.FixedLenFeature`, `tf.VarLenFeature`,
+and `tf.SparseFeature`.  Note this parsing specification should not include
 any label or weight columns, since those will not be available at serving
 time&mdash;in contrast to a parsing specification used in the `input_fn()` at
 training time.
@@ -457,7 +457,7 @@
   return tf.estimator.export.ServingInputReceiver(features, receiver_tensors)
 ```
 
-The @{tf.estimator.export.build_parsing_serving_input_receiver_fn} utility
+The `tf.estimator.export.build_parsing_serving_input_receiver_fn` utility
 function provides that input receiver for the common case.
 
 > Note: when training a model to be served using the Predict API with a local
@@ -468,7 +468,7 @@
 serving system will feed feature `Tensor`s directly&mdash;you must still provide
 a `serving_input_receiver_fn()` that creates placeholders for the feature
 `Tensor`s and passes them through.  The
-@{tf.estimator.export.build_raw_serving_input_receiver_fn} utility provides for
+`tf.estimator.export.build_raw_serving_input_receiver_fn` utility provides for
 this.
 
 If these utilities do not meet your needs, you are free to write your own
@@ -488,7 +488,7 @@
 ### Specify the outputs of a custom model
 
 When writing a custom `model_fn`, you must populate the `export_outputs` element
-of the @{tf.estimator.EstimatorSpec} return value. This is a dict of
+of the `tf.estimator.EstimatorSpec` return value. This is a dict of
 `{name: output}` describing the output signatures to be exported and used during
 serving.
 
@@ -498,9 +498,9 @@
 of your choice that can be used to request a specific head at serving time.
 
 Each `output` value must be an `ExportOutput` object  such as
-@{tf.estimator.export.ClassificationOutput},
-@{tf.estimator.export.RegressionOutput}, or
-@{tf.estimator.export.PredictOutput}.
+`tf.estimator.export.ClassificationOutput`,
+`tf.estimator.export.RegressionOutput`, or
+`tf.estimator.export.PredictOutput`.
 
 These output types map straightforwardly to the
 [TensorFlow Serving APIs](https://github.com/tensorflow/serving/blob/master/tensorflow_serving/apis/prediction_service.proto),
@@ -520,7 +520,7 @@
 ### Perform the export
 
 To export your trained Estimator, call
-@{tf.estimator.Estimator.export_savedmodel} with the export base path and
+`tf.estimator.Estimator.export_savedmodel` with the export base path and
 the `serving_input_receiver_fn`.
 
 ```py
@@ -616,7 +616,7 @@
 The returned result in this example is a `ClassificationResponse` protocol
 buffer.
 
-This is a skeletal example; please see the @{$deploy$Tensorflow Serving}
+This is a skeletal example; please see the [TensorFlow Serving](../deploy/index.md)
 documentation and [examples](https://github.com/tensorflow/serving/tree/master/tensorflow_serving/example)
 for more details.
 
@@ -647,7 +647,7 @@
 execute a SavedModel.
 For example, you can use the CLI to inspect the model's `SignatureDef`s.
 The CLI enables you to quickly confirm that the input
-@{$tensors$Tensor dtype and shape} match the model. Moreover, if you
+[Tensor dtype and shape](../guide/tensors.md) match the model. Moreover, if you
 want to test your model, you can use the CLI to do a sanity check by
 passing in sample inputs in various formats (for example, Python
 expressions) and then fetching the output.
diff --git a/tensorflow/docs_src/guide/summaries_and_tensorboard.md b/tensorflow/docs_src/guide/summaries_and_tensorboard.md
index fadfa03..788c556 100644
--- a/tensorflow/docs_src/guide/summaries_and_tensorboard.md
+++ b/tensorflow/docs_src/guide/summaries_and_tensorboard.md
@@ -36,12 +36,12 @@
 
 First, create the TensorFlow graph that you'd like to collect summary
 data from, and decide which nodes you would like to annotate with
-@{$python/summary$summary operations}.
+[summary operations](../api_guides/python/summary.md).
 
 For example, suppose you are training a convolutional neural network for
 recognizing MNIST digits. You'd like to record how the learning rate
 varies over time, and how the objective function is changing. Collect these by
-attaching @{tf.summary.scalar} ops
+attaching `tf.summary.scalar` ops
 to the nodes that output the learning rate and loss respectively. Then, give
 each `scalar_summary` a meaningful `tag`, like `'learning rate'` or `'loss
 function'`.
@@ -49,24 +49,24 @@
 Perhaps you'd also like to visualize the distributions of activations coming
 off a particular layer, or the distribution of gradients or weights. Collect
 this data by attaching
-@{tf.summary.histogram} ops to
+`tf.summary.histogram` ops to
 the gradient outputs and to the variable that holds your weights, respectively.
 
 For details on all of the summary operations available, check out the docs on
-@{$python/summary$summary operations}.
+[summary operations](../api_guides/python/summary.md).
 
 Operations in TensorFlow don't do anything until you run them, or an op that
 depends on their output. And the summary nodes that we've just created are
 peripheral to your graph: none of the ops you are currently running depend on
 them. So, to generate summaries, we need to run all of these summary nodes.
 Managing them by hand would be tedious, so use
-@{tf.summary.merge_all}
+`tf.summary.merge_all`
 to combine them into a single op that generates all the summary data.
 
 Then, you can just run the merged summary op, which will generate a serialized
 `Summary` protobuf object with all of your summary data at a given step.
 Finally, to write this summary data to disk, pass the summary protobuf to a
-@{tf.summary.FileWriter}.
+`tf.summary.FileWriter`.
 
 The `FileWriter` takes a logdir in its constructor - this logdir is quite
 important, it's the directory where all of the events will be written out.
@@ -74,7 +74,7 @@
 If it receives a `Graph` object, then TensorBoard will visualize your graph
 along with tensor shape information. This will give you a much better sense of
 what flows through the graph: see
-@{$graph_viz#tensor-shape-information$Tensor shape information}.
+[Tensor shape information](../guide/graph_viz.md#tensor-shape-information).
 
 Now that you've modified your graph and have a `FileWriter`, you're ready to
 start running your network! If you want, you could run the merged summary op
@@ -219,7 +219,7 @@
 corner. Each tab represents a set of serialized data that can be visualized.
 
 For in depth information on how to use the *graph* tab to visualize your graph,
-see @{$graph_viz$TensorBoard: Graph Visualization}.
+see [TensorBoard: Graph Visualization](../guide/graph_viz.md).
 
 For more usage information on TensorBoard in general, see the
 [TensorBoard GitHub](https://github.com/tensorflow/tensorboard).
diff --git a/tensorflow/docs_src/guide/tensors.md b/tensorflow/docs_src/guide/tensors.md
index 7227260..4f0ddb2 100644
--- a/tensorflow/docs_src/guide/tensors.md
+++ b/tensorflow/docs_src/guide/tensors.md
@@ -176,7 +176,7 @@
 n | [D0, D1, ... Dn-1] | n-D | A tensor with shape [D0, D1, ... Dn-1].
 
 Shapes can be represented via Python lists / tuples of ints, or with the
-@{tf.TensorShape}.
+`tf.TensorShape`.
 
 ### Getting a `tf.Tensor` object's shape
 
@@ -298,7 +298,7 @@
 ## Printing Tensors
 
 For debugging purposes you might want to print the value of a `tf.Tensor`. While
- @{$debugger$tfdbg} provides advanced debugging support, TensorFlow also has an
+ [tfdbg](../guide/debugger.md) provides advanced debugging support, TensorFlow also has an
  operation to directly print the value of a `tf.Tensor`.
 
 Note that you rarely want to use the following pattern when printing a
diff --git a/tensorflow/docs_src/guide/using_gpu.md b/tensorflow/docs_src/guide/using_gpu.md
index c0218fd..8cb9b35 100644
--- a/tensorflow/docs_src/guide/using_gpu.md
+++ b/tensorflow/docs_src/guide/using_gpu.md
@@ -211,5 +211,5 @@
  [  98.  128.]]
 ```
 
-The @{$deep_cnn$cifar10 tutorial} is a good example
+The [cifar10 tutorial](../tutorials/images/deep_cnn.md) is a good example
 demonstrating how to do training with multiple GPUs.
diff --git a/tensorflow/docs_src/guide/using_tpu.md b/tensorflow/docs_src/guide/using_tpu.md
index 41d80d9..59b34e1 100644
--- a/tensorflow/docs_src/guide/using_tpu.md
+++ b/tensorflow/docs_src/guide/using_tpu.md
@@ -17,13 +17,13 @@
 
 ## TPUEstimator
 
-@{tf.estimator.Estimator$Estimators} are TensorFlow's model-level abstraction.
+Estimators (`tf.estimator.Estimator`) are TensorFlow's model-level abstraction.
 Standard `Estimators` can drive models on CPU and GPUs. You must use
-@{tf.contrib.tpu.TPUEstimator} to drive a model on TPUs.
+`tf.contrib.tpu.TPUEstimator` to drive a model on TPUs.
 
 Refer to TensorFlow's Getting Started section for an introduction to the basics
-of using a @{$premade_estimators$pre-made `Estimator`}, and
-@{$custom_estimators$custom `Estimator`s}.
+of using a [pre-made `Estimator`](../guide/premade_estimators.md), and
+[custom `Estimator`s](../guide/custom_estimators.md).
 
 The `TPUEstimator` class differs somewhat from the `Estimator` class.
 
@@ -44,10 +44,10 @@
   model_fn=my_model_fn)
 ```
 
-The changes required to use a @{tf.contrib.tpu.TPUEstimator} on your local
+The changes required to use a `tf.contrib.tpu.TPUEstimator` on your local
 machine are relatively minor. The constructor requires two additional arguments.
 You should set the `use_tpu` argument to `False`, and pass a
-@{tf.contrib.tpu.RunConfig} as the `config` argument, as shown below:
+`tf.contrib.tpu.RunConfig` as the `config` argument, as shown below:
 
 ``` python
 my_tpu_estimator = tf.contrib.tpu.TPUEstimator(
@@ -117,7 +117,7 @@
 )
 ```
 
-Then you must pass the @{tf.contrib.tpu.RunConfig} to the constructor:
+Then you must pass the `tf.contrib.tpu.RunConfig` to the constructor:
 
 ``` python
 my_tpu_estimator = tf.contrib.tpu.TPUEstimator(
@@ -137,7 +137,7 @@
 ## Optimizer
 
 When training on a cloud TPU you **must** wrap the optimizer in a
-@{tf.contrib.tpu.CrossShardOptimizer}, which uses an `allreduce` to aggregate
+`tf.contrib.tpu.CrossShardOptimizer`, which uses an `allreduce` to aggregate
 gradients and broadcast the result to each shard (each TPU core).
 
 The `CrossShardOptimizer` is not compatible with local training. So, to have
@@ -171,9 +171,9 @@
 During regular usage TensorFlow attempts to determine the shapes of each
 `tf.Tensor` during graph construction. During execution any unknown shape
 dimensions are determined dynamically,
-see @{$guide/tensors#shape$Tensor Shapes} for more details.
+see [Tensor Shapes](../guide/tensors.md#shape) for more details.
 
-To run on Cloud TPUs TensorFlow models are compiled using @{$xla$XLA}.
+To run on Cloud TPUs TensorFlow models are compiled using [XLA](../performance/xla/index.md).
 XLA uses a similar system for determining shapes at compile time. XLA requires
 that all tensor dimensions be statically defined at compile time. All shapes
 must evaluate to a constant, and not depend on external data, or stateful
@@ -184,7 +184,7 @@
 
 Remove any use of `tf.summary` from your model.
 
-@{$summaries_and_tensorboard$TensorBoard summaries} are a great way see inside
+[TensorBoard summaries](../guide/summaries_and_tensorboard.md) are a great way to see inside
 your model. A minimal set of basic summaries are automatically recorded by the
 `TPUEstimator`, to `event` files in the `model_dir`. Custom summaries, however,
 are currently unsupported when training on a Cloud TPU. So while the
@@ -200,7 +200,7 @@
 Evaluation metrics are an essential part of training a model. These are fully
 supported on Cloud TPUs, but with a slightly different syntax.
 
-A standard @{tf.metrics} returns two tensors. The first returns the running
+A standard `tf.metrics` returns two tensors. The first returns the running
 average of the metric value, while the second updates the running average and
 returns the value for this batch:
 
@@ -242,15 +242,15 @@
 is a simple structure of named fields containing all the `tf.Tensors` of the
 model that the `Estimator` may need to interact with.
 
-`TPUEstimators` use a @{tf.contrib.tpu.TPUEstimatorSpec}. There are a few
-differences between it and a standard @{tf.estimator.EstimatorSpec}:
+`TPUEstimators` use a `tf.contrib.tpu.TPUEstimatorSpec`. There are a few
+differences between it and a standard `tf.estimator.EstimatorSpec`:
 
 
 *  The `eval_metric_ops` must be wrapped into a `metrics_fn`, this field is
    renamed `eval_metrics` ([see above](#metrics)).
-*  The @{tf.train.SessionRunHook$hooks} are unsupported, so these fields are
+*  `tf.train.SessionRunHook` hooks are unsupported, so these fields are
    omitted.
-*  The @{tf.train.Scaffold$`scaffold`}, if used, must also be wrapped in a
+*  The `tf.train.Scaffold`, if used, must also be wrapped in a
    function. This field is renamed to `scaffold_fn`.
 
 `Scaffold` and `Hooks` are for advanced usage, and can typically be omitted.
@@ -304,7 +304,7 @@
 A typical input pipeline, using `tf.data`, will usually produce batches of a
 fixed size. The last batch of a finite `Dataset`, however, is typically smaller,
 containing just the remaining elements. Since a `Dataset` does not know its own
-length or finiteness, the standard @{tf.data.Dataset.batch$`batch`} method
+length or finiteness, the standard `tf.data.Dataset.batch` method
 cannot determine if all batches will have a fixed size batch on its own:
 
 ```
@@ -317,7 +317,7 @@
 ```
 
 The most straightforward fix is to
-@{tf.data.Dataset.apply$apply} @{tf.contrib.data.batch_and_drop_remainder}
+apply `tf.contrib.data.batch_and_drop_remainder` via `tf.data.Dataset.apply`
 as follows:
 
 ```
@@ -343,25 +343,25 @@
 
 Efficient use of the `tf.data.Dataset` API is critical when using a Cloud
 TPU, as it is impossible to use the Cloud TPU's unless you can feed it data
-quickly enough. See @{$datasets_performance} for details on dataset performance.
+quickly enough. See [Input Pipeline Performance Guide](../performance/datasets_performance.md) for details on dataset performance.
 
 For all but the simplest experimentation (using
-@{tf.data.Dataset.from_tensor_slices} or other in-graph data) you will need to
+`tf.data.Dataset.from_tensor_slices` or other in-graph data) you will need to
 store all data files read by the `TPUEstimator`'s `Dataset` in Google Cloud
 Storage Buckets.
 
 <!--TODO(markdaoust): link to the `TFRecord` doc when it exists.-->
 
 For most use-cases, we recommend converting your data into `TFRecord`
-format and using a @{tf.data.TFRecordDataset} to read it. This, however, is not
+format and using a `tf.data.TFRecordDataset` to read it. This, however, is not
 a hard requirement and you can use other dataset readers
 (`FixedLengthRecordDataset` or `TextLineDataset`) if you prefer.
 
 Small datasets can be loaded entirely into memory using
-@{tf.data.Dataset.cache}.
+`tf.data.Dataset.cache`.
 
 Regardless of the data format used, it is strongly recommended that you
-@{$performance_guide#use_large_files$use large files}, on the order of
+[use large files](../performance/performance_guide.md#use_large_files), on the order of
 100MB. This is especially important in this networked setting as the overhead
 of opening a file is significantly higher.
 
@@ -391,5 +391,5 @@
 
 For more information about tuning TensorFlow code for performance see:
 
- * The @{$performance$Performance Section.}
+ * The [Performance Section.](../performance/index.md)
 
diff --git a/tensorflow/docs_src/guide/variables.md b/tensorflow/docs_src/guide/variables.md
index cd8c4b5..5d5d733 100644
--- a/tensorflow/docs_src/guide/variables.md
+++ b/tensorflow/docs_src/guide/variables.md
@@ -119,7 +119,7 @@
 distributed settings. Accidentally putting variables on workers instead of
 parameter servers, for example, can severely slow down training or, in the worst
 case, let each worker blithely forge ahead with its own independent copy of each
-variable. For this reason we provide @{tf.train.replica_device_setter}, which
+variable. For this reason we provide `tf.train.replica_device_setter`, which
 can automatically place variables in parameter servers. For example:
 
 ``` python
@@ -211,7 +211,7 @@
 
 Most TensorFlow optimizers have specialized ops that efficiently update the
 values of variables according to some gradient descent-like algorithm. See
-@{tf.train.Optimizer} for an explanation of how to use optimizers.
+`tf.train.Optimizer` for an explanation of how to use optimizers.
 
 Because variables are mutable it's sometimes useful to know what version of a
 variable's value is being used at any point in time. To force a re-read of the
diff --git a/tensorflow/docs_src/guide/version_compat.md b/tensorflow/docs_src/guide/version_compat.md
index d2e5e41..882f2a3 100644
--- a/tensorflow/docs_src/guide/version_compat.md
+++ b/tensorflow/docs_src/guide/version_compat.md
@@ -66,7 +66,7 @@
 Some API functions are explicitly marked as "experimental" and can change in
 backward incompatible ways between minor releases. These include:
 
-*   **Experimental APIs**: The @{tf.contrib} module and its submodules in Python
+*   **Experimental APIs**: The `tf.contrib` module and its submodules in Python
     and any functions in the C API or fields in protocol buffers that are
     explicitly commented as being experimental. In particular, any field in a
     protocol buffer which is called "experimental" and all its fields and
@@ -75,10 +75,11 @@
 *   **Other languages**: TensorFlow APIs in languages other than Python and C,
     such as:
 
-  - @{$cc/guide$C++} (exposed through header files in
+  - [C++](../api_guides/cc/guide.md) (exposed through header files in
     [`tensorflow/cc`](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/cc)).
   - [Java](../api_docs/java/reference/org/tensorflow/package-summary),
   - [Go](https://godoc.org/github.com/tensorflow/tensorflow/tensorflow/go)
+  - [JavaScript](https://js.tensorflow.org)
 
 *   **Details of composite ops:** Many public functions in Python expand to
     several primitive ops in the graph, and these details will be part of any
@@ -97,7 +98,7 @@
     accuracy for the overall system.
 
 *   **Random numbers:** The specific random numbers computed by the
-    @{$python/constant_op#Random_Tensors$random ops} may change at any time.
+    [random ops](../api_guides/python/constant_op.md#Random_Tensors) may change at any time.
     Users should rely only on approximately correct distributions and
     statistical strength, not the specific bits computed. However, we will make
     changes to random bits rarely (or perhaps never) for patch releases.  We
@@ -174,6 +175,8 @@
 format, such as when adding ops, removing ops, or changing the functionality
 of existing ops.  The previous section should suffice for most users.
 
+<a id="backward_forward"/>
+
 ### Backward and partial forward compatibility
 
 Our versioning scheme has three requirements:
@@ -252,13 +255,13 @@
 
 1. If forward compatibility is desired,  set `strip_default_attrs` to `True`
    while exporting the model using either the
-   @{tf.saved_model.builder.SavedModelBuilder.add_meta_graph_and_variables$`add_meta_graph_and_variables`}
-   and @{tf.saved_model.builder.SavedModelBuilder.add_meta_graph$`add_meta_graph`}
+   `tf.saved_model.builder.SavedModelBuilder.add_meta_graph_and_variables`
+   and `tf.saved_model.builder.SavedModelBuilder.add_meta_graph`
    methods of the `SavedModelBuilder` class, or
-   @{tf.estimator.Estimator.export_savedmodel$`Estimator.export_savedmodel`}
+   `tf.estimator.Estimator.export_savedmodel`
 2. This strips off the default valued attributes at the time of
    producing/exporting the models. This makes sure that the exported
-   @{tf.MetaGraphDef} does not contain the new op-attribute when the default
+   `tf.MetaGraphDef` does not contain the new op-attribute when the default
    value is used.
 3. Having this control could allow out-of-date consumers (for example, serving
    binaries that lag behind training binaries) to continue loading the models
diff --git a/tensorflow/docs_src/install/index.md b/tensorflow/docs_src/install/index.md
index 55481cc..76e590e 100644
--- a/tensorflow/docs_src/install/index.md
+++ b/tensorflow/docs_src/install/index.md
@@ -17,23 +17,23 @@
 The following guides explain how to install a version of TensorFlow
 that enables you to write applications in Python:
 
-  * @{$install_linux$Install TensorFlow on Ubuntu}
-  * @{$install_mac$Install TensorFlow on macOS}
-  * @{$install_windows$Install TensorFlow on Windows}
-  * @{$install_raspbian$Install TensorFlow on a Raspberry Pi}
-  * @{$install_sources$Install TensorFlow from source code}
+  * [Install TensorFlow on Ubuntu](../install/install_linux.md)
+  * [Install TensorFlow on macOS](../install/install_mac.md)
+  * [Install TensorFlow on Windows](../install/install_windows.md)
+  * [Install TensorFlow on a Raspberry Pi](../install/install_raspbian.md)
+  * [Install TensorFlow from source code](../install/install_sources.md)
 
 Many aspects of the Python TensorFlow API changed from version 0.n to 1.0.
 The following guide explains how to migrate older TensorFlow applications
 to Version 1.0:
 
-  * @{$migration$Transition to TensorFlow 1.0}
+  * [Transition to TensorFlow 1.0](../install/migration.md)
 
 The following guides explain how to install TensorFlow libraries for use in
 other programming languages. These APIs are aimed at deploying TensorFlow
 models in applications and are not as extensive as the Python APIs.
 
-  * @{$install_java$Install TensorFlow for Java}
-  * @{$install_c$Install TensorFlow for C}
-  * @{$install_go$Install TensorFlow for Go}
+  * [Install TensorFlow for Java](../install/install_java.md)
+  * [Install TensorFlow for C](../install/install_c.md)
+  * [Install TensorFlow for Go](../install/install_go.md)
 
diff --git a/tensorflow/docs_src/install/install_c.md b/tensorflow/docs_src/install/install_c.md
index 5e26fac..084634b 100644
--- a/tensorflow/docs_src/install/install_c.md
+++ b/tensorflow/docs_src/install/install_c.md
@@ -28,8 +28,8 @@
      entitled "Determine which TensorFlow to install" in one of the
      following guides:
 
-       * @{$install_linux#determine_which_tensorflow_to_install$Installing TensorFlow on Linux}
-       * @{$install_mac#determine_which_tensorflow_to_install$Installing TensorFlow on macOS}
+       * [Installing TensorFlow on Linux](../install/install_linux.md#determine_which_tensorflow_to_install)
+       * [Installing TensorFlow on macOS](../install/install_mac.md#determine_which_tensorflow_to_install)
 
   2. Download and extract the TensorFlow C library into `/usr/local/lib` by
      invoking the following shell commands:
@@ -38,7 +38,7 @@
          OS="linux" # Change to "darwin" for macOS
          TARGET_DIRECTORY="/usr/local"
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.10.0-rc1.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.10.0.tar.gz" |
            sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_go.md b/tensorflow/docs_src/install/install_go.md
index a59c274..0c604d7 100644
--- a/tensorflow/docs_src/install/install_go.md
+++ b/tensorflow/docs_src/install/install_go.md
@@ -6,7 +6,7 @@
 [TensorFlow Go package](https://godoc.org/github.com/tensorflow/tensorflow/tensorflow/go).
 
 Warning: The TensorFlow Go API is *not* covered by the TensorFlow
-[API stability guarantees](../guide/version_semantics.md).
+[API stability guarantees](../guide/version_compat.md).
 
 
 ## Supported Platforms
@@ -29,8 +29,8 @@
      the help of GPU(s). To help you decide, read the section entitled
      "Determine which TensorFlow to install" in one of the following guides:
 
-     * @{$install_linux#determine_which_tensorflow_to_install$Installing TensorFlow on Linux}
-     * @{$install_mac#determine_which_tensorflow_to_install$Installing TensorFlow on macOS}
+     * [Installing TensorFlow on Linux](../install/install_linux.md#determine_which_tensorflow_to_install)
+     * [Installing TensorFlow on macOS](../install/install_mac.md#determine_which_tensorflow_to_install)
 
   2. Download and extract the TensorFlow C library into `/usr/local/lib` by
      invoking the following shell commands:
@@ -38,7 +38,7 @@
          TF_TYPE="cpu" # Change to "gpu" for GPU support
          TARGET_DIRECTORY='/usr/local'
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.10.0-rc1.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.10.0.tar.gz" |
          sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_java.md b/tensorflow/docs_src/install/install_java.md
index e9c6650..c411cb7 100644
--- a/tensorflow/docs_src/install/install_java.md
+++ b/tensorflow/docs_src/install/install_java.md
@@ -36,7 +36,7 @@
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>tensorflow</artifactId>
-  <version>1.10.0-rc1</version>
+  <version>1.10.0</version>
 </dependency>
 ```
 
@@ -65,7 +65,7 @@
                <dependency>
                  <groupId>org.tensorflow</groupId>
                  <artifactId>tensorflow</artifactId>
-                 <version>1.10.0-rc1</version>
+                 <version>1.10.0</version>
                </dependency>
              </dependencies>
          </project>
@@ -124,18 +124,18 @@
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>libtensorflow</artifactId>
-  <version>1.10.0-rc1</version>
+  <version>1.10.0</version>
 </dependency>
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>libtensorflow_jni_gpu</artifactId>
-  <version>1.10.0-rc1</version>
+  <version>1.10.0</version>
 </dependency>
 ```
 
 GPU acceleration is available via Maven only for Linux and only if your system
 meets the
-@{$install_linux#determine_which_tensorflow_to_install$requirements for GPU}.
+[requirements for GPU](../install/install_linux.md#determine_which_tensorflow_to_install).
 
 ## Using TensorFlow with JDK
 
@@ -148,15 +148,15 @@
 Take the following steps to install TensorFlow for Java on Linux or macOS:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.10.0-rc1.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.10.0.jar),
      which is the TensorFlow Java Archive (JAR).
 
   2. Decide whether you will run TensorFlow for Java on CPU(s) only or with
      the help of GPU(s). To help you decide, read the section entitled
      "Determine which TensorFlow to install" in one of the following guides:
 
-     * @{$install_linux#determine_which_tensorflow_to_install$Installing TensorFlow on Linux}
-     * @{$install_mac#determine_which_tensorflow_to_install$Installing TensorFlow on macOS}
+     * [Installing TensorFlow on Linux](../install/install_linux.md#determine_which_tensorflow_to_install)
+     * [Installing TensorFlow on macOS](../install/install_mac.md#determine_which_tensorflow_to_install)
 
   3. Download and extract the appropriate Java Native Interface (JNI)
      file for your operating system and processor support by running the
@@ -167,7 +167,7 @@
          OS=$(uname -s | tr '[:upper:]' '[:lower:]')
          mkdir -p ./jni
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.10.0-rc1.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.10.0.tar.gz" |
            tar -xz -C ./jni
 
 ### Install on Windows
@@ -175,10 +175,10 @@
 Take the following steps to install TensorFlow for Java on Windows:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.10.0-rc1.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.10.0.jar),
      which is the TensorFlow Java Archive (JAR).
   2. Download the following Java Native Interface (JNI) file appropriate for
-     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.10.0-rc1.zip).
+     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.10.0.zip).
   3. Extract this .zip file.
 
 __Note__: The native library (`tensorflow_jni.dll`) requires `msvcp140.dll` at runtime, which is included in the [Visual C++ 2015 Redistributable](https://www.microsoft.com/en-us/download/details.aspx?id=48145) package. 
@@ -227,7 +227,7 @@
 downloaded `.jar` in your `classpath` by using the `-cp` compilation flag
 as follows:
 
-<pre><b>javac -cp libtensorflow-1.10.0-rc1.jar HelloTF.java</b></pre>
+<pre><b>javac -cp libtensorflow-1.10.0.jar HelloTF.java</b></pre>
 
 
 ### Running
@@ -241,11 +241,11 @@
 For example, the following command line executes the `HelloTF` program on Linux
 and macOS X:
 
-<pre><b>java -cp libtensorflow-1.10.0-rc1.jar:. -Djava.library.path=./jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.10.0.jar:. -Djava.library.path=./jni HelloTF</b></pre>
 
 And the following command line executes the `HelloTF` program on Windows:
 
-<pre><b>java -cp libtensorflow-1.10.0-rc1.jar;. -Djava.library.path=jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.10.0.jar;. -Djava.library.path=jni HelloTF</b></pre>
 
 If the program prints <tt>Hello from <i>version</i></tt>, you've successfully
 installed TensorFlow for Java and are ready to use the API.  If the program
diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md
index 005ad43..5fcfa4b 100644
--- a/tensorflow/docs_src/install/install_linux.md
+++ b/tensorflow/docs_src/install/install_linux.md
@@ -436,7 +436,7 @@
 
      <pre>
      (tensorflow)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.10.0rc1-cp34-cp34m-linux_x86_64.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.10.0-cp34-cp34m-linux_x86_64.whl</b></pre>
 
 <a name="ValidateYourInstallation"></a>
 
@@ -520,7 +520,7 @@
 
 To use a GPU with CUDA Compute Capability 3.0, or different versions of the
 preceding NVIDIA libraries see
-@{$install_sources$installing TensorFlow from Sources}. If using Ubuntu 16.04
+[installing TensorFlow from Sources](../install/install_sources.md). If using Ubuntu 16.04
 and possibly other Debian based linux distros, `apt-get` can be used with the
 NVIDIA repository to simplify installation.
 
@@ -650,13 +650,13 @@
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.10.0rc1-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.10.0-cp27-none-linux_x86_64.whl
 </pre>
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.10.0rc1-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.10.0-cp27-none-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -667,13 +667,13 @@
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.10.0rc1-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.10.0-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.10.0rc1-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.10.0-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -684,13 +684,13 @@
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.10.0rc1-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.10.0-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.10.0rc1-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.10.0-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -701,13 +701,13 @@
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.10.0rc1-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.10.0-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.10.0rc1-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.10.0-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
diff --git a/tensorflow/docs_src/install/install_mac.md b/tensorflow/docs_src/install/install_mac.md
index 3a8637b..c4d63cc 100644
--- a/tensorflow/docs_src/install/install_mac.md
+++ b/tensorflow/docs_src/install/install_mac.md
@@ -119,7 +119,7 @@
      TensorFlow in the active Virtualenv is as follows:
 
      <pre> $ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.10.0rc1-py3-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.10.0-py3-none-any.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common-installation-problems).
@@ -242,7 +242,7 @@
      issue the following command:
 
      <pre> $ <b>sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.10.0rc1-py3-none-any.whl</b> </pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.10.0-py3-none-any.whl</b> </pre>
 
      If the preceding command fails, see
      [installation problems](#common-installation-problems).
@@ -350,7 +350,7 @@
      TensorFlow for Python 2.7:
 
      <pre> (<i>targetDirectory</i>)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.10.0rc1-py2-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.10.0-py2-none-any.whl</b></pre>
 
 
 <a name="ValidateYourInstallation"></a>
@@ -517,7 +517,7 @@
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.10.0rc1-py2-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.10.0-py2-none-any.whl
 </pre>
 
 
@@ -525,5 +525,5 @@
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.10.0rc1-py3-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.10.0-py3-none-any.whl
 </pre>
diff --git a/tensorflow/docs_src/install/install_raspbian.md b/tensorflow/docs_src/install/install_raspbian.md
index 58a5285..cf6b6b4 100644
--- a/tensorflow/docs_src/install/install_raspbian.md
+++ b/tensorflow/docs_src/install/install_raspbian.md
@@ -60,7 +60,7 @@
 installed yet. To install if for the first time, run:
 
 <pre>$ sudo apt-get install python3-pip # for Python 3.n
-sudo apt-get install python-pip # for Python 2.7</pre>
+$ sudo apt-get install python-pip # for Python 2.7</pre>
 
 You can find more help on installing and upgrading pip in
 [the Raspberry Pi documentation](https://www.raspberrypi.org/documentation/linux/software/python.md).
@@ -78,8 +78,8 @@
 Assuming the prerequisite software is installed on your Pi, install TensorFlow
 by invoking **one** of the following commands:
 
-     <pre> $ <b>pip3 install tensorflow</b>     # Python 3.n
-     $ <b>pip install tensorflow</b>      # Python 2.7</pre>
+<pre>$ <b>pip3 install tensorflow</b>     # Python 3.n
+$ <b>pip install tensorflow</b>      # Python 2.7</pre>
 
 This can take some time on certain platforms like the Pi Zero, where some Python
 packages like scipy that TensorFlow depends on need to be compiled before the
diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md
index a7c0b69..e8e1314 100644
--- a/tensorflow/docs_src/install/install_sources.md
+++ b/tensorflow/docs_src/install/install_sources.md
@@ -168,6 +168,7 @@
 To build TensorFlow, you must install the following packages:
 
 *   six
+*   mock
 *   numpy, which is a numerical processing package that TensorFlow requires.
 *   wheel, which enables you to manage Python compressed packages in the wheel
     (.whl) format.
@@ -179,13 +180,16 @@
 
 After installing pip, invoke the following commands:
 
-<pre> $ <b>sudo pip install six numpy wheel</b> </pre>
+<pre> $ <b>sudo pip install six numpy wheel mock h5py</b>
+ $ <b>sudo pip install keras_applications==1.0.4 --no-deps</b>
+ $ <b>sudo pip install keras_preprocessing==1.0.2 --no-deps</b>
+</pre>
 
 Note: These are just the minimum requirements to _build_ tensorflow. Installing
 the pip package will download additional packages required to _run_ it. If you
 plan on executing tasks directly with `bazel` , without the pip installation,
 you may need to install additional python packages. For example, you should `pip
-install mock enum34` before running TensorFlow's tests with bazel.
+install enum34` before running TensorFlow's tests with bazel.
 
 <a name="ConfigureInstallation"></a>
 
@@ -360,6 +364,8 @@
 If RAM is an issue on your system, you may limit RAM usage by specifying
 <code>--local_resources 2048,.5,1.0</code> while invoking `bazel`.
 
+### Run the build_pip_package script
+
 The <code>bazel build</code> command builds a script named `build_pip_package`.
 Running this script as follows will build a `.whl` file within the
 `/tmp/tensorflow_pkg` directory:
@@ -374,10 +380,10 @@
 file depends on your platform. For example, the following command will install
 the pip package
 
-for TensorFlow 1.10.0rc1 on Linux:
+for TensorFlow 1.10.0 on Linux:
 
 <pre>
-$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.10.0rc1-py2-none-any.whl</b>
+$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.10.0-py2-none-any.whl</b>
 </pre>
 
 ## Validate your installation
diff --git a/tensorflow/docs_src/install/install_sources_windows.md b/tensorflow/docs_src/install/install_sources_windows.md
new file mode 100644
index 0000000..a1da122
--- /dev/null
+++ b/tensorflow/docs_src/install/install_sources_windows.md
@@ -0,0 +1,320 @@
+# Install TensorFlow from Sources on Windows
+
+This guide explains how to build TensorFlow sources into a TensorFlow binary and
+how to install that TensorFlow binary on Windows.
+
+## Determine which TensorFlow to install
+
+You must choose one of the following types of TensorFlow to build and install:
+
+*   **TensorFlow with CPU support only**. If your system does not have a NVIDIA®
+    GPU, build and install this version. Note that this version of TensorFlow is
+    typically easier to build and install, so even if you have an NVIDIA GPU, we
+    recommend building and installing this version first.
+*   **TensorFlow with GPU support**. TensorFlow programs typically run
+    significantly faster on a GPU than on a CPU. Therefore, if your system has a
+    NVIDIA GPU and you need to run performance-critical applications, you should
+    ultimately build and install this version. Beyond the NVIDIA GPU itself,
+    your system must also fulfill the NVIDIA software requirements described in
+    the following document:
+
+    *   [Installing TensorFlow on Windows](install_windows.md#NVIDIARequirements)
+
+## Prepare environment for Windows
+
+Before building TensorFlow on Windows, install the following build tools on your
+system:
+
+*   [MSYS2](#InstallMSYS2)
+*   [Visual C++ build tools](#InstallVCBuildTools)
+*   [Bazel for Windows](#InstallBazel)
+*   [TensorFlow Python dependencies](#InstallPython)
+*   [optionally, NVIDIA packages to support TensorFlow for GPU](#InstallCUDA)
+
+<a name="InstallMSYS2"></a>
+
+### Install MSYS2
+
+Bash bin tools are used in the TensorFlow Bazel build; you can install them through [MSYS2](https://www.msys2.org/).
+
+Assuming you installed MSYS2 at `C:\msys64`, add `C:\msys64\usr\bin` to your `%PATH%` environment variable.
+
+To install the necessary bash bin tools, issue the following command under `cmd.exe`:
+
+<pre>
+C:\> <b>pacman -S git patch unzip</b>
+</pre>
+
+<a name="InstallVCBuildTools"></a>
+
+### Install Visual C++ Build Tools 2015
+
+To build TensorFlow, you need to install Visual C++ build tools 2015. It is a part of Visual Studio 2015.
+However, you can install it separately as follows:
+
+  * Open the [official download page](https://visualstudio.microsoft.com/vs/older-downloads/).
+  * Go to <b>Redistributables and Build Tools</b> section.
+  * Find <b>Microsoft Build Tools 2015 Update 3</b> and click download.
+  * Run the installer.
+
+It's possible to build TensorFlow with a newer version of the Visual C++ build tools,
+but we only test against Visual Studio 2015 Update 3.
+
+<a name="InstallBazel"></a>
+
+### Install Bazel
+
+If bazel is not installed on your system, install it now by following
+[these instructions](https://docs.bazel.build/versions/master/install-windows.html).
+It is recommended to use a Bazel version >= `0.15.0`.
+
+Add the directory where you installed Bazel to your `%PATH%` environment variable.
+
+<a name="InstallPython"></a>
+
+### Install TensorFlow Python dependencies
+
+If you don't have Python 3.5 or Python 3.6 installed, install it now:
+
+  * [Python 3.5.x 64-bit from python.org](https://www.python.org/downloads/release/python-352/)
+  * [Python 3.6.x 64-bit from python.org](https://www.python.org/downloads/release/python-362/)
+
+To build and install TensorFlow, you must install the following python packages:
+
+*   `six`, which provides simple utilities for wrapping over differences between
+    Python 2 and Python 3.
+*   `numpy`, which is a numerical processing package that TensorFlow requires.
+*   `wheel`, which enables you to manage Python compressed packages in the wheel
+    (.whl) format.
+*   `keras_applications`, the applications module of the Keras deep learning library.
+*   `keras_preprocessing`, the data preprocessing and data augmentation module
+    of the Keras deep learning library.
+
+Assuming you already have `pip3` in `%PATH%`, issue the following commands:
+
+<pre>
+C:\> <b>pip3 install six numpy wheel</b>
+C:\> <b>pip3 install keras_applications==1.0.4 --no-deps</b>
+C:\> <b>pip3 install keras_preprocessing==1.0.2 --no-deps</b>
+</pre>
+
+<a name="InstallCUDA"></a>
+
+### Optional: install TensorFlow for GPU prerequisites
+
+If you are building TensorFlow without GPU support, skip this section.
+
+The following NVIDIA® _hardware_ must be installed on your system:
+
+*   GPU card with CUDA Compute Capability 3.5 or higher. See
+    [NVIDIA documentation](https://developer.nvidia.com/cuda-gpus) for a list of
+    supported GPU cards.
+
+The following NVIDIA® _software_ must be installed on your system:
+
+*   [GPU drivers](http://nvidia.com/driver). CUDA 9.0 requires 384.x or higher.
+*   [CUDA Toolkit](http://nvidia.com/cuda) (>= 8.0). We recommend version 9.0.
+*   [cuDNN SDK](http://developer.nvidia.com/cudnn) (>= 6.0). We recommend
+    version 7.1.x.
+*   [CUPTI](http://docs.nvidia.com/cuda/cupti/) ships with the CUDA Toolkit, but
+    you also need to append its path to `%PATH%` environment
+    variable.
+
+Assuming you have the CUDA Toolkit installed at `C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0`
+and cuDNN at `C:\tools\cuda`, issue the following commands.
+
+<pre>
+C:\> SET PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0\bin;%PATH%
+C:\> SET PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0\extras\CUPTI\libx64;%PATH%
+C:\> SET PATH=C:\tools\cuda\bin;%PATH%
+</pre>
+
+## Clone the TensorFlow repository
+
+Now you need to clone **the latest** TensorFlow repository,
+thanks to MSYS2 we already have `git` available, issue the following command:
+
+<pre>C:\> <b>git clone https://github.com/tensorflow/tensorflow.git</b> </pre>
+
+The preceding <code>git clone</code> command creates a subdirectory named
+`tensorflow`. After cloning, you may optionally build a **specific branch**
+(such as a release branch) by invoking the following commands:
+
+<pre>
+C:\> <b>cd tensorflow</b>
+C:\> <b>git checkout</b> <i>Branch</i> # where <i>Branch</i> is the desired branch
+</pre>
+
+For example, to work with the `r1.11` release instead of the master release,
+issue the following command:
+
+<pre>C:\> <b>git checkout r1.11</b></pre>
+
+Next, you must now configure the installation.
+
+## Configure the installation
+
+The root of the source tree contains a python script named <code>configure.py</code>.
+This script asks you to identify the pathname of all relevant TensorFlow
+dependencies and specify other build configuration options such as compiler
+flags. You must run this script *prior* to creating the pip package and
+installing TensorFlow.
+
+If you wish to build TensorFlow with GPU, `configure.py` will ask you to specify
+the version numbers of CUDA and cuDNN. If several versions of CUDA or cuDNN are
+installed on your system, explicitly select the desired version instead of
+relying on the default.
+
+One of the questions that `configure.py` will ask is as follows:
+
+<pre>
+Please specify optimization flags to use during compilation when bazel option "--config=opt" is specified [Default is /arch:AVX]:
+</pre>
+
+Here is an example execution of the `configure.py` script. Note that your own input
+will likely differ from our sample input:
+
+<pre>
+C:\> <b>cd tensorflow</b>  # cd to the top-level directory created
+C:\tensorflow> <b>python ./configure.py</b>
+Starting local Bazel server and connecting to it...
+................
+You have bazel 0.15.0 installed.
+Please specify the location of python. [Default is C:\python36\python.exe]: 
+
+Found possible Python library paths:
+  C:\python36\lib\site-packages
+Please input the desired Python library path to use.  Default is [C:\python36\lib\site-packages]
+
+Do you wish to build TensorFlow with CUDA support? [y/N]: <b>Y</b>
+CUDA support will be enabled for TensorFlow.
+
+Please specify the CUDA SDK version you want to use. [Leave empty to default to CUDA 9.0]:
+
+Please specify the location where CUDA 9.0 toolkit is installed. Refer to README.md for more details. [Default is C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.0]:
+
+Please specify the cuDNN version you want to use. [Leave empty to default to cuDNN 7.0]: <b>7.0</b>
+
+Please specify the location where cuDNN 7 library is installed. Refer to README.md for more details. [Default is C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.0]: <b>C:\tools\cuda</b>
+
+Please specify a list of comma-separated Cuda compute capabilities you want to build with.
+You can find the compute capability of your device at: https://developer.nvidia.com/cuda-gpus.
+Please note that each additional compute capability significantly increases your build time and binary size. [Default is: 3.5,7.0]: <b>3.7</b>
+
+Please specify optimization flags to use during compilation when bazel option "--config=opt" is specified [Default is /arch:AVX]: 
+
+Would you like to override eigen strong inline for some C++ compilation to reduce the compilation time? [Y/n]:
+Eigen strong inline overridden.
+
+Configuration finished
+</pre>
+
+## Build the pip package
+
+### CPU-only support
+
+To build a pip package for TensorFlow with CPU-only support:
+
+<pre>
+C:\tensorflow> <b>bazel build --config=opt //tensorflow/tools/pip_package:build_pip_package</b>
+</pre>
+
+### GPU support
+
+To build a pip package for TensorFlow with GPU support:
+
+<pre>
+C:\tensorflow> <b>bazel build --config=opt --config=cuda //tensorflow/tools/pip_package:build_pip_package</b>
+</pre>
+
+**NOTE:** When building with GPU support, you might want to add `--copt=-nvcc_options=disable-warnings`
+to suppress nvcc warning messages.
+
+The `bazel build` command builds a binary named `build_pip_package`
+(an executable binary to launch bash and run a bash script to create the pip package).
+Running this binary as follows will build a `.whl` file within the `C:/tmp/tensorflow_pkg` directory:
+
+<pre>
+C:\tensorflow> <b>bazel-bin\tensorflow\tools\pip_package\build_pip_package C:/tmp/tensorflow_pkg</b>
+</pre>
+
+## Install the pip package
+
+Invoke `pip3 install` to install that pip package. The filename of the `.whl`
+file depends on the TensorFlow version and your platform. For example, the
+following command will install the pip package for TensorFlow 1.11.0rc0:
+
+<pre>
+C:\tensorflow> <b>pip3 install C:/tmp/tensorflow_pkg/tensorflow-1.11.0rc0-cp36-cp36m-win_amd64.whl</b>
+</pre>
+
+## Validate your installation
+
+Validate your TensorFlow installation by doing the following:
+
+Start a terminal.
+
+Change directory (`cd`) to any directory on your system other than the
+`tensorflow` subdirectory from which you invoked the `configure` command.
+
+Invoke python:
+
+<pre>$ <b>python</b></pre>
+
+Enter the following short program inside the python interactive shell:
+
+```python
+# Python
+import tensorflow as tf
+hello = tf.constant('Hello, TensorFlow!')
+sess = tf.Session()
+print(sess.run(hello))
+```
+
+If the system outputs the following, then you are ready to begin writing
+TensorFlow programs:
+
+<pre>Hello, TensorFlow!</pre>
+
+To learn more, see the [TensorFlow tutorials](../tutorials/).
+
+## Build under MSYS shell
+The above instructions assume you are building under the Windows native command line (`cmd.exe`), but you can also
+build TensorFlow from the MSYS shell. There are a few things to notice:
+
+*   Disable the path conversion heuristic in MSYS. MSYS automatically converts arguments that look
+    like a Unix path to a Windows path when running a program; this will confuse Bazel.
+    (e.g. a Bazel label `//foo/bar:bin` is considered a Unix absolute path, only because it starts with a slash)
+
+  ```sh
+$ export MSYS_NO_PATHCONV=1
+$ export MSYS2_ARG_CONV_EXCL="*"
+```
+
+*   Add the directory where you install Bazel in `$PATH`. Assume you have Bazel
+    installed at `C:\tools\bazel.exe`, issue the following command:
+
+  ```sh
+# `:` is used as path separator, so we have to convert the path to Unix style.
+$ export PATH="/c/tools:$PATH"
+```
+
+*   Add the directory where you install Python in `$PATH`. Assume you have
+    Python installed at `C:\Python36\python.exe`, issue the following command:
+
+  ```sh
+$ export PATH="/c/Python36:$PATH"
+```
+
+*   If you have Python in `$PATH`, you can run the configure script just by
+    `./configure`; a shell script will help you invoke python.
+
+*   (For GPU build only) Add Cuda and cuDNN bin directories in `$PATH` in the following way:
+
+  ```sh
+$ export PATH="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.0/bin:$PATH"
+$ export PATH="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.0/extras/CUPTI/libx64:$PATH"
+$ export PATH="/c/tools/cuda/bin:$PATH"
+```
+
+The remaining steps should be the same as building under `cmd.exe`.
diff --git a/tensorflow/docs_src/install/install_windows.md b/tensorflow/docs_src/install/install_windows.md
index e9061bf..0bb0e5a 100644
--- a/tensorflow/docs_src/install/install_windows.md
+++ b/tensorflow/docs_src/install/install_windows.md
@@ -24,6 +24,8 @@
     and you need to run performance-critical applications, you should
     ultimately install this version.
 
+<a name="NVIDIARequirements"></a>
+
 ### Requirements to run TensorFlow with GPU support
 
 If you are installing TensorFlow with GPU support using one of the mechanisms
diff --git a/tensorflow/docs_src/install/leftnav_files b/tensorflow/docs_src/install/leftnav_files
index ace275c..59292f7 100644
--- a/tensorflow/docs_src/install/leftnav_files
+++ b/tensorflow/docs_src/install/leftnav_files
@@ -6,6 +6,7 @@
 install_windows.md: Windows
 install_raspbian.md: Raspbian
 install_sources.md: From source
+install_sources_windows.md: From source on Windows
 >>>
 migration.md
 
diff --git a/tensorflow/docs_src/performance/datasets_performance.md b/tensorflow/docs_src/performance/datasets_performance.md
index 46b43b7..5d9e4ba 100644
--- a/tensorflow/docs_src/performance/datasets_performance.md
+++ b/tensorflow/docs_src/performance/datasets_performance.md
@@ -38,9 +38,9 @@
 as an ETL process provides structure that facilitates the application of
 performance optimizations.
 
-When using the @{tf.estimator.Estimator} API, the first two phases (Extract and
+When using the `tf.estimator.Estimator` API, the first two phases (Extract and
 Transform) are captured in the `input_fn` passed to
-@{tf.estimator.Estimator.train}. In code, this might look like the following
+`tf.estimator.Estimator.train`. In code, this might look like the following
 (naive, sequential) implementation:
 
 ```
@@ -99,7 +99,7 @@
 ![with pipelining](/images/datasets_with_pipelining.png)
 
 The `tf.data` API provides a software pipelining mechanism through the
-@{tf.data.Dataset.prefetch} transformation, which can be used to decouple the
+`tf.data.Dataset.prefetch` transformation, which can be used to decouple the
 time data is produced from the time it is consumed. In particular, the
 transformation uses a background thread and an internal buffer to prefetch
 elements from the input dataset ahead of the time they are requested. Thus, to
@@ -130,7 +130,7 @@
 ### Parallelize Data Transformation
 
 When preparing a batch, input elements may need to be pre-processed. To this
-end, the `tf.data` API offers the @{tf.data.Dataset.map} transformation, which
+end, the `tf.data` API offers the `tf.data.Dataset.map` transformation, which
 applies a user-defined function (for example, `parse_fn` from the running
 example) to each element of the input dataset. Because input elements are
 independent of one another, the pre-processing can be parallelized across
@@ -164,7 +164,7 @@
 
 Furthermore, if your batch size is in the hundreds or thousands, your pipeline
 will likely additionally benefit from parallelizing the batch creation. To this
-end, the `tf.data` API provides the @{tf.contrib.data.map_and_batch}
+end, the `tf.data` API provides the `tf.contrib.data.map_and_batch`
 transformation, which effectively "fuses" the map and batch transformations.
 
 To apply this change to our running example, change:
@@ -205,7 +205,7 @@
 not prefetched effectively.
 
 To mitigate the impact of the various data extraction overheads, the `tf.data`
-API offers the @{tf.contrib.data.parallel_interleave} transformation. Use this
+API offers the `tf.contrib.data.parallel_interleave` transformation. Use this
 transformation to parallelize the execution of and interleave the contents of
 other datasets (such as data file readers). The
 number of datasets to overlap can be specified by the `cycle_length` argument.
@@ -232,7 +232,7 @@
 The throughput of remote storage systems can vary over time due to load or
 network events. To account for this variance, the `parallel_interleave`
 transformation can optionally use prefetching. (See
-@{tf.contrib.data.parallel_interleave} for details).
+`tf.contrib.data.parallel_interleave` for details).
 
 By default, the `parallel_interleave` transformation provides a deterministic
 ordering of elements to aid reproducibility. As an alternative to prefetching
@@ -261,7 +261,7 @@
 
 ### Map and Cache
 
-The @{tf.data.Dataset.cache} transformation can cache a dataset, either in
+The `tf.data.Dataset.cache` transformation can cache a dataset, either in
 memory or on local storage. If the user-defined function passed into the `map`
 transformation is expensive, apply the cache transformation after the map
 transformation as long as the resulting dataset can still fit into memory or
@@ -281,9 +281,9 @@
 
 ### Repeat and Shuffle
 
-The @{tf.data.Dataset.repeat} transformation repeats the input data a finite (or
+The `tf.data.Dataset.repeat` transformation repeats the input data a finite (or
 infinite) number of times; each repetition of the data is typically referred to
-as an _epoch_. The @{tf.data.Dataset.shuffle} transformation randomizes the
+as an _epoch_. The `tf.data.Dataset.shuffle` transformation randomizes the
 order of the dataset's examples.
 
 If the `repeat` transformation is applied before the `shuffle` transformation,
@@ -296,7 +296,7 @@
 (`shuffle` before `repeat`) provides stronger ordering guarantees.
 
 When possible, we recommend using the fused
-@{tf.contrib.data.shuffle_and_repeat} transformation, which combines the best of
+`tf.contrib.data.shuffle_and_repeat` transformation, which combines the best of
 both worlds (good performance and strong ordering guarantees). Otherwise, we
 recommend shuffling before repeating.
 
diff --git a/tensorflow/docs_src/performance/index.md b/tensorflow/docs_src/performance/index.md
index 131d28f..a0f26a8 100644
--- a/tensorflow/docs_src/performance/index.md
+++ b/tensorflow/docs_src/performance/index.md
@@ -7,18 +7,18 @@
 and train high performance models, and quantize models for the least latency
 and highest throughput for inference.
 
-  * @{$performance_guide$Performance Guide} contains a collection of best
+  * [Performance Guide](../performance/performance_guide.md) contains a collection of best
     practices for optimizing your TensorFlow code.
 
-  * @{$datasets_performance$Data input pipeline guide} describes the tf.data
+  * [Data input pipeline guide](../performance/datasets_performance.md) describes the tf.data
     API for building efficient data input pipelines for TensorFlow.
 
-  * @{$performance/benchmarks$Benchmarks} contains a collection of
+  * [Benchmarks](../performance/benchmarks.md) contains a collection of
     benchmark results for a variety of hardware configurations.
 
   * For improving inference efficiency on mobile and
     embedded hardware, see
-    @{$quantization$How to Quantize Neural Networks with TensorFlow}, which
+    [How to Quantize Neural Networks with TensorFlow](../performance/quantization.md), which
     explains how to use quantization to reduce model size, both in storage
     and at runtime.
 
@@ -31,20 +31,20 @@
 algebra that optimizes TensorFlow computations. The following guides explore
 XLA:
 
-  * @{$xla$XLA Overview}, which introduces XLA.
-  * @{$broadcasting$Broadcasting Semantics}, which describes XLA's
+  * [XLA Overview](../performance/xla/index.md), which introduces XLA.
+  * [Broadcasting Semantics](../performance/xla/broadcasting.md), which describes XLA's
     broadcasting semantics.
-  * @{$developing_new_backend$Developing a new back end for XLA}, which
+  * [Developing a new back end for XLA](../performance/xla/developing_new_backend.md), which
     explains how to re-target TensorFlow in order to optimize the performance
     of the computational graph for particular hardware.
-  * @{$jit$Using JIT Compilation}, which describes the XLA JIT compiler that
+  * [Using JIT Compilation](../performance/xla/jit.md), which describes the XLA JIT compiler that
     compiles and runs parts of TensorFlow graphs via XLA in order to optimize
     performance.
-  * @{$operation_semantics$Operation Semantics}, which is a reference manual
+  * [Operation Semantics](../performance/xla/operation_semantics.md), which is a reference manual
     describing the semantics of operations in the `ComputationBuilder`
     interface.
-  * @{$shapes$Shapes and Layout}, which details the `Shape` protocol buffer.
-  * @{$tfcompile$Using AOT compilation}, which explains `tfcompile`, a
+  * [Shapes and Layout](../performance/xla/shapes.md), which details the `Shape` protocol buffer.
+  * [Using AOT compilation](../performance/xla/tfcompile.md), which explains `tfcompile`, a
     standalone tool that compiles TensorFlow graphs into executable code in
     order to optimize performance.
 
diff --git a/tensorflow/docs_src/performance/performance_guide.md b/tensorflow/docs_src/performance/performance_guide.md
index dafacbe..9ea1d6a 100644
--- a/tensorflow/docs_src/performance/performance_guide.md
+++ b/tensorflow/docs_src/performance/performance_guide.md
@@ -41,7 +41,7 @@
     utilization is not approaching 80-100%, then the input pipeline may be the
     bottleneck.
 *   Generate a timeline and look for large blocks of white space (waiting). An
-    example of generating a timeline exists as part of the @{$jit$XLA JIT}
+    example of generating a timeline exists as part of the [XLA JIT](../performance/xla/jit.md)
     tutorial.
 *   Check CPU usage. It is possible to have an optimized input pipeline and lack
     the CPU cycles to process the pipeline.
@@ -68,7 +68,7 @@
 
 #### Using the tf.data API
 
-The @{$datasets$tf.data API} is replacing `queue_runner` as the recommended API
+The [tf.data API](../guide/datasets.md) is replacing `queue_runner` as the recommended API
 for building input pipelines. This
 [ResNet example](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10_estimator/cifar10_main.py)
 ([arXiv:1512.03385](https://arxiv.org/abs/1512.03385))
@@ -78,7 +78,7 @@
 The `tf.data` API utilizes C++ multi-threading and has a much lower overhead
 than the Python-based `queue_runner` that is limited by Python's multi-threading
 performance. A detailed performance guide for the `tf.data` API can be found
-@{$datasets_performance$here}.
+[here](../performance/datasets_performance.md).
 
 While feeding data using a `feed_dict` offers a high level of flexibility, in
 general `feed_dict` does not provide a scalable solution. If only a single GPU
@@ -94,7 +94,7 @@
 #### Fused decode and crop
 
 If inputs are JPEG images that also require cropping, use fused
-@{tf.image.decode_and_crop_jpeg} to speed up preprocessing.
+`tf.image.decode_and_crop_jpeg` to speed up preprocessing.
 `tf.image.decode_and_crop_jpeg` only decodes the part of
 the image within the crop window. This significantly speeds up the process if
 the crop window is much smaller than the full image. For imagenet data, this
@@ -174,7 +174,7 @@
 ### Common fused Ops
 
 Fused Ops combine multiple operations into a single kernel for improved
-performance. There are many fused Ops within TensorFlow and @{$xla$XLA} will
+performance. There are many fused Ops within TensorFlow and [XLA](../performance/xla/index.md) will
 create fused Ops when possible to automatically improve performance. Collected
 below are select fused Ops that can greatly improve performance and may be
 overlooked.
@@ -187,14 +187,14 @@
 norm can result in a 12%-30% speedup.
 
 There are two commonly used batch norms and both support fusing. The core
-@{tf.layers.batch_normalization} added fused starting in TensorFlow 1.3.
+`tf.layers.batch_normalization` added fused starting in TensorFlow 1.3.
 
 ```python
 bn = tf.layers.batch_normalization(
     input_layer, fused=True, data_format='NCHW')
 ```
 
-The contrib @{tf.contrib.layers.batch_norm} method has had fused as an option
+The contrib `tf.contrib.layers.batch_norm` method has had fused as an option
 since before TensorFlow 1.0.
 
 ```python
@@ -205,43 +205,43 @@
 
 There are many ways to specify an RNN computation in TensorFlow and they have
 trade-offs with respect to model flexibility and performance. The
-@{tf.nn.rnn_cell.BasicLSTMCell} should be considered a reference implementation
+`tf.nn.rnn_cell.BasicLSTMCell` should be considered a reference implementation
 and used only as a last resort when no other options will work.
 
 When using one of the cells, rather than the fully fused RNN layers, you have a
-choice of whether to use @{tf.nn.static_rnn} or @{tf.nn.dynamic_rnn}.  There
+choice of whether to use `tf.nn.static_rnn` or `tf.nn.dynamic_rnn`.  There
 shouldn't generally be a performance difference at runtime, but large unroll
-amounts can increase the graph size of the @{tf.nn.static_rnn} and cause long
-compile times.  An additional advantage of @{tf.nn.dynamic_rnn} is that it can
+amounts can increase the graph size of the `tf.nn.static_rnn` and cause long
+compile times.  An additional advantage of `tf.nn.dynamic_rnn` is that it can
 optionally swap memory from the GPU to the CPU to enable training of very long
 sequences.  Depending on the model and hardware configuration, this can come at
 a performance cost.  It is also possible to run multiple iterations of
-@{tf.nn.dynamic_rnn} and the underlying @{tf.while_loop} construct in parallel,
+`tf.nn.dynamic_rnn` and the underlying `tf.while_loop` construct in parallel,
 although this is rarely useful with RNN models as they are inherently
 sequential.
 
-On NVIDIA GPUs, the use of @{tf.contrib.cudnn_rnn} should always be preferred
+On NVIDIA GPUs, the use of `tf.contrib.cudnn_rnn` should always be preferred
 unless you want layer normalization, which it doesn't support.  It is often at
-least an order of magnitude faster than @{tf.contrib.rnn.BasicLSTMCell} and
-@{tf.contrib.rnn.LSTMBlockCell} and uses 3-4x less memory than
-@{tf.contrib.rnn.BasicLSTMCell}.
+least an order of magnitude faster than `tf.contrib.rnn.BasicLSTMCell` and
+`tf.contrib.rnn.LSTMBlockCell` and uses 3-4x less memory than
+`tf.contrib.rnn.BasicLSTMCell`.
 
 If you need to run one step of the RNN at a time, as might be the case in
 reinforcement learning with a recurrent policy, then you should use the
-@{tf.contrib.rnn.LSTMBlockCell} with your own environment interaction loop
-inside a @{tf.while_loop} construct. Running one step of the RNN at a time and
+`tf.contrib.rnn.LSTMBlockCell` with your own environment interaction loop
+inside a `tf.while_loop` construct. Running one step of the RNN at a time and
 returning to Python is possible, but it will be slower.
 
-On CPUs, mobile devices, and if @{tf.contrib.cudnn_rnn} is not available on
+On CPUs, mobile devices, and if `tf.contrib.cudnn_rnn` is not available on
 your GPU, the fastest and most memory efficient option is
-@{tf.contrib.rnn.LSTMBlockFusedCell}.
+`tf.contrib.rnn.LSTMBlockFusedCell`.
 
-For all of the less common cell types like @{tf.contrib.rnn.NASCell},
-@{tf.contrib.rnn.PhasedLSTMCell}, @{tf.contrib.rnn.UGRNNCell},
-@{tf.contrib.rnn.GLSTMCell}, @{tf.contrib.rnn.Conv1DLSTMCell},
-@{tf.contrib.rnn.Conv2DLSTMCell}, @{tf.contrib.rnn.LayerNormBasicLSTMCell},
+For all of the less common cell types like `tf.contrib.rnn.NASCell`,
+`tf.contrib.rnn.PhasedLSTMCell`, `tf.contrib.rnn.UGRNNCell`,
+`tf.contrib.rnn.GLSTMCell`, `tf.contrib.rnn.Conv1DLSTMCell`,
+`tf.contrib.rnn.Conv2DLSTMCell`, `tf.contrib.rnn.LayerNormBasicLSTMCell`,
 etc., one should be aware that they are implemented in the graph like
-@{tf.contrib.rnn.BasicLSTMCell} and as such will suffer from the same poor
+`tf.contrib.rnn.BasicLSTMCell` and as such will suffer from the same poor
 performance and high memory usage.  One should consider whether or not those
 trade-offs are worth it before using these cells. For example, while layer
 normalization can speed up convergence, because cuDNN is 20x faster the fastest
@@ -257,7 +257,7 @@
 in [Comparing compiler optimizations](#comparing-compiler-optimizations).
 
 To install the most optimized version of TensorFlow,
-@{$install_sources$build and install} from source. If there is a need to build
+[build and install](../install/install_sources.md) from source. If there is a need to build
 TensorFlow on a platform that has different hardware than the target, then
 cross-compile with the highest optimizations for the target platform. The
 following command is an example of using `bazel` to compile for a specific
@@ -298,7 +298,7 @@
 gradients are applied has an impact on the performance, scaling, and convergence
 of the model.  The rest of this section provides an overview of variable
 placement and the towering of a model on multiple GPUs.
-@{$performance_models$High-Performance Models} gets into more details regarding
+[High-Performance Models](../performance/performance_models.md) gets into more details regarding
 more complex methods that can be used to share and update variables between
 towers.
 
@@ -307,7 +307,7 @@
 systems can be built with NVIDIA Tesla P100s but one may be using PCIe and the
 other [NVLink](http://www.nvidia.com/object/nvlink.html). In that scenario, the
 optimal solution for each system may be different. For real world examples, read
-the @{$performance/benchmarks$benchmark} page which details the settings that
+the [benchmark](../performance/benchmarks.md) page which details the settings that
 were optimal for a variety of platforms. Below is a summary of what was learned
 from benchmarking various platforms and configurations:
 
@@ -433,7 +433,7 @@
 ## Optimizing for CPU
 
 CPUs, which includes Intel® Xeon Phi™, achieve optimal performance when
-TensorFlow is @{$install_sources$built from source} with all of the instructions
+TensorFlow is [built from source](../install/install_sources.md) with all of the instructions
 supported by the target CPU.
 
 Beyond using the latest instruction sets, Intel® has added support for the
diff --git a/tensorflow/docs_src/performance/performance_models.md b/tensorflow/docs_src/performance/performance_models.md
index 359b0e9..151c0b2 100644
--- a/tensorflow/docs_src/performance/performance_models.md
+++ b/tensorflow/docs_src/performance/performance_models.md
@@ -9,9 +9,9 @@
 
 ## Input Pipeline
 
-The @{$performance_guide$Performance Guide} explains how to identify possible
-input pipeline issues and best practices. We found that using @{tf.FIFOQueue}
-and @{tf.train.queue_runner} could not saturate multiple current generation GPUs
+The [Performance Guide](../performance/performance_guide.md) explains how to identify possible
+input pipeline issues and best practices. We found that using `tf.FIFOQueue`
+and `tf.train.queue_runner` could not saturate multiple current generation GPUs
 when using large inputs and processing with higher samples per second, such
 as training ImageNet with [AlexNet](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf).
 This is due to the use of Python threads as its underlying implementation. The
@@ -29,7 +29,7 @@
 
 The dominant part of each stage is executed in parallel with the other stages
 using `data_flow_ops.StagingArea`. `StagingArea` is a queue-like operator
-similar to @{tf.FIFOQueue}. The difference is that `StagingArea`  does not
+similar to `tf.FIFOQueue`. The difference is that `StagingArea`  does not
 guarantee FIFO ordering, but offers simpler functionality and can be executed
 on both CPU and GPU in parallel with other stages. Breaking the input pipeline
 into 3 stages that operate independently in parallel is scalable and takes full
@@ -62,10 +62,10 @@
 image decoding, distortion, and resizing.
 
 Once the images are through preprocessing, they are concatenated together into 8
-tensors each with a batch-size of 32. Rather than using @{tf.concat} for this
+tensors each with a batch-size of 32. Rather than using `tf.concat` for this
 purpose, which is implemented as a single op that waits for all the inputs to be
-ready before concatenating them together, @{tf.parallel_stack} is used.
-@{tf.parallel_stack} allocates an uninitialized tensor as an output, and each
+ready before concatenating them together, `tf.parallel_stack` is used.
+`tf.parallel_stack` allocates an uninitialized tensor as an output, and each
 input tensor is written to its designated portion of the output tensor as soon
 as the input is available.
 
@@ -94,7 +94,7 @@
 
 With all the stages capable of being driven by different processors,
 `data_flow_ops.StagingArea` is used between them so they run in parallel.
-`StagingArea` is a queue-like operator similar to @{tf.FIFOQueue} that offers
+`StagingArea` is a queue-like operator similar to `tf.FIFOQueue` that offers
 simpler functionalities that can be executed on both CPU and GPU.
 
 Before the model starts running all the stages, the input pipeline stages are
@@ -153,7 +153,7 @@
 The default batch-normalization in TensorFlow is implemented as composite
 operations. This is very general, but often leads to suboptimal performance. An
 alternative is to use fused batch-normalization which often has much better
-performance on GPU. Below is an example of using @{tf.contrib.layers.batch_norm}
+performance on GPU. Below is an example of using `tf.contrib.layers.batch_norm`
 to implement fused batch-normalization.
 
 ```python
@@ -301,7 +301,7 @@
 within the same host machine, we can use the default TensorFlow implicit copy
 mechanism.
 
-However, we can instead use the optional NCCL (@{tf.contrib.nccl}) support. NCCL
+However, we can instead use the optional NCCL (`tf.contrib.nccl`) support. NCCL
 is an NVIDIA® library that can efficiently broadcast and aggregate data across
 different GPUs. It schedules a cooperating kernel on each GPU that knows how to
 best utilize the underlying hardware topology; this kernel uses a single SM of
diff --git a/tensorflow/docs_src/performance/quantization.md b/tensorflow/docs_src/performance/quantization.md
index c97f741..3326d82 100644
--- a/tensorflow/docs_src/performance/quantization.md
+++ b/tensorflow/docs_src/performance/quantization.md
@@ -80,7 +80,7 @@
 TensorFlow can train models with quantization in the loop. Because training
 requires small gradient adjustments, floating point values are still used. To
 keep models as floating point while adding the quantization error in the training
-loop, @{$array_ops#Fake_quantization$fake quantization} nodes simulate the
+loop, [fake quantization](../api_guides/python/array_ops.md#Fake_quantization) nodes simulate the
 effect of quantization in the forward and backward passes.
 
 Since it's difficult to add these fake quantization operations to all the
@@ -163,7 +163,7 @@
   --std_value=127.5 --mean_value=127.5
 ```
 
-See the documentation for @{tf.contrib.quantize} and
+See the documentation for `tf.contrib.quantize` and
 [TensorFlow Lite](/mobile/tflite/).
 
 ## Quantized accuracy
diff --git a/tensorflow/docs_src/performance/xla/index.md b/tensorflow/docs_src/performance/xla/index.md
index 8f5de83..770737c 100644
--- a/tensorflow/docs_src/performance/xla/index.md
+++ b/tensorflow/docs_src/performance/xla/index.md
@@ -14,7 +14,7 @@
 algebra that optimizes TensorFlow computations. The results are improvements in
 speed, memory usage, and portability on server and mobile platforms. Initially,
 most users will not see large benefits from XLA, but are welcome to experiment
-by using XLA via @{$jit$just-in-time (JIT) compilation} or @{$tfcompile$ahead-of-time (AOT) compilation}. Developers targeting new hardware accelerators are
+by using XLA via [just-in-time (JIT) compilation](../../performance/xla/jit.md) or [ahead-of-time (AOT) compilation](../../performance/xla/tfcompile.md). Developers targeting new hardware accelerators are
 especially encouraged to try out XLA.
 
 The XLA framework is experimental and in active development. In particular,
@@ -54,13 +54,13 @@
 
 The input language to XLA is called "HLO IR", or just HLO (High Level
 Optimizer). The semantics of HLO are described on the
-@{$operation_semantics$Operation Semantics} page. It
+[Operation Semantics](../../performance/xla/operation_semantics.md) page. It
 is most convenient to think of HLO as a [compiler
 IR](https://en.wikipedia.org/wiki/Intermediate_representation).
 
 XLA takes graphs ("computations") defined in HLO and compiles them into machine
 instructions for various architectures. XLA is modular in the sense that it is
-easy to slot in an alternative backend to @{$developing_new_backend$target some novel HW architecture}. The CPU backend for x64 and ARM64 as
+easy to slot in an alternative backend to [target some novel HW architecture](../../performance/xla/developing_new_backend.md). The CPU backend for x64 and ARM64 as
 well as the NVIDIA GPU backend are in the TensorFlow source tree.
 
 The following diagram shows the compilation process in XLA:
@@ -94,5 +94,5 @@
 
 ## Supported Platforms
 
-XLA currently supports @{$jit$JIT compilation} on x86-64 and NVIDIA GPUs; and
-@{$tfcompile$AOT compilation} for x86-64 and ARM.
+XLA currently supports [JIT compilation](../../performance/xla/jit.md) on x86-64 and NVIDIA GPUs; and
+[AOT compilation](../../performance/xla/tfcompile.md) for x86-64 and ARM.
diff --git a/tensorflow/docs_src/performance/xla/operation_semantics.md b/tensorflow/docs_src/performance/xla/operation_semantics.md
index edc777a..2de30d1 100644
--- a/tensorflow/docs_src/performance/xla/operation_semantics.md
+++ b/tensorflow/docs_src/performance/xla/operation_semantics.md
@@ -13,6 +13,79 @@
 and familiar names; for example a *vector* is a 1-dimensional array and a
 *matrix* is a 2-dimensional array.
 
+## AllToAll
+
+See also
+[`XlaBuilder::AllToAll`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
+
+Alltoall is a collective operation that sends data from all cores to all cores.
+It has two phases:
+
+1.  the scatter phase. On each core, the operand is split into `split_count`
+    number of blocks along the `split_dimension`, and the blocks are scattered
+    to all cores, e.g., the ith block is sent to the ith core.
+2.  the gather phase. Each core concatenates the received blocks along the
+    `concat_dimension`.
+
+The participating cores can be configured by:
+
+-   `replica_groups`: each ReplicaGroup contains a list of replica ids. If
+    empty, all replicas belong to one group in the order of 0 - (n-1). Alltoall
+    will be applied within subgroups in the specified order. For example,
+    replica groups = {{1,2,3},{4,5,0}} means that an Alltoall will be applied
+    within replicas 1, 2, 3, and in the gather phase, the received blocks will
+    be concatenated in the order of 1, 2, 3; another Alltoall will be applied
+    within replicas 4, 5, 0, and the concatenation order is 4, 5, 0.
+
+Prerequisites:
+
+-   The dimension size of the operand on the `split_dimension` is divisible by
+    `split_count`.
+-   The operand's shape is not a tuple.
+
+<b> `AllToAll(operand, split_dimension, concat_dimension, split_count,
+replica_groups)` </b>
+
+
+| Arguments          | Type                  | Semantics                       |
+| ------------------ | --------------------- | ------------------------------- |
+| `operand`          | `XlaOp`               | n dimensional input array       |
+| `split_dimension`  | `int64`               | A value in the interval `[0,    |
+:                    :                       : n)` that names the dimension    :
+:                    :                       : along which the operand is      :
+:                    :                       : split                           :
+| `concat_dimension` | `int64`               | a value in the interval `[0,    |
+:                    :                       : n)` that names the dimension    :
+:                    :                       : along which the split blocks    :
+:                    :                       : are concatenated                :
+| `split_count`      | `int64`               | the number of cores that        |
+:                    :                       : participate in this operation.  :
+:                    :                       : If `replica_groups` is empty,   :
+:                    :                       : this should be the number of    :
+:                    :                       : replicas; otherwise, this       :
+:                    :                       : should be equal to the number   :
+:                    :                       : of replicas in each group.      :
+| `replica_groups`   | `ReplicaGroup` vector | each group contains a list of   |
+:                    :                       : replica ids.                    :
+
+Below is an example of Alltoall.
+
+```
+XlaBuilder b("alltoall");
+auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {4, 16}), "x");
+AllToAll(x, /*split_dimension=*/1, /*concat_dimension=*/0, /*split_count=*/4);
+```
+
+<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:100%" src="../../images/xla/ops_alltoall.png">
+</div>
+
+In this example, there are 4 cores participating in the Alltoall. On each core,
+the operand is split into 4 parts along dimension 1, so each part has shape
+f32[4,4]. The 4 parts are scattered to all cores. Then each core concatenates
+the received parts along dimension 0, in the order of cores 0-3. So the output
+on each core has shape f32[16,4].
+
 ## BatchNormGrad
 
 See also
@@ -270,7 +343,7 @@
 
 See also
 [`XlaBuilder::Collapse`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h)
-and the @{tf.reshape} operation.
+and the `tf.reshape` operation.
 
 Collapses dimensions of an array into one dimension.
 
@@ -291,7 +364,7 @@
 dimension size equal to the product of original dimension sizes. The lowest
 dimension number in `dimensions` is the slowest varying dimension (most major)
 in the loop nest which collapses these dimension, and the highest dimension
-number is fastest varying (most minor). See the @{tf.reshape} operator
+number is fastest varying (most minor). See the `tf.reshape` operator
 if more general collapse ordering is needed.
 
 For example, let v be an array of 24 elements:
@@ -432,16 +505,17 @@
 can be thought of as a n-dimensional window moving across a n-dimensional base
 area and a computation is performed for each possible position of the window.
 
-| Arguments        | Type                    | Semantics                     |
-| ---------------- | ----------------------- | ----------------------------- |
-| `lhs`            | `XlaOp`                 | rank n+2 array of inputs      |
-| `rhs`            | `XlaOp`                 | rank n+2 array of kernel      |
-:                  :                         : weights                       :
-| `window_strides` | `ArraySlice<int64>`     | n-d array of kernel strides   |
-| `padding`        | `ArraySlice<pair<int64, | n-d array of (low, high)      |
-:                  : int64>>`                : padding                       :
-| `lhs_dilation`   | `ArraySlice<int64>`     | n-d lhs dilation factor array |
-| `rhs_dilation`   | `ArraySlice<int64>`     | n-d rhs dilation factor array |
+| Arguments             | Type                 | Semantics                     |
+| --------------------- | -------------------- | ----------------------------- |
+| `lhs`                 | `XlaOp`              | rank n+2 array of inputs      |
+| `rhs`                 | `XlaOp`              | rank n+2 array of kernel      |
+:                       :                      : weights                       :
+| `window_strides`      | `ArraySlice<int64>`  | n-d array of kernel strides   |
+| `padding`             | `ArraySlice<         | n-d array of (low, high)      |
+:                       : pair<int64, int64>>` : padding                       :
+| `lhs_dilation`        | `ArraySlice<int64>`  | n-d lhs dilation factor array |
+| `rhs_dilation`        | `ArraySlice<int64>`  | n-d rhs dilation factor array |
+| `feature_group_count` | int64                | the number of feature groups  |
 
 Let n be the number of spatial dimensions. The `lhs` argument is a rank n+2
 array describing the base area. This is called the input, even though of course
@@ -459,8 +533,8 @@
 filter/kernel/window. The dimensions are, in this order:
 
 *   `output-z`: The `z` dimension of the output.
-*   `input-z`: The size of this dimension should equal the size of the `z`
-    dimension in lhs.
+*   `input-z`: The size of this dimension times `feature_group_count` should
+    equal the size of the `z` dimension in lhs.
 *   `spatial_dims`: Describes the `n` spatial dimensions that define the n-d
     window that moves across the base area.
 
@@ -490,8 +564,26 @@
 zeroes.
 
 Dilation of the rhs is also called atrous convolution. For more details, see
-@{tf.nn.atrous_conv2d}. Dilation of the lhs is also called transposed
-convolution. For more details, see @{tf.nn.conv2d_transpose}.
+`tf.nn.atrous_conv2d`. Dilation of the lhs is also called transposed
+convolution. For more details, see `tf.nn.conv2d_transpose`.
+
+The `feature_group_count` argument (default value 1) can be used for grouped
+convolutions. `feature_group_count` needs to be a divisor of both the input and
+the output feature dimension. If `feature_group_count` is greater than 1, it
+means that conceptually the input and output feature dimension and the `rhs`
+output feature dimension are split evenly into `feature_group_count` many
+groups, each group consisting of a consecutive subsequence of features. The
+input feature dimension of `rhs` needs to be equal to the `lhs` input feature
+dimension divided by `feature_group_count` (so it already has the size of a
+group of input features). The i-th groups are used together to compute
+`feature_group_count` many separate convolutions. The results of these
+convolutions are concatenated together in the output feature dimension.
+
+For depthwise convolution the `feature_group_count` argument would be set to the
+input feature dimension, and the filter would be reshaped from
+`[filter_height, filter_width, in_channels, channel_multiplier]` to
+`[filter_height, filter_width, 1, in_channels * channel_multiplier]`. For more
+details, see `tf.nn.depthwise_conv2d`.
 
 The output shape has these dimensions, in this order:
 
@@ -936,7 +1028,7 @@
 `rhs`     | `XlaOp` | right-hand-side operand: array of type T
 
 The arguments' shapes have to be either similar or compatible. See the
-@{$broadcasting$broadcasting} documentation about what it means for shapes to
+[broadcasting](../../performance/xla/broadcasting.md) documentation about what it means for shapes to
 be compatible. The result of an operation has a shape which is the result of
 broadcasting the two input arrays. In this variant, operations between arrays of
 different ranks are *not* supported, unless one of the operands is a scalar.
@@ -960,7 +1052,7 @@
 shape are filled with dimensions of size one. Degenerate-dimension broadcasting
 then broadcasts the shapes along these degenerate dimensions to equalize the
 shapes of both operands. The semantics are described in detail on the
-@{$broadcasting$broadcasting page}.
+[broadcasting page](../../performance/xla/broadcasting.md).
 
 ## Element-wise comparison operations
 
@@ -983,7 +1075,7 @@
 `rhs`     | `XlaOp` | right-hand-side operand: array of type T
 
 The arguments' shapes have to be either similar or compatible. See the
-@{$broadcasting$broadcasting} documentation about what it means for shapes to
+[broadcasting](../../performance/xla/broadcasting.md) documentation about what it means for shapes to
 be compatible. The result of an operation has a shape which is the result of
 broadcasting the two input arrays with the element type `PRED`. In this variant,
 operations between arrays of different ranks are *not* supported, unless one of
@@ -1000,7 +1092,7 @@
 
 The additional `broadcast_dimensions` operand is a slice of integers specifying
 the dimensions to use for broadcasting the operands. The semantics are described
-in detail on the @{$broadcasting$broadcasting page}.
+in detail on the [broadcasting page](../../performance/xla/broadcasting.md).
 
 ## Element-wise unary functions
 
@@ -1046,7 +1138,7 @@
 ## Gather
 
 The XLA gather operation stitches together several slices (each slice at a
-potentially different runtime offset) of an input tensor into an output tensor.
+potentially different runtime offset) of an input array.
 
 ### General Semantics
 
@@ -1054,151 +1146,141 @@
 [`XlaBuilder::Gather`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
 For a more intuitive description, see the "Informal Description" section below.
 
-<b> `gather(operand, gather_indices, output_window_dims, elided_window_dims, window_bounds, gather_dims_to_operand_dims)` </b>
+<b> `gather(operand, start_indices, offset_dims, collapsed_slice_dims, slice_sizes, start_index_map)` </b>
 
 |Arguments         | Type                    | Semantics                       |
 |----------------- | ----------------------- | --------------------------------|
-|`operand`         | `XlaOp`                 | The tensor we’re gathering      |
+|`operand`         | `XlaOp`                 | The array we’re gathering       |
 :                  :                         : from.                           :
-|`gather_indices`  | `XlaOp`                 | Tensor containing the starting  |
-:                  :                         : indices of the slices we're     :
-:                  :                         : stitching together into the     :
-:                  :                         : output tensor.                  :
-|`index_vector_dim`  | `int64`               | The dimension in                |
-:                  :                         : `gather_indices` that contains  :
-:                  :                         : the starting indices.           :
-|`output_window_dims` | `ArraySlice<int64>`  | The set of dimensions in the    |
-:                  :                         : output shape that are _window   :
-:                  :                         : dimensions_ (defined below).    :
-:                  :                         : Not all window dimensions may   :
-:                  :                         : be present in the output shape. :
-|`elided_window_dims` | `ArraySlice<int64>`  | The set of _window dimensions_  |
-:                  :            : that are not present in the output shape.    :
-:                  :            : `window_bounds[i]` must be `1` for all `i`   :
-:                  :            : in `elided_window_dims`.                     :
-|`window_bounds`   | `ArraySlice<int64>`    | `window_bounds[i]` is the bounds |
-:                  :            : for  window dimension `i`. This includes     :
-:                  :            : both the window dimensions that are          :
-:                  :            : explicitly part of the output shape (via     :
-:                  :            : `output_window_dims`) and the window         :
-:                  :            : dimensions that are elided (via              :
-:                  :            : `elided_window_dims`).                       :
-|`gather_dims_to_operand_dims` | `ArraySlice<int64>` | A dimension map (the    |
-:                  :            : array is interpreted as mapping `i` to       :
-:                  :            : `gather_dims_to_operand_dims[i]`)  from      :
-:                  :            : the gather indices in `gather_indices` to    :
-:                  :            : the operand index space.  It has to be       :
-:                  :            : one-to-one and total.                        :
+|`start_indices`   | `XlaOp`                 | Array containing the starting   |
+:                  :                         : indices of the gathered slices. :
+|`index_vector_dim` | `int64`                | The dimension in                |
+:                  :                         : `start_indices` that "contains" :
+:                  :                         : the starting indices.  See      :
+:                  :                         : below for a detailed            :
+:                  :                         : description.                    :
+|`offset_dims`     | `ArraySlice<int64>`     | The set of dimensions in the    :
+:                  :                         : output shape that offset into   :
+:                  :                         : an array sliced from operand.   :
+|`slice_sizes`     | `ArraySlice<int64>`      | `slice_sizes[i]` is the bounds |
+:                  :                          : for the slice on dimension `i`.:
+|`collapsed_slice_dims` | `ArraySlice<int64>` | The set of dimensions in each  |
+:                  :                          : slice that are collapsed away. :
+:                  :                          : These dimensions must have     :
+:                  :                          : size 1.                        :
+|`start_index_map` | `ArraySlice<int64>`      | A map that describes how to map|
+:                  :                          : indices in `start_indices` to  :
+:                  :                          : legal indices into operand.    :
 
-For every index `Out` in the output tensor, we compute two things (more
-precisely described later):
+For convenience, we label dimensions in the output array not in `offset_dims`
+as `batch_dims`.
 
-  - An index into `gather_indices.rank` - `1` dimensions of `gather_indices`,
-    which gives us a starting index of a slice, _operand slice_, in the operand
-    tensor.  These `gather_indices.rank` - `1` dimensions are all the dimensions
-    in `gather_indices` except `index_vector_dim`.
+The output is an array of rank `batch_dims.size` + `operand.rank` -
+`collapsed_slice_dims.size`.
 
-  - A _window index_ that has the same rank as the operand.  This index is
-    composed of the values in `Out` at dimensions `output_window_dims`, embedded
-    with zeroes according to `elided_window_dims`.
+If `index_vector_dim` is equal to `start_indices.rank` we implicitly consider
+`start_indices` to have a trailing `1` dimension (i.e. if `start_indices` was of
+shape `[6,7]` and `index_vector_dim` is `2` then we implicitly consider the
+shape of `start_indices` to be `[6,7,1]`).
 
-The _window index_ is the relative index of the element in _operand slice_ that
-should be present in the output at index `Out`.
+The bounds for the output array along dimension `i` is computed as follows:
 
-The output is a tensor of rank `output_window_dims.size` + `gather_indices.rank`
-- `1`.  Additionally, as a shorthand, we define `output_gather_dims` of type
-`ArraySlice<int64>` as the set of dimensions in the output shape but not in
-`output_window_dims`, in ascending order.  E.g. if the output tensor has rank
-`5`, `output_window_dims` is {`2`, `4`} then `output_gather_dims` is {`0`, `1`,
-`3`}
+  1. If `i` is present in `batch_dims` (i.e. is equal to `batch_dims[k]` for
+     some `k`) then we pick the corresponding dimension bounds out of
+     `start_indices.shape`, skipping `index_vector_dim` (i.e. pick
+     `start_indices.shape.dims`[`k`] if `k` < `index_vector_dim` and
+     `start_indices.shape.dims`[`k`+`1`] otherwise).
 
-If `index_vector_dim` is equal to `gather_indices.rank` we implicitly
-consider `gather_indices` to have a trailing `1` dimension (i.e. if
-`gather_indices` was of shape `[6,7]` and `index_vector_dim` is `2` then
-we implicitly consider the shape of `gather_indices` to be `[6,7,1]`).
+  2. If `i` is present in `offset_dims` (i.e. equal to `offset_dims`[`k`] for
+     some `k`) then we pick the corresponding bound out of `slice_sizes` after
+     accounting for `collapsed_slice_dims` (i.e. we pick
+     `adjusted_slice_sizes`[`k`] where `adjusted_slice_sizes` is `slice_sizes`
+     with the bounds at indices `collapsed_slice_dims` removed).
 
-The bounds for the output tensor along dimension `i` is computed as follows:
+Formally, the operand index `In` corresponding to an output index `Out` is
+computed as follows:
 
-  1. If `i` is present in `output_gather_dims` (i.e. is equal to
-     `output_gather_dims[k]` for some `k`) then we pick the corresponding
-     dimension bounds out of `gather_indices.shape`, skipping
-     `index_vector_dim` (i.e. pick `gather_indices.shape.dims`[`k`] if `k`
-     < `index_vector_dim` and `gather_indices.shape.dims`[`k`+`1`]
-     otherwise).
-  2. If `i` is present in `output_window_dims` (i.e. equal to
-     `output_window_dims`[`k`] for some `k`) then we pick the corresponding
-     bound out of `window_bounds` after accounting for `elided_window_dims`
-     (i.e. we pick `adjusted_window_bounds`[`k`] where `adjusted_window_bounds`
-     is `window_bounds` with the bounds at indices `elided_window_dims`
-     removed).
+  1. Let `G` = { `Out`[`k`] for `k` in `batch_dims` }.  Use `G` to slice out
+     vector `S` such that `S`[`i`] = `start_indices`[Combine(`G`, `i`)] where
+     Combine(A, b) inserts b at position `index_vector_dim` into A.  Note that
+     this is well defined even if `G` is empty -- if `G` is empty then `S` =
+     `start_indices`.
 
-The operand index `In` corresponding to an output index `Out` is computed as
-follows:
-
-  1. Let `G` = { `Out`[`k`] for `k` in `output_gather_dims` }.  Use `G` to slice
-     out vector `S` such that `S`[`i`] = `gather_indices`[Combine(`G`, `i`)]
-     where Combine(A, b) inserts b at position `index_vector_dim` into A.
-     Note that this is well defined even if `G` is empty -- if `G` is empty then
-     `S` = `gather_indices`.
-  2. Create an index, `S`<sub>`in`</sub>, into `operand` using `S` by
-     scattering `S` using the `gather_dims_to_operand_dims` map
-     (`S`<sub>`in`</sub> is the starting indices for _operand slice_ mentioned
-     above).  More precisely:
-       1. `S`<sub>`in`</sub>[`gather_dims_to_operand_dims`[`k`]] = `S`[`k`] if `k` <
-          `gather_dims_to_operand_dims.size`.
+  2. Create a starting index, `S`<sub>`in`</sub>, into `operand` using `S` by
+     scattering `S` using `start_index_map`.  More precisely:
+       1. `S`<sub>`in`</sub>[`start_index_map`[`k`]] = `S`[`k`] if `k` <
+          `start_index_map.size`.
        2. `S`<sub>`in`</sub>[`_`] = `0` otherwise.
-  3. Create an index `W`<sub>`in`</sub> into `operand` by scattering the indices
-     at the output window dimensions in `Out` according to
-     the `elided_window_dims` set (`W`<sub>`in`</sub> is the _window index_
-     mentioned above).  More precisely:
-       1. `W`<sub>`in`</sub>[`window_dims_to_operand_dims`(`k`)] = `Out`[`k`] if
-          `k` < `output_window_dims.size` (`window_dims_to_operand_dims` is
-          defined below).
-       2. `W`<sub>`in`</sub>[`_`] = `0` otherwise.
-  4. `In` is `W`<sub>`in`</sub> + `S`<sub>`in`</sub> where + is element-wise
+
+  3. Create an index `O`<sub>`in`</sub> into `operand` by scattering the indices
+     at the offset dimensions in `Out` according to the `collapsed_slice_dims`
+     set.  More precisely:
+       1. `O`<sub>`in`</sub>[`expand_offset_dims`(`k`)] =
+          `Out`[`offset_dims`[`k`]] if `k` < `offset_dims.size`
+          (`expand_offset_dims` is defined below).
+       2. `O`<sub>`in`</sub>[`_`] = `0` otherwise.
+  4. `In` is `O`<sub>`in`</sub> + `S`<sub>`in`</sub> where + is element-wise
      addition.
 
-`window_dims_to_operand_dims` is the monotonic function with domain [`0`,
-`output_window_dims.size`) and range [`0`, `operand.rank`) \
-`elided_window_dims`.  So if, e.g., `output_window_dims.size` is `4`,
-`operand.rank` is `6` and `elided_window_dims` is {`0`, `2`} then
-`window_dims_to_operand_dims` is {`0`→`1`, `1`→`3`, `2`→`4`, `3`→`5`}.
+`expand_offset_dims` is the monotonic function with domain [`0`, `offset.size`)
+and range [`0`, `operand.rank`) \ `collapsed_slice_dims`.  So if, e.g.,
+`offset.size` is `4`, `operand.rank` is `6` and `collapsed_slice_dims` is {`0`,
+`2`} then `expand_offset_dims` is {`0`→`1`, `1`→`3`, `2`→`4`, `3`→`5`}.
 
 ### Informal Description and Examples
 
-`index_vector_dim` is set to `gather_indices.rank` - `1` in all of the
-examples that follow.  More interesting values for `index_vector_dim`
-does not change the operation fundamentally, but makes the visual representation
-more cumbersome.
+Informally, every index `Out` in the output array corresponds to an element `E`
+in the operand array, computed as follows:
+
+  - We use the batch dimensions in `Out` to look up a starting index from
+    `start_indices`.
+
+  - We use `start_index_map` to map the starting index (which may have size less
+    than operand.rank) to a "full" starting index into operand.
+
+  - We dynamic-slice out a slice with size `slice_sizes` using the full starting
+    index.
+
+  - We reshape the slice by collapsing the `collapsed_slice_dims` dimensions.
+    Since all collapsed slice dimensions have to have bound 1 this reshape is
+    always legal.
+
+  - We use the offset dimensions in `Out` to index into this slice to get the
+    input element, `E`, corresponding to output index `Out`.
+
+`index_vector_dim` is set to `start_indices.rank` - `1` in all of the
+examples that follow.  More interesting values for `index_vector_dim` do not
+change the operation fundamentally, but make the visual representation more
+cumbersome.
 
 To get an intuition on how all of the above fits together, let's look at an
-example that gathers 5 slices of shape `[8,6]` from a `[16,11]` tensor.  The
-position of a slice into the `[16,11]` tensor can be represented as an index
+example that gathers 5 slices of shape `[8,6]` from a `[16,11]` array.  The
+position of a slice into the `[16,11]` array can be represented as an index
 vector of shape `S64[2]`, so the set of 5 positions can be represented as a
-`S64[5,2]` tensor.
+`S64[5,2]` array.
 
 The behavior of the gather operation can then be depicted as an index
-transformation that takes [`G`,`W`<sub>`0`</sub>,`W`<sub>`1`</sub>], an index in
-the output shape, and maps it to an element in the input tensor in the following
+transformation that takes [`G`,`O`<sub>`0`</sub>,`O`<sub>`1`</sub>], an index in
+the output shape, and maps it to an element in the input array in the following
 way:
 
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
   <img style="width:100%" src="../../images/ops_xla_gather_0.svg">
 </div>
 
-We first select an (`X`,`Y`) vector from the gather indices tensor using `G`.
-The element in the output tensor at index
-[`G`,`W`<sub>`0`</sub>,`W`<sub>`1`</sub>] is then the element in the input
-tensor at index [`X`+`W`<sub>`0`</sub>,`Y`+`W`<sub>`1`</sub>].
+We first select an (`X`,`Y`) vector from the gather indices array using `G`.
+The element in the output array at index
+[`G`,`O`<sub>`0`</sub>,`O`<sub>`1`</sub>] is then the element in the input
+array at index [`X`+`O`<sub>`0`</sub>,`Y`+`O`<sub>`1`</sub>].
 
-`window_bounds` is `[8,6]`, which decides the range of W<sub>`0`</sub> and
+`slice_sizes` is `[8,6]`, which decides the range of W<sub>`0`</sub> and
 W<sub>`1`</sub>, and this in turn decides the bounds of the slice.
 
 This gather operation acts as a batch dynamic slice with `G` as the batch
 dimension.
 
 The gather indices may be multidimensional.  For instance, a more general
-version of the example above using a "gather indices" tensor of shape `[4,5,2]`
+version of the example above using a "gather indices" array of shape `[4,5,2]`
 would translate indices like this:
 
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
@@ -1206,25 +1288,25 @@
 </div>
 
 Again, this acts as a batch dynamic slice `G`<sub>`0`</sub> and
-`G`<sub>`1`</sub> as the batch dimensions.  The window bounds are still `[8,6]`.
+`G`<sub>`1`</sub> as the batch dimensions.  The slice size is still `[8,6]`.
 
 The gather operation in XLA generalizes the informal semantics outlined above in
 the following ways:
 
- 1. We can configure which dimensions in the output shape are the window
-    dimensions (dimensions containing `W`<sub>`0`</sub>, `W`<sub>`1`</sub> in
-    the last example).  The output gather dimensions (dimensions containing
+ 1. We can configure which dimensions in the output shape are the offset
+    dimensions (dimensions containing `O`<sub>`0`</sub>, `O`<sub>`1`</sub> in
+    the last example).  The output batch dimensions (dimensions containing
     `G`<sub>`0`</sub>, `G`<sub>`1`</sub> in the last example) are defined to be
-    the output dimensions that are not window dimensions.
+    the output dimensions that are not offset dimensions.
 
- 2. The number of output window dimensions explicitly present in the output
+ 2. The number of output offset dimensions explicitly present in the output
     shape may be smaller than the input rank.  These "missing" dimensions, which
-    are listed explicitly as `elided_window_dims`, must have a window bound of
-    `1`.  Since they have a window bound of `1` the only valid index for them is
+    are listed explicitly as `collapsed_slice_dims`, must have a slice size of
+    `1`.  Since they have a slice size of `1` the only valid index for them is
     `0` and eliding them does not introduce ambiguity.
 
- 3. The slice extracted from the "Gather Indices" tensor ((`X`, `Y`) in the last
-    example) may have fewer elements than the input tensor rank, and an explicit
+ 3. The slice extracted from the "Gather Indices" array ((`X`, `Y`) in the last
+    example) may have fewer elements than the input array rank, and an explicit
     mapping dictates how the index should be expanded to have the same rank as
     the input.
 
@@ -1235,20 +1317,19 @@
 </div>
 
 `G`<sub>`0`</sub> and `G`<sub>`1`</sub> are used to slice out a starting index
-from the gather indices tensor as usual, except the starting index has only one
-element, `X`.  Similarly, there is only one output window index with the value
-`W`<sub>`0`</sub>.  However, before being used as indices into the input tensor,
-these are expanded in accordance to "Gather Index Mapping"
-(`gather_dims_to_operand_dims` in the formal description) and "Window Mapping"
-(`window_dims_to_operand_dims` in the formal description) into
-[`0`,`W`<sub>`0`</sub>] and [`X`,`0`] respectively, adding up to
-[`X`,`W`<sub>`0`</sub>].  In other words, the output index
-[`G`<sub>`0`</sub>,`G`<sub>`1`</sub>,`W`<sub>`0`</sub>] maps to the input index
+from the gather indices array as usual, except the starting index has only one
+element, `X`.  Similarly, there is only one output offset index with the value
+`O`<sub>`0`</sub>.  However, before being used as indices into the input array,
+these are expanded in accordance to "Gather Index Mapping" (`start_index_map` in
+the formal description) and "Offset Mapping" (`expand_offset_dims` in the formal
+description) into [`0`,`O`<sub>`0`</sub>] and [`X`,`0`] respectively, adding up
+to [`X`,`O`<sub>`0`</sub>].  In other words, the output index
+[`G`<sub>`0`</sub>,`G`<sub>`1`</sub>,`O`<sub>`0`</sub>] maps to the input index
 [`GatherIndices`[`G`<sub>`0`</sub>,`G`<sub>`1`</sub>,`0`],`X`] which gives us
 the semantics for `tf.gather_nd`.
 
-`window_bounds` for this case is `[1,11]`.  Intuitively this means that every
-index `X` in the gather indices tensor picks an entire row and the result is the
+`slice_sizes` for this case is `[1,11]`.  Intuitively this means that every
+index `X` in the gather indices array picks an entire row and the result is the
 concatenation of all these rows.
 
 ## GetTupleElement
@@ -1270,7 +1351,7 @@
 let element_1: s32 = gettupleelement(t, 1);  // Inferred shape matches s32.
 ```
 
-See also @{tf.tuple}.
+See also `tf.tuple`.
 
 ## Infeed
 
@@ -1804,19 +1885,19 @@
 [`XlaBuilder::RngNormal`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
 
 Constructs an output of a given shape with random numbers generated following
-the $$N(\mu, \sigma)$$ normal distribution. The parameters `mu` and `sigma`, and
-output shape have to have elemental type F32. The parameters furthermore have to
-be scalar valued.
+the $$N(\mu, \sigma)$$ normal distribution. The parameters $$\mu$$ and
+$$\sigma$$, and output shape have to have a floating point elemental type. The
+parameters furthermore have to be scalar valued.
 
-<b>`RngNormal(mean, sigma, shape)`</b>
+<b>`RngNormal(mu, sigma, shape)`</b>
 
 | Arguments | Type    | Semantics                                           |
 | --------- | ------- | --------------------------------------------------- |
-| `mu`      | `XlaOp` | Scalar of type F32 specifying mean of generated     |
-:           :         : numbers                                             :
-| `sigma`   | `XlaOp` | Scalar of type F32 specifying standard deviation of |
+| `mu`      | `XlaOp` | Scalar of type T specifying mean of generated       |
+:           :         : numbers                                   :
+| `sigma`   | `XlaOp` | Scalar of type T specifying standard deviation of   |
 :           :         : generated numbers                                   :
-| `shape`   | `Shape` | Output shape of type F32                            |
+| `shape`   | `Shape` | Output shape of type T                              |
 
 ## RngUniform
 
@@ -1825,9 +1906,11 @@
 
 Constructs an output of a given shape with random numbers generated following
 the uniform distribution over the interval $$[a,b)$$. The parameters and output
-shape may be either F32, S32 or U32, but the types have to be consistent.
-Furthermore, the parameters need to be scalar valued. If $$b <= a$$ the result
-is implementation-defined.
+element type have to be a boolean type, an integral type or a floating point
+types, and the types have to be consistent. The CPU and GPU backends currently
+only support F64, F32, F16, BF16, S64, U64, S32 and U32. Furthermore, the
+parameters need to be scalar valued. If $$b <= a$$ the result is
+implementation-defined.
 
 <b>`RngUniform(a, b, shape)`</b>
 
@@ -1847,7 +1930,7 @@
 `update_computation`.
 
 See also
-[`XlaBuilder::Scatter`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::Scatter`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
 
 <b> `scatter(operand, scatter_indices, updates, update_computation, index_vector_dim, update_window_dims, inserted_window_dims, scatter_dims_to_operand_dims)` </b>
 
@@ -2250,7 +2333,7 @@
 
 ## Transpose
 
-See also the @{tf.reshape} operation.
+See also the `tf.reshape` operation.
 
 <b>`Transpose(operand)`</b>
 
@@ -2310,8 +2393,6 @@
     last execution of the `body`.
 *   The shape of the type `T` is statically determined and must be the same
     across all iterations.
-*   `While` nodes are not allowed to be nested. (This restriction may be lifted
-    in the future on some targets.)
 
 The T parameters of the computations are initialized with the `init` value in
 the first iteration and are automatically updated to the new result from `body`
diff --git a/tensorflow/docs_src/performance/xla/tfcompile.md b/tensorflow/docs_src/performance/xla/tfcompile.md
index e4b8031..2e0f377 100644
--- a/tensorflow/docs_src/performance/xla/tfcompile.md
+++ b/tensorflow/docs_src/performance/xla/tfcompile.md
@@ -17,7 +17,7 @@
 The compiler is built on top of the XLA framework. The code bridging TensorFlow
 to the XLA framework resides under
 [tensorflow/compiler](https://www.tensorflow.org/code/tensorflow/compiler/),
-which also includes support for @{$jit$just-in-time (JIT) compilation} of
+which also includes support for [just-in-time (JIT) compilation](../../performance/xla/jit.md) of
 TensorFlow graphs.
 
 ## What does tfcompile do?
@@ -116,7 +116,7 @@
 > [make_test_graphs.py]("https://www.tensorflow.org/code/tensorflow/compiler/aot/tests/make_test_graphs.py")
 > and specify the output location with the --out_dir flag.
 
-Typical graphs contain @{$python/state_ops$`Variables`}
+Typical graphs contain [`Variables`](../../api_guides/python/state_ops.md)
 representing the weights that are learned via training, but `tfcompile` cannot
 compile a subgraph that contain `Variables`. The
 [freeze_graph.py](https://www.tensorflow.org/code/tensorflow/python/tools/freeze_graph.py)
diff --git a/tensorflow/docs_src/tutorials/_toc.yaml b/tensorflow/docs_src/tutorials/_toc.yaml
index d33869a..0e25208 100644
--- a/tensorflow/docs_src/tutorials/_toc.yaml
+++ b/tensorflow/docs_src/tutorials/_toc.yaml
@@ -37,9 +37,30 @@
     status: external
   - title: "Custom training: walkthrough"
     path: /tutorials/eager/custom_training_walkthrough
+  - title: Text generation
+    path: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb
+    status: external
   - title: Translation with attention
     path: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb
     status: external
+  - title: Image captioning
+    path: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb
+    status: external
+  - title: Neural Style Transfer
+    path: https://github.com/tensorflow/models/blob/master/research/nst_blogpost/4_Neural_Style_Transfer_with_Eager_Execution.ipynb
+    status: external
+  - title: DCGAN
+    path: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/generative_examples/dcgan.ipynb
+    status: external
+  - title: VAE
+    path: https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples/generative_examples/cvae.ipynb
+    status: external
+  - title: Pix2Pix
+    path: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/pix2pix/pix2pix_eager.ipynb
+    status: external
+  - title: Image Segmentation
+    path: https://github.com/tensorflow/models/blob/master/samples/outreach/blogs/segmentation_blogpost/image_segmentation.ipynb
+    status: external
 
 - title: ML at production scale
   style: accordion
diff --git a/tensorflow/docs_src/tutorials/estimators/cnn.md b/tensorflow/docs_src/tutorials/estimators/cnn.md
index 12a215b..2fd69f5 100644
--- a/tensorflow/docs_src/tutorials/estimators/cnn.md
+++ b/tensorflow/docs_src/tutorials/estimators/cnn.md
@@ -1,6 +1,6 @@
 # Build a Convolutional Neural Network using Estimators
 
-The TensorFlow @{tf.layers$`layers` module} provides a high-level API that makes
+The `tf.layers` module provides a high-level API that makes
 it easy to construct a neural network. It provides methods that facilitate the
 creation of dense (fully connected) layers and convolutional layers, adding
 activation functions, and applying dropout regularization. In this tutorial,
@@ -118,8 +118,8 @@
 Open `cnn_mnist.py` and add the following `cnn_model_fn` function, which
 conforms to the interface expected by TensorFlow's Estimator API (more on this
 later in [Create the Estimator](#create-the-estimator)). `cnn_mnist.py` takes
-MNIST feature data, labels, and
-@{tf.estimator.ModeKeys$model mode} (`TRAIN`, `EVAL`, `PREDICT`) as arguments;
+MNIST feature data, labels, and mode (from
+`tf.estimator.ModeKeys`: `TRAIN`, `EVAL`, `PREDICT`) as arguments;
 configures the CNN; and returns predictions, loss, and a training operation:
 
 ```python
@@ -190,7 +190,7 @@
 The following sections (with headings corresponding to each code block above)
 dive deeper into the `tf.layers` code used to create each layer, as well as how
 to calculate loss, configure the training op, and generate predictions. If
-you're already experienced with CNNs and @{$custom_estimators$TensorFlow `Estimator`s},
+you're already experienced with CNNs and [TensorFlow `Estimator`s](../../guide/custom_estimators.md),
 and find the above code intuitive, you may want to skim these sections or just
 skip ahead to ["Training and Evaluating the CNN MNIST Classifier"](#train_eval_mnist).
 
@@ -277,7 +277,7 @@
 
 The `activation` argument specifies the activation function to apply to the
 output of the convolution. Here, we specify ReLU activation with
-@{tf.nn.relu}.
+`tf.nn.relu`.
 
 Our output tensor produced by `conv2d()` has a shape of
 <code>[<em>batch_size</em>, 28, 28, 32]</code>: the same height and width
@@ -423,7 +423,7 @@
 
 For a given example, our predicted class is the element in the corresponding row
 of the logits tensor with the highest raw value. We can find the index of this
-element using the @{tf.argmax}
+element using the `tf.argmax`
 function:
 
 ```python
@@ -438,7 +438,7 @@
 10]</code>).
 
 We can derive probabilities from our logits layer by applying softmax activation
-using @{tf.nn.softmax}:
+using `tf.nn.softmax`:
 
 ```python
 tf.nn.softmax(logits, name="softmax_tensor")
@@ -501,8 +501,8 @@
 ```
 
 > Note: For a more in-depth look at configuring training ops for Estimator model
-> functions, see @{$custom_estimators#defining-the-training-op-for-the-model$"Defining the training op for the model"}
-> in the @{$custom_estimators$"Creating Estimations in tf.estimator"} tutorial.
+> functions, see ["Defining the training op for the model"](../../guide/custom_estimators.md#defining-the-training-op-for-the-model)
+> in the ["Creating Estimations in tf.estimator"](../../guide/custom_estimators.md) tutorial.
 
 
 ### Add evaluation metrics
@@ -567,13 +567,13 @@
 feel free to change to another directory of your choice).
 
 > Note: For an in-depth walkthrough of the TensorFlow `Estimator` API, see the
-> tutorial @{$custom_estimators$"Creating Estimators in tf.estimator."}
+> tutorial ["Creating Estimators in tf.estimator."](../../guide/custom_estimators.md)
 
 ### Set Up a Logging Hook {#set_up_a_logging_hook}
 
 Since CNNs can take a while to train, let's set up some logging so we can track
-progress during training. We can use TensorFlow's @{tf.train.SessionRunHook} to create a
-@{tf.train.LoggingTensorHook}
+progress during training. We can use TensorFlow's `tf.train.SessionRunHook` to create a
+`tf.train.LoggingTensorHook`
 that will log the probability values from the softmax layer of our CNN. Add the
 following to `main()`:
 
@@ -593,8 +593,8 @@
 > Note: If you don't explicitly assign a name to an operation via the `name`
 > argument, TensorFlow will assign a default name. A couple easy ways to
 > discover the names applied to operations are to visualize your graph on
-> @{$graph_viz$TensorBoard}) or to enable the
-> @{$guide/debugger$TensorFlow Debugger (tfdbg)}.
+> [TensorBoard](../../guide/graph_viz.md)) or to enable the
+> [TensorFlow Debugger (tfdbg)](../../guide/debugger.md).
 
 Next, we create the `LoggingTensorHook`, passing `tensors_to_log` to the
 `tensors` argument. We set `every_n_iter=50`, which specifies that probabilities
@@ -686,9 +686,9 @@
 To learn more about TensorFlow Estimators and CNNs in TensorFlow, see the
 following resources:
 
-*   @{$custom_estimators$Creating Estimators in tf.estimator}
+*   [Creating Estimators in tf.estimator](../../guide/custom_estimators.md)
     provides an introduction to the TensorFlow Estimator API. It walks through
     configuring an Estimator, writing a model function, calculating loss, and
     defining a training op.
-*   @{$deep_cnn} walks through how to build a MNIST CNN classification model
+*   [Advanced Convolutional Neural Networks](../../tutorials/images/deep_cnn.md) walks through how to build a MNIST CNN classification model
     *without estimators* using lower-level TensorFlow operations.
diff --git a/tensorflow/docs_src/tutorials/images/deep_cnn.md b/tensorflow/docs_src/tutorials/images/deep_cnn.md
index 2796357..00996b8 100644
--- a/tensorflow/docs_src/tutorials/images/deep_cnn.md
+++ b/tensorflow/docs_src/tutorials/images/deep_cnn.md
@@ -31,26 +31,26 @@
 The CIFAR-10 tutorial demonstrates several important constructs for
 designing larger and more sophisticated models in TensorFlow:
 
-* Core mathematical components including @{tf.nn.conv2d$convolution}
+* Core mathematical components including `tf.nn.conv2d`
 ([wiki](https://en.wikipedia.org/wiki/Convolution)),
-@{tf.nn.relu$rectified linear activations}
+`tf.nn.relu`
 ([wiki](https://en.wikipedia.org/wiki/Rectifier_(neural_networks))),
-@{tf.nn.max_pool$max pooling}
+`tf.nn.max_pool`
 ([wiki](https://en.wikipedia.org/wiki/Convolutional_neural_network#Pooling_layer))
-and @{tf.nn.local_response_normalization$local response normalization}
+and `tf.nn.local_response_normalization`
 (Chapter 3.3 in
 [AlexNet paper](https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf)).
-* @{$summaries_and_tensorboard$Visualization}
+* [Visualization](../../guide/summaries_and_tensorboard.md)
 of network activities during training, including input images,
 losses and distributions of activations and gradients.
 * Routines for calculating the
-@{tf.train.ExponentialMovingAverage$moving average}
+`tf.train.ExponentialMovingAverage`
 of learned parameters and using these averages
 during evaluation to boost predictive performance.
 * Implementation of a
-@{tf.train.exponential_decay$learning rate schedule}
+`tf.train.exponential_decay`
 that systematically decrements over time.
-* Prefetching @{tf.train.shuffle_batch$queues}
+* Prefetching `tf.train.shuffle_batch`
 for input
 data to isolate the model from disk latency and expensive image pre-processing.
 
@@ -113,28 +113,28 @@
 The input part of the model is built by the functions `inputs()` and
 `distorted_inputs()` which read images from the CIFAR-10 binary data files.
 These files contain fixed byte length records, so we use
-@{tf.FixedLengthRecordReader}.
-See @{$reading_data#reading-from-files$Reading Data} to
+`tf.FixedLengthRecordReader`.
+See [Reading Data](../../api_guides/python/reading_data.md#reading-from-files) to
 learn more about how the `Reader` class works.
 
 The images are processed as follows:
 
 *  They are cropped to 24 x 24 pixels, centrally for evaluation or
-   @{tf.random_crop$randomly} for training.
-*  They are @{tf.image.per_image_standardization$approximately whitened}
+   `tf.random_crop` for training.
+*  They are `tf.image.per_image_standardization`
    to make the model insensitive to dynamic range.
 
 For training, we additionally apply a series of random distortions to
 artificially increase the data set size:
 
-* @{tf.image.random_flip_left_right$Randomly flip} the image from left to right.
-* Randomly distort the @{tf.image.random_brightness$image brightness}.
-* Randomly distort the @{tf.image.random_contrast$image contrast}.
+* `tf.image.random_flip_left_right` the image from left to right.
+* Randomly distort the `tf.image.random_brightness`.
+* Randomly distort the `tf.image.random_contrast`.
 
-Please see the @{$python/image$Images} page for the list of
+Please see the [Images](../../api_guides/python/image.md) page for the list of
 available distortions. We also attach an
-@{tf.summary.image} to the images
-so that we may visualize them in @{$summaries_and_tensorboard$TensorBoard}.
+`tf.summary.image` to the images
+so that we may visualize them in [TensorBoard](../../guide/summaries_and_tensorboard.md).
 This is a good practice to verify that inputs are built correctly.
 
 <div style="width:50%; margin:auto; margin-bottom:10px; margin-top:20px;">
@@ -144,7 +144,7 @@
 Reading images from disk and distorting them can use a non-trivial amount of
 processing time. To prevent these operations from slowing down training, we run
 them inside 16 separate threads which continuously fill a TensorFlow
-@{tf.train.shuffle_batch$queue}.
+`tf.train.shuffle_batch`.
 
 ### Model Prediction
 
@@ -154,14 +154,14 @@
 
 Layer Name | Description
 --- | ---
-`conv1` | @{tf.nn.conv2d$convolution} and @{tf.nn.relu$rectified linear} activation.
-`pool1` | @{tf.nn.max_pool$max pooling}.
-`norm1` | @{tf.nn.local_response_normalization$local response normalization}.
-`conv2` | @{tf.nn.conv2d$convolution} and @{tf.nn.relu$rectified linear} activation.
-`norm2` | @{tf.nn.local_response_normalization$local response normalization}.
-`pool2` | @{tf.nn.max_pool$max pooling}.
-`local3` | @{$python/nn$fully connected layer with rectified linear activation}.
-`local4` | @{$python/nn$fully connected layer with rectified linear activation}.
+`conv1` | `tf.nn.conv2d` and `tf.nn.relu` activation.
+`pool1` | `tf.nn.max_pool`.
+`norm1` | `tf.nn.local_response_normalization`.
+`conv2` | `tf.nn.conv2d` and `tf.nn.relu` activation.
+`norm2` | `tf.nn.local_response_normalization`.
+`pool2` | `tf.nn.max_pool`.
+`local3` | [fully connected layer with rectified linear activation](../../api_guides/python/nn.md).
+`local4` | [fully connected layer with rectified linear activation](../../api_guides/python/nn.md).
 `softmax_linear` | linear transformation to produce logits.
 
 Here is a graph generated from TensorBoard describing the inference operation:
@@ -172,7 +172,7 @@
 
 > **EXERCISE**: The output of `inference` are un-normalized logits. Try editing
 the network architecture to return normalized predictions using
-@{tf.nn.softmax}.
+`tf.nn.softmax`.
 
 The `inputs()` and `inference()` functions provide all the components
 necessary to perform an evaluation of a model. We now shift our focus towards
@@ -190,31 +190,31 @@
 The usual method for training a network to perform N-way classification is
 [multinomial logistic regression](https://en.wikipedia.org/wiki/Multinomial_logistic_regression),
 aka. *softmax regression*. Softmax regression applies a
-@{tf.nn.softmax$softmax} nonlinearity to the
+`tf.nn.softmax` nonlinearity to the
 output of the network and calculates the
-@{tf.nn.sparse_softmax_cross_entropy_with_logits$cross-entropy}
+`tf.nn.sparse_softmax_cross_entropy_with_logits`
 between the normalized predictions and the label index.
 For regularization, we also apply the usual
-@{tf.nn.l2_loss$weight decay} losses to all learned
+`tf.nn.l2_loss` losses to all learned
 variables.  The objective function for the model is the sum of the cross entropy
 loss and all these weight decay terms, as returned by the `loss()` function.
 
-We visualize it in TensorBoard with a @{tf.summary.scalar}:
+We visualize it in TensorBoard with a `tf.summary.scalar`:
 
 ![CIFAR-10 Loss](https://www.tensorflow.org/images/cifar_loss.png "CIFAR-10 Total Loss")
 
 We train the model using standard
 [gradient descent](https://en.wikipedia.org/wiki/Gradient_descent)
-algorithm (see @{$python/train$Training} for other methods)
+algorithm (see [Training](../../api_guides/python/train.md) for other methods)
 with a learning rate that
-@{tf.train.exponential_decay$exponentially decays}
+`tf.train.exponential_decay`
 over time.
 
 ![CIFAR-10 Learning Rate Decay](https://www.tensorflow.org/images/cifar_lr_decay.png "CIFAR-10 Learning Rate Decay")
 
 The `train()` function adds the operations needed to minimize the objective by
 calculating the gradient and updating the learned variables (see
-@{tf.train.GradientDescentOptimizer}
+`tf.train.GradientDescentOptimizer`
 for details).  It returns an operation that executes all the calculations
 needed to train and update the model for one batch of images.
 
@@ -263,9 +263,9 @@
 initially fill up the queue.  Search for `min_fraction_of_examples_in_queue`
 in `cifar10_input.py`.
 
-`cifar10_train.py` periodically @{tf.train.Saver$saves}
+`cifar10_train.py` periodically uses a  `tf.train.Saver` to save
 all model parameters in
-@{$guide/saved_model$checkpoint files}
+[checkpoint files](../../guide/saved_model.md)
 but it does *not* evaluate the model. The checkpoint file
 will be used by `cifar10_eval.py` to measure the predictive
 performance (see [Evaluating a Model](#evaluating-a-model) below).
@@ -282,10 +282,10 @@
 * Are the gradients, activations and weights reasonable?
 * What is the learning rate currently at?
 
-@{$summaries_and_tensorboard$TensorBoard} provides this
+[TensorBoard](../../guide/summaries_and_tensorboard.md) provides this
 functionality, displaying data exported periodically from `cifar10_train.py` via
 a
-@{tf.summary.FileWriter}.
+`tf.summary.FileWriter`.
 
 For instance, we can watch how the distribution of activations and degree of
 sparsity in `local3` features evolve during training:
@@ -300,7 +300,7 @@
 of noise due to the small batch size employed by training.  In practice we find
 it extremely useful to visualize their moving averages in addition to their raw
 values.  See how the scripts use
-@{tf.train.ExponentialMovingAverage}
+`tf.train.ExponentialMovingAverage`
 for this purpose.
 
 ## Evaluating a Model
@@ -336,8 +336,8 @@
 provide additional insight into the model during evaluation.
 
 The training script calculates the
-@{tf.train.ExponentialMovingAverage$moving average}
-version of all learned variables. The evaluation script substitutes
+`tf.train.ExponentialMovingAverage` of all learned variables.
+The evaluation script substitutes
 all learned model parameters with the moving average version. This
 substitution boosts model performance at evaluation time.
 
@@ -401,19 +401,19 @@
 a "tower". We must set two attributes for each tower:
 
 * A unique name for all operations within a tower.
-@{tf.name_scope} provides
+`tf.name_scope` provides
 this unique name by prepending a scope. For instance, all operations in
 the first tower are prepended with `tower_0`, e.g. `tower_0/conv1/Conv2D`.
 
 * A preferred hardware device to run the operation within a tower.
-@{tf.device} specifies this. For
+`tf.device` specifies this. For
 instance, all operations in the first tower reside within `device('/device:GPU:0')`
 scope indicating that they should be run on the first GPU.
 
 All variables are pinned to the CPU and accessed via
-@{tf.get_variable}
+`tf.get_variable`
 in order to share them in a multi-GPU version.
-See how-to on @{$variables$Sharing Variables}.
+See how-to on [Sharing Variables](../../guide/variables.md).
 
 ### Launching and Training the Model on Multiple GPU cards
 
diff --git a/tensorflow/docs_src/tutorials/images/image_recognition.md b/tensorflow/docs_src/tutorials/images/image_recognition.md
index d545de7..52913b2 100644
--- a/tensorflow/docs_src/tutorials/images/image_recognition.md
+++ b/tensorflow/docs_src/tutorials/images/image_recognition.md
@@ -106,7 +106,7 @@
 
 Next, we need to compile the C++ binary that includes the code to load and run the graph.
 If you've followed
-@{$install_sources$the instructions to download the source installation of TensorFlow}
+[the instructions to download the source installation of TensorFlow](../../install/install_sources.md)
 for your platform, you should be able to build the example by
 running this command from your shell terminal:
 
@@ -253,7 +253,7 @@
   TF_RETURN_IF_ERROR(session->Run({}, {output_name}, {}, out_tensors));
   return Status::OK();
 ```
-Then we create a @{tf.Session}
+Then we create a `tf.Session`
 object, which is the interface to actually running the graph, and run it,
 specifying which node we want to get the output from, and where to put the
 output data.
@@ -448,7 +448,7 @@
 covering them.
 
 To find out more about implementing convolutional neural networks, you can jump
-to the TensorFlow @{$deep_cnn$deep convolutional networks tutorial},
+to the TensorFlow [deep convolutional networks tutorial](../../tutorials/images/deep_cnn.md),
 or start a bit more gently with our [Estimator MNIST tutorial](../estimators/cnn.md).
 Finally, if you want to get up to speed on research in this area, you can
 read the recent work of all the papers referenced in this tutorial.
diff --git a/tensorflow/docs_src/tutorials/representation/kernel_methods.md b/tensorflow/docs_src/tutorials/representation/kernel_methods.md
index f3c232c..67adc49 100644
--- a/tensorflow/docs_src/tutorials/representation/kernel_methods.md
+++ b/tensorflow/docs_src/tutorials/representation/kernel_methods.md
@@ -1,9 +1,8 @@
 # Improving Linear Models Using Explicit Kernel Methods
 
-Note: This document uses a deprecated version of @{tf.estimator},
-which has a @{tf.contrib.learn.Estimator$different interface}.
-It also uses other `contrib` methods whose
-@{$version_compat#not_covered$API may not be stable}.
+Note: This document uses a deprecated version of `tf.estimator`,
+`tf.contrib.learn.Estimator`, which has a different interface. It also uses
+other `contrib` methods whose [API may not be stable](../../guide/version_compat.md#not_covered).
 
 In this tutorial, we demonstrate how combining (explicit) kernel methods with
 linear models can drastically increase the latters' quality of predictions
@@ -53,7 +52,7 @@
 it to Tensors. For this, we will use an `input function` which adds Ops to the
 TensorFlow graph that, when executed, create mini-batches of Tensors to be used
 downstream. For more background on input functions, check
-@{$premade_estimators#create_input_functions$this section on input functions}.
+[this section on input functions](../../guide/premade_estimators.md#create_input_functions).
 In this example, we will use the `tf.train.shuffle_batch` Op which, besides
 converting numpy arrays to Tensors, allows us to specify the batch_size and
 whether to randomize the input every time the input_fn Ops are executed
@@ -90,7 +89,7 @@
 
 ## Training a simple linear model
 We can now train a linear model over the MNIST dataset. We will use the
-@{tf.contrib.learn.LinearClassifier} estimator with 10 classes representing the
+`tf.contrib.learn.LinearClassifier` estimator with 10 classes representing the
 10 digits. The input features form a 784-dimensional dense vector which can
 be specified as follows:
 
@@ -195,7 +194,7 @@
 for more details.
 
 ### Kernel classifier
-@{tf.contrib.kernel_methods.KernelLinearClassifier} is a pre-packaged
+`tf.contrib.kernel_methods.KernelLinearClassifier` is a pre-packaged
 `tf.contrib.learn` estimator that combines the power of explicit kernel mappings
 with linear models. Its constructor is almost identical to that of the
 LinearClassifier estimator with the additional option to specify a list of
diff --git a/tensorflow/docs_src/tutorials/representation/linear.md b/tensorflow/docs_src/tutorials/representation/linear.md
index 1b418cf..4f0e67f 100644
--- a/tensorflow/docs_src/tutorials/representation/linear.md
+++ b/tensorflow/docs_src/tutorials/representation/linear.md
@@ -1,6 +1,6 @@
 # Large-scale Linear Models with TensorFlow
 
-@{tf.estimator$Estimators} provides (among other things) a rich set of tools for
+`tf.estimator` provides (among other things) a rich set of tools for
 working with linear models in TensorFlow. This document provides an overview of
 those tools. It explains:
 
@@ -18,7 +18,7 @@
 
 To understand this overview it will help to have some familiarity
 with basic machine learning concepts, and also with
-@{$premade_estimators$Estimators}.
+[Estimators](../../guide/premade_estimators.md).
 
 [TOC]
 
@@ -175,7 +175,7 @@
 The input function must return a dictionary of tensors. Each key corresponds to
 the name of a `FeatureColumn`. Each key's value is a tensor containing the
 values of that feature for all data instances. See
-@{$premade_estimators#input_fn} for a
+[Premade Estimators](../../guide/premade_estimators.md#input_fn) for a
 more comprehensive look at input functions, and `input_fn` in the
 [wide and deep learning tutorial](https://github.com/tensorflow/models/tree/master/official/wide_deep)
 for an example implementation of an input function.
diff --git a/tensorflow/docs_src/tutorials/representation/word2vec.md b/tensorflow/docs_src/tutorials/representation/word2vec.md
index 0a1c41c..df0d317 100644
--- a/tensorflow/docs_src/tutorials/representation/word2vec.md
+++ b/tensorflow/docs_src/tutorials/representation/word2vec.md
@@ -317,7 +317,7 @@
 
 Training the model is then as simple as using a `feed_dict` to push data into
 the placeholders and calling
-@{tf.Session.run} with this new data
+`tf.Session.run` with this new data
 in a loop.
 
 ```python
@@ -383,13 +383,13 @@
 each of which require very little work on the TensorFlow back-end.  If you find
 your model is seriously bottlenecked on input data, you may want to implement a
 custom data reader for your problem, as described in
-@{$new_data_formats$New Data Formats}.  For the case of Skip-Gram
+[New Data Formats](../../extend/new_data_formats.md).  For the case of Skip-Gram
 modeling, we've actually already done this for you as an example in
 [models/tutorials/embedding/word2vec.py](https://github.com/tensorflow/models/tree/master/tutorials/embedding/word2vec.py).
 
 If your model is no longer I/O bound but you want still more performance, you
 can take things further by writing your own TensorFlow Ops, as described in
-@{$adding_an_op$Adding a New Op}.  Again we've provided an
+[Adding a New Op](../../extend/adding_an_op.md).  Again we've provided an
 example of this for the Skip-Gram case
 [models/tutorials/embedding/word2vec_optimized.py](https://github.com/tensorflow/models/tree/master/tutorials/embedding/word2vec_optimized.py).
 Feel free to benchmark these against each other to measure performance
diff --git a/tensorflow/docs_src/tutorials/sequences/recurrent.md b/tensorflow/docs_src/tutorials/sequences/recurrent.md
index 715cc78..39ad441 100644
--- a/tensorflow/docs_src/tutorials/sequences/recurrent.md
+++ b/tensorflow/docs_src/tutorials/sequences/recurrent.md
@@ -77,9 +77,7 @@
 words_in_dataset = tf.placeholder(tf.float32, [time_steps, batch_size, num_features])
 lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
 # Initial state of the LSTM memory.
-hidden_state = tf.zeros([batch_size, lstm.state_size])
-current_state = tf.zeros([batch_size, lstm.state_size])
-state = hidden_state, current_state
+state = lstm.zero_state(batch_size, dtype=tf.float32)
 probabilities = []
 loss = 0.0
 for current_batch_of_words in words_in_dataset:
@@ -112,7 +110,7 @@
 
 lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
 # Initial state of the LSTM memory.
-initial_state = state = tf.zeros([batch_size, lstm.state_size])
+initial_state = state = lstm.zero_state(batch_size, dtype=tf.float32)
 
 for i in range(num_steps):
     # The value of state is updated after processing each batch of words.
@@ -140,7 +138,7 @@
 ### Inputs
 
 The word IDs will be embedded into a dense representation (see the
-@{$word2vec$Vector Representations Tutorial}) before feeding to
+[Vector Representations Tutorial](../../tutorials/representation/word2vec.md)) before feeding to
 the LSTM. This allows the model to efficiently represent the knowledge about
 particular words. It is also easy to write:
 
diff --git a/tensorflow/docs_src/tutorials/sequences/recurrent_quickdraw.md b/tensorflow/docs_src/tutorials/sequences/recurrent_quickdraw.md
index 37bce5b..657fab8 100644
--- a/tensorflow/docs_src/tutorials/sequences/recurrent_quickdraw.md
+++ b/tensorflow/docs_src/tutorials/sequences/recurrent_quickdraw.md
@@ -32,7 +32,7 @@
 
 To try the code for this tutorial:
 
-1.  @{$install$Install TensorFlow} if you haven't already.
+1.  [Install TensorFlow](../../install/index.md) if you haven't already.
 1.  Download the [tutorial code]
 (https://github.com/tensorflow/models/tree/master/tutorials/rnn/quickdraw/train_model.py).
 1.  [Download the data](#download-the-data) in `TFRecord` format from
@@ -58,8 +58,7 @@
 
 We make the data that we use in this tutorial available as `TFRecord` files
 containing `TFExamples`. You can download the data from here:
-
-http://download.tensorflow.org/data/quickdraw_tutorial_dataset_v1.tar.gz
+<a rel="nofollow" href="http://download.tensorflow.org/data/quickdraw_tutorial_dataset_v1.tar.gz">http://download.tensorflow.org/data/quickdraw_tutorial_dataset_v1.tar.gz</a> (~1GB).
 
 Alternatively you can download the original data in `ndjson` format from the
 Google cloud and convert it to the `TFRecord` files containing `TFExamples`
@@ -108,7 +107,7 @@
 ### Optional: Converting the data
 
 To convert the `ndjson` files to
-@{$python/python_io#TFRecords_Format_Details$TFRecord} files containing
+[TFRecord](../../api_guides/python/python_io.md#TFRecords_Format_Details) files containing
 [`tf.train.Example`](https://www.tensorflow.org/code/tensorflow/core/example/example.proto)
 protos run the following command.
 
@@ -118,7 +117,7 @@
 ```
 
 This will store the data in 10 shards of
-@{$python/python_io#TFRecords_Format_Details$TFRecord} files with 10000 items
+[TFRecord](../../api_guides/python/python_io.md#TFRecords_Format_Details) files with 10000 items
 per class for the training data and 1000 items per class as eval data.
 
 This conversion process is described in more detail in the following.
@@ -220,7 +219,7 @@
 ### Defining the model
 
 To define the model we create a new `Estimator`. If you want to read more about
-estimators, we recommend @{$custom_estimators$this tutorial}.
+estimators, we recommend [this tutorial](../../guide/custom_estimators.md).
 
 To build the model, we:
 
diff --git a/tensorflow/examples/android/.gitignore b/tensorflow/examples/android/.gitignore
new file mode 100644
index 0000000..d245ab6
--- /dev/null
+++ b/tensorflow/examples/android/.gitignore
@@ -0,0 +1,29 @@
+# This file is based on https://github.com/github/gitignore/blob/master/Android.gitignore
+*.iml
+.idea/compiler.xml
+.idea/copyright
+.idea/dictionaries
+.idea/gradle.xml
+.idea/libraries
+.idea/inspectionProfiles
+.idea/misc.xml
+.idea/modules.xml
+.idea/runConfigurations.xml
+.idea/tasks.xml
+.idea/workspace.xml
+.gradle
+local.properties
+.DS_Store
+build/
+gradleBuild/
+*.apk
+*.ap_
+*.dex
+*.class
+bin/
+gen/
+out/
+*.log
+.navigation/
+/captures
+.externalNativeBuild
diff --git a/tensorflow/examples/android/README.md b/tensorflow/examples/android/README.md
index 30a26d1..dac9b7a 100644
--- a/tensorflow/examples/android/README.md
+++ b/tensorflow/examples/android/README.md
@@ -45,11 +45,7 @@
 
 ## Prebuilt Components:
 
-If you just want the fastest path to trying the demo, you may download the
-nightly build
-[here](https://ci.tensorflow.org/view/Nightly/job/nightly-android/). Expand the
-"View" and then the "out" folders under "Last Successful Artifacts" to find
-tensorflow_demo.apk.
+The fastest path to trying the demo is to download the [prebuilt demo APK](http://download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk).
 
 Also available are precompiled native libraries, and a jcenter package that you
 may simply drop into your own applications. See
@@ -113,8 +109,7 @@
 
 NOTE: Bazel does not currently support building for Android on Windows. Full
 support for gradle/cmake builds is coming soon, but in the meantime we suggest
-that Windows users download the [prebuilt
-binaries](https://ci.tensorflow.org/view/Nightly/job/nightly-android/) instead.
+that Windows users download the [prebuilt demo APK](http://download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk) instead.
 
 ##### Install Bazel and Android Prerequisites
 
diff --git a/tensorflow/examples/ios/README.md b/tensorflow/examples/ios/README.md
index 5d7bd36..64412d2 100644
--- a/tensorflow/examples/ios/README.md
+++ b/tensorflow/examples/ios/README.md
@@ -190,8 +190,5 @@
 "Other Linker Flags" used in the Simple Xcode project settings to strip the
 executable.
 
-After that, you can manually look at modifying the list of kernels
-included in tensorflow/contrib/makefile/tf_op_files.txt to reduce the number of
-implementations to the ones you're actually using in your own model. We're
-hoping to automate this step in the future, but for now manually removing them
-is the best approach.
+For further optimization, please refer to the ["Optimization" section](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/makefile#optimization)
+of the makefile instructions.
diff --git a/tensorflow/g3doc/README.txt b/tensorflow/g3doc/README.txt
index ed648f8..515a9e9 100644
--- a/tensorflow/g3doc/README.txt
+++ b/tensorflow/g3doc/README.txt
@@ -22,12 +22,12 @@
 at least for docs coming from Python docstrings or
 tensorflow/docs_src/.  Use:
 
-* @{tf.symbol} to make a link to the reference page for a Python
+* `tf.symbol` to make a link to the reference page for a Python
   symbol.  Note that class members don't get their own page, but the
-  syntax still works, since @{tf.MyClass.method} links to the right
+  syntax still works, since `tf.MyClass.method` links to the right
   part of the tf.MyClass page.
 
-* @{tensorflow::symbol} to make a link to the reference page for a C++
+* `tensorflow::symbol` to make a link to the reference page for a C++
   symbol. (This only works for a few symbols but will work for more soon.)
 
 * @{$doc_page} to make a link to another (not an API reference) doc
diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index ca1521e..3775af4 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -2618,70 +2618,6 @@
 	return op.Output(0)
 }
 
-// Copy a tensor setting everything outside a central band in each innermost matrix
-//
-// to zero.
-//
-// The `band` part is computed as follows:
-// Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a
-// tensor with the same shape where
-//
-// `band[i, j, k, ..., m, n] = in_band(m, n) * input[i, j, k, ..., m, n]`.
-//
-// The indicator function
-//
-// `in_band(m, n) = (num_lower < 0 || (m-n) <= num_lower)) &&
-//                  (num_upper < 0 || (n-m) <= num_upper)`.
-//
-// For example:
-//
-// ```
-// # if 'input' is [[ 0,  1,  2, 3]
-//                  [-1,  0,  1, 2]
-//                  [-2, -1,  0, 1]
-//                  [-3, -2, -1, 0]],
-//
-// tf.matrix_band_part(input, 1, -1) ==> [[ 0,  1,  2, 3]
-//                                        [-1,  0,  1, 2]
-//                                        [ 0, -1,  0, 1]
-//                                        [ 0,  0, -1, 0]],
-//
-// tf.matrix_band_part(input, 2, 1) ==> [[ 0,  1,  0, 0]
-//                                       [-1,  0,  1, 0]
-//                                       [-2, -1,  0, 1]
-//                                       [ 0, -2, -1, 0]]
-// ```
-//
-// Useful special cases:
-//
-// ```
-//  tf.matrix_band_part(input, 0, -1) ==> Upper triangular part.
-//  tf.matrix_band_part(input, -1, 0) ==> Lower triangular part.
-//  tf.matrix_band_part(input, 0, 0) ==> Diagonal.
-// ```
-//
-// Arguments:
-//	input: Rank `k` tensor.
-//	num_lower: 0-D tensor. Number of subdiagonals to keep. If negative, keep entire
-// lower triangle.
-//	num_upper: 0-D tensor. Number of superdiagonals to keep. If negative, keep
-// entire upper triangle.
-//
-// Returns Rank `k` tensor of the same shape as input. The extracted banded tensor.
-func MatrixBandPart(scope *Scope, input tf.Output, num_lower tf.Output, num_upper tf.Output) (band tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "MatrixBandPart",
-		Input: []tf.Input{
-			input, num_lower, num_upper,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Returns the batched diagonal part of a batched tensor.
 //
 // This operation returns a tensor with the `diagonal` part
@@ -3383,6 +3319,42 @@
 	return op.Output(0)
 }
 
+// Elementwise computes the bitwise left-shift of `x` and `y`.
+//
+// If `y` is negative, or greater than or equal to the width of `x` in bits, the
+// result is implementation defined.
+func LeftShift(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "LeftShift",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Elementwise computes the bitwise XOR of `x` and `y`.
+//
+// The result will have those bits set, that are different in `x` and `y`. The
+// computation is performed on the underlying representations of `x` and `y`.
+func BitwiseXor(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BitwiseXor",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Computes the mean along sparse segments of a tensor.
 //
 // Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
@@ -4065,64 +4037,76 @@
 	return op.Output(0)
 }
 
-// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
+// FusedBatchNormAttr is an optional argument to FusedBatchNorm.
+type FusedBatchNormAttr func(optionalAttr)
+
+// FusedBatchNormEpsilon sets the optional epsilon attribute to value.
 //
-// N is the size of the segment being reduced.
-//
-// Like `SparseSegmentSqrtN`, but allows missing ids in `segment_ids`. If an id is
-// misisng, the `output` tensor at that position will be zeroed.
-//
-// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
-// segments.
-//
-// Arguments:
-//
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
-//	num_segments: Should equal the number of distinct segment IDs.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SparseSegmentSqrtNWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: A small float number added to the variance of x.
+// If not specified, defaults to 0.0001
+func FusedBatchNormEpsilon(value float32) FusedBatchNormAttr {
+	return func(m optionalAttr) {
+		m["epsilon"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "SparseSegmentSqrtNWithNumSegments",
-		Input: []tf.Input{
-			data, indices, segment_ids, num_segments,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Compute the upper regularized incomplete Gamma function `Q(a, x)`.
+// FusedBatchNormDataFormat sets the optional data_format attribute to value.
 //
-// The upper regularized incomplete Gamma function is defined as:
+// value: The data format for x and y. Either "NHWC" (default) or "NCHW".
+// If not specified, defaults to "NHWC"
+func FusedBatchNormDataFormat(value string) FusedBatchNormAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// FusedBatchNormIsTraining sets the optional is_training attribute to value.
 //
-// \\(Q(a, x) = Gamma(a, x) / Gamma(a) = 1 - P(a, x)\\)
+// value: A bool value to indicate the operation is for training (default)
+// or inference.
+// If not specified, defaults to true
+func FusedBatchNormIsTraining(value bool) FusedBatchNormAttr {
+	return func(m optionalAttr) {
+		m["is_training"] = value
+	}
+}
+
+// Batch normalization.
 //
-// where
+// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
+// The size of 1D Tensors matches the dimension C of the 4D Tensors.
 //
-// \\(Gamma(a, x) = int_{x}^{\infty} t^{a-1} exp(-t) dt\\)
+// Arguments:
+//	x: A 4D Tensor for input data.
+//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
+//	offset: A 1D Tensor for offset, to shift to the normalized x.
+//	mean: A 1D Tensor for population mean. Used for inference only;
+// must be empty for training.
+//	variance: A 1D Tensor for population variance. Used for inference only;
+// must be empty for training.
 //
-// is the upper incomplete Gama function.
-//
-// Note, above `P(a, x)` (`Igamma`) is the lower regularized complete
-// Gamma function.
-func Igammac(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+// Returns A 4D Tensor for output data.A 1D Tensor for the computed batch mean, to be used by TensorFlow
+// to compute the running mean.A 1D Tensor for the computed batch variance, to be used by
+// TensorFlow to compute the running variance.A 1D Tensor for the computed batch mean, to be reused
+// in the gradient computation.A 1D Tensor for the computed batch variance (inverted variance
+// in the cuDNN case), to be reused in the gradient computation.
+func FusedBatchNorm(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormAttr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Igammac",
+		Type: "FusedBatchNorm",
 		Input: []tf.Input{
-			a, x,
+			x, scale, offset, mean, variance,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
 // ApproximateEqualAttr is an optional argument to ApproximateEqual.
@@ -8435,139 +8419,6 @@
 	return op.Output(0)
 }
 
-// DepthwiseConv2dNativeBackpropFilterAttr is an optional argument to DepthwiseConv2dNativeBackpropFilter.
-type DepthwiseConv2dNativeBackpropFilterAttr func(optionalAttr)
-
-// DepthwiseConv2dNativeBackpropFilterDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, height, width, channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, channels, height, width].
-// If not specified, defaults to "NHWC"
-func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2dNativeBackpropFilterAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// DepthwiseConv2dNativeBackpropFilterDilations sets the optional dilations attribute to value.
-//
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
-// element on that dimension. The dimension order is determined by the value of
-// `data_format`, see above for details. Dilations in the batch and depth
-// dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
-	}
-}
-
-// Computes the gradients of depthwise convolution with respect to the filter.
-//
-// Arguments:
-//	input: 4-D with shape based on `data_format`.  For example, if
-// `data_format` is 'NHWC' then `input` is a 4-D `[batch, in_height,
-// in_width, in_channels]` tensor.
-//	filter_sizes: An integer vector representing the tensor shape of `filter`,
-// where `filter` is a 4-D
-// `[filter_height, filter_width, in_channels, depthwise_multiplier]` tensor.
-//	out_backprop: 4-D with shape  based on `data_format`.
-// For example, if `data_format` is 'NHWC' then
-// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
-// Gradients w.r.t. the output of the convolution.
-//	strides: The stride of the sliding window for each dimension of the input
-// of the convolution.
-//	padding: The type of padding algorithm to use.
-//
-// Returns 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
-// the `filter` input of the convolution.
-func DepthwiseConv2dNativeBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropFilterAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "DepthwiseConv2dNativeBackpropFilter",
-		Input: []tf.Input{
-			input, filter_sizes, out_backprop,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns immutable tensor from memory region.
-//
-// The current implementation memmaps the tensor from a file.
-//
-// Arguments:
-//	dtype: Type of the returned tensor.
-//	shape: Shape of the returned tensor.
-//	memory_region_name: Name of readonly memory region used by the tensor, see
-// NewReadOnlyMemoryRegionFromFile in tensorflow::Env.
-func ImmutableConst(scope *Scope, dtype tf.DataType, shape tf.Shape, memory_region_name string) (tensor tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype, "shape": shape, "memory_region_name": memory_region_name}
-	opspec := tf.OpSpec{
-		Type: "ImmutableConst",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// StringJoinAttr is an optional argument to StringJoin.
-type StringJoinAttr func(optionalAttr)
-
-// StringJoinSeparator sets the optional separator attribute to value.
-//
-// value: string, an optional join separator.
-// If not specified, defaults to ""
-func StringJoinSeparator(value string) StringJoinAttr {
-	return func(m optionalAttr) {
-		m["separator"] = value
-	}
-}
-
-// Joins the strings in the given list of string tensors into one tensor;
-//
-// with the given separator (default is an empty separator).
-//
-// Arguments:
-//	inputs: A list of string tensors.  The tensors must all have the same shape,
-// or be scalars.  Scalars may be mixed in; these will be broadcast to the shape
-// of non-scalar inputs.
-func StringJoin(scope *Scope, inputs []tf.Output, optional ...StringJoinAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "StringJoin",
-		Input: []tf.Input{
-			tf.OutputList(inputs),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // ResourceApplyFtrlAttr is an optional argument to ResourceApplyFtrl.
 type ResourceApplyFtrlAttr func(optionalAttr)
 
@@ -9512,34 +9363,216 @@
 	return op.Output(0)
 }
 
-// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
+// TruncatedNormalAttr is an optional argument to TruncatedNormal.
+type TruncatedNormalAttr func(optionalAttr)
+
+// TruncatedNormalSeed sets the optional seed attribute to value.
 //
-// N is the size of the segment being reduced.
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func TruncatedNormalSeed(value int64) TruncatedNormalAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// TruncatedNormalSeed2 sets the optional seed2 attribute to value.
 //
-// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
-// segments.
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func TruncatedNormalSeed2(value int64) TruncatedNormalAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Outputs random values from a truncated normal distribution.
+//
+// The generated values follow a normal distribution with mean 0 and standard
+// deviation 1, except that values whose magnitude is more than 2 standard
+// deviations from the mean are dropped and re-picked.
 //
 // Arguments:
+//	shape: The shape of the output tensor.
+//	dtype: The type of the output.
 //
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SparseSegmentSqrtN(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
+// Returns A tensor of the specified shape filled with random truncated normal
+// values.
+func TruncatedNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...TruncatedNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentSqrtN",
+		Type: "TruncatedNormal",
 		Input: []tf.Input{
-			data, indices, segment_ids,
+			shape,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
+// SkipgramAttr is an optional argument to Skipgram.
+type SkipgramAttr func(optionalAttr)
+
+// SkipgramWindowSize sets the optional window_size attribute to value.
+//
+// value: The number of words to predict to the left and right of the target.
+// If not specified, defaults to 5
+func SkipgramWindowSize(value int64) SkipgramAttr {
+	return func(m optionalAttr) {
+		m["window_size"] = value
+	}
+}
+
+// SkipgramMinCount sets the optional min_count attribute to value.
+//
+// value: The minimum number of word occurrences for it to be included in the
+// vocabulary.
+// If not specified, defaults to 5
+func SkipgramMinCount(value int64) SkipgramAttr {
+	return func(m optionalAttr) {
+		m["min_count"] = value
+	}
+}
+
+// SkipgramSubsample sets the optional subsample attribute to value.
+//
+// value: Threshold for word occurrence. Words that appear with higher
+// frequency will be randomly down-sampled. Set to 0 to disable.
+// If not specified, defaults to 0.001
+func SkipgramSubsample(value float32) SkipgramAttr {
+	return func(m optionalAttr) {
+		m["subsample"] = value
+	}
+}
+
+// Parses a text file and creates a batch of examples.
+//
+// DEPRECATED at GraphDef version 19: Moving word2vec into tensorflow_models/tutorials and deprecating its ops here as a result
+//
+// Arguments:
+//	filename: The corpus's text file name.
+//	batch_size: The size of produced batch.
+//
+// Returns A vector of words in the corpus.Frequencies of words. Sorted in the non-ascending order.Number of words per epoch in the data file.The current epoch number.The total number of words processed so far.A vector of word ids.A vector of word ids.
+func Skipgram(scope *Scope, filename string, batch_size int64, optional ...SkipgramAttr) (vocab_word tf.Output, vocab_freq tf.Output, words_per_epoch tf.Output, current_epoch tf.Output, total_words_processed tf.Output, examples tf.Output, labels tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"filename": filename, "batch_size": batch_size}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Skipgram",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4), op.Output(5), op.Output(6)
+}
+
+// StringToNumberAttr is an optional argument to StringToNumber.
+type StringToNumberAttr func(optionalAttr)
+
+// StringToNumberOutType sets the optional out_type attribute to value.
+//
+// value: The numeric type to interpret each string in `string_tensor` as.
+// If not specified, defaults to DT_FLOAT
+func StringToNumberOutType(value tf.DataType) StringToNumberAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Converts each string in the input Tensor to the specified numeric type.
+//
+// (Note that int32 overflow results in an error while float overflow
+// results in a rounded value.)
+//
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToNumber(scope *Scope, string_tensor tf.Output, optional ...StringToNumberAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StringToNumber",
+		Input: []tf.Input{
+			string_tensor,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceApplyFtrlV2Attr is an optional argument to ResourceApplyFtrlV2.
+type ResourceApplyFtrlV2Attr func(optionalAttr)
+
+// ResourceApplyFtrlV2UseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyFtrlV2UseLocking(value bool) ResourceApplyFtrlV2Attr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the Ftrl-proximal scheme.
+//
+// grad_with_shrinkage = grad + 2 * l2_shrinkage * var
+// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
+// linear += grad_with_shrinkage +
+//     (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	linear: Should be from a Variable().
+//	grad: The gradient.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 shrinkage regularization. Must be a scalar.
+//
+//	lr_power: Scaling factor. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlV2Attr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceApplyFtrlV2",
+		Input: []tf.Input{
+			var_, accum, linear, grad, lr, l1, l2, l2_shrinkage, lr_power,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
 // Adds up a `SparseTensor` and a dense `Tensor`, producing a dense `Tensor`.
 //
 // This Op does not require `a_indices` be sorted in standard lexicographic order.
@@ -9840,6 +9873,139 @@
 	return op.Output(0)
 }
 
+// DepthwiseConv2dNativeBackpropFilterAttr is an optional argument to DepthwiseConv2dNativeBackpropFilter.
+type DepthwiseConv2dNativeBackpropFilterAttr func(optionalAttr)
+
+// DepthwiseConv2dNativeBackpropFilterDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, height, width, channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, channels, height, width].
+// If not specified, defaults to "NHWC"
+func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2dNativeBackpropFilterAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// DepthwiseConv2dNativeBackpropFilterDilations sets the optional dilations attribute to value.
+//
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+// element on that dimension. The dimension order is determined by the value of
+// `data_format`, see above for details. Dilations in the batch and depth
+// dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes the gradients of depthwise convolution with respect to the filter.
+//
+// Arguments:
+//	input: 4-D with shape based on `data_format`.  For example, if
+// `data_format` is 'NHWC' then `input` is a 4-D `[batch, in_height,
+// in_width, in_channels]` tensor.
+//	filter_sizes: An integer vector representing the tensor shape of `filter`,
+// where `filter` is a 4-D
+// `[filter_height, filter_width, in_channels, depthwise_multiplier]` tensor.
+//	out_backprop: 4-D with shape  based on `data_format`.
+// For example, if `data_format` is 'NHWC' then
+// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
+// Gradients w.r.t. the output of the convolution.
+//	strides: The stride of the sliding window for each dimension of the input
+// of the convolution.
+//	padding: The type of padding algorithm to use.
+//
+// Returns 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
+// the `filter` input of the convolution.
+func DepthwiseConv2dNativeBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropFilterAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "DepthwiseConv2dNativeBackpropFilter",
+		Input: []tf.Input{
+			input, filter_sizes, out_backprop,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns immutable tensor from memory region.
+//
+// The current implementation memmaps the tensor from a file.
+//
+// Arguments:
+//	dtype: Type of the returned tensor.
+//	shape: Shape of the returned tensor.
+//	memory_region_name: Name of readonly memory region used by the tensor, see
+// NewReadOnlyMemoryRegionFromFile in tensorflow::Env.
+func ImmutableConst(scope *Scope, dtype tf.DataType, shape tf.Shape, memory_region_name string) (tensor tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype, "shape": shape, "memory_region_name": memory_region_name}
+	opspec := tf.OpSpec{
+		Type: "ImmutableConst",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// StringJoinAttr is an optional argument to StringJoin.
+type StringJoinAttr func(optionalAttr)
+
+// StringJoinSeparator sets the optional separator attribute to value.
+//
+// value: string, an optional join separator.
+// If not specified, defaults to ""
+func StringJoinSeparator(value string) StringJoinAttr {
+	return func(m optionalAttr) {
+		m["separator"] = value
+	}
+}
+
+// Joins the strings in the given list of string tensors into one tensor;
+//
+// with the given separator (default is an empty separator).
+//
+// Arguments:
+//	inputs: A list of string tensors.  The tensors must all have the same shape,
+// or be scalars.  Scalars may be mixed in; these will be broadcast to the shape
+// of non-scalar inputs.
+func StringJoin(scope *Scope, inputs []tf.Output, optional ...StringJoinAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StringJoin",
+		Input: []tf.Input{
+			tf.OutputList(inputs),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // StringSplitV2Attr is an optional argument to StringSplitV2.
 type StringSplitV2Attr func(optionalAttr)
 
@@ -10013,6 +10179,24 @@
 	return op.Output(0)
 }
 
+// Elementwise computes the bitwise AND of `x` and `y`.
+//
+// The result will have those bits set, that are set in both `x` and `y`. The
+// computation is performed on the underlying representations of `x` and `y`.
+func BitwiseAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BitwiseAnd",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Concatenates quantized tensors along one dimension.
 //
 // Arguments:
@@ -12594,6 +12778,65 @@
 	return scope.AddOperation(opspec)
 }
 
+// Elementwise computes the bitwise right-shift of `x` and `y`.
+//
+// Performs a logical shift for unsigned integer types, and an arithmetic shift
+// for signed integer types.
+//
+// If `y` is negative, or greater than or equal to the width of `x` in bits,
+// the result is implementation defined.
+func RightShift(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "RightShift",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// TensorListStackAttr is an optional argument to TensorListStack.
+type TensorListStackAttr func(optionalAttr)
+
+// TensorListStackNumElements sets the optional num_elements attribute to value.
+// If not specified, defaults to -1
+func TensorListStackNumElements(value int64) TensorListStackAttr {
+	return func(m optionalAttr) {
+		m["num_elements"] = value
+	}
+}
+
+// Stacks all tensors in the list.
+//
+// Requires that all tensors have the same shape.
+//
+// input_handle: the input list
+// tensor: the gathered result
+// num_elements: optional. If not -1, the number of elements in the list.
+//
+func TensorListStack(scope *Scope, input_handle tf.Output, element_dtype tf.DataType, optional ...TensorListStackAttr) (tensor tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorListStack",
+		Input: []tf.Input{
+			input_handle,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // StatelessRandomUniformAttr is an optional argument to StatelessRandomUniform.
 type StatelessRandomUniformAttr func(optionalAttr)
 
@@ -12670,24 +12913,6 @@
 	return op.Output(0)
 }
 
-// Elementwise computes the bitwise XOR of `x` and `y`.
-//
-// The result will have those bits set, that are different in `x` and `y`. The
-// computation is performed on the underlying representations of `x` and `y`.
-func BitwiseXor(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "BitwiseXor",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Deserialize `SparseTensor` objects.
 //
 // The input `serialized_sparse` must have the shape `[?, ?, ..., ?, 3]` where
@@ -13962,6 +14187,24 @@
 	return op.Output(0)
 }
 
+// Elementwise computes the bitwise OR of `x` and `y`.
+//
+// The result will have those bits set, that are set in `x`, `y` or both. The
+// computation is performed on the underlying representations of `x` and `y`.
+func BitwiseOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BitwiseOr",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // MatrixSolveLsAttr is an optional argument to MatrixSolveLs.
 type MatrixSolveLsAttr func(optionalAttr)
 
@@ -14039,24 +14282,6 @@
 	return op.Output(0)
 }
 
-// Elementwise computes the bitwise OR of `x` and `y`.
-//
-// The result will have those bits set, that are set in `x`, `y` or both. The
-// computation is performed on the underlying representations of `x` and `y`.
-func BitwiseOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "BitwiseOr",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // MaxPool3DAttr is an optional argument to MaxPool3D.
 type MaxPool3DAttr func(optionalAttr)
 
@@ -16049,78 +16274,6 @@
 	return op.Output(0)
 }
 
-// FusedBatchNormAttr is an optional argument to FusedBatchNorm.
-type FusedBatchNormAttr func(optionalAttr)
-
-// FusedBatchNormEpsilon sets the optional epsilon attribute to value.
-//
-// value: A small float number added to the variance of x.
-// If not specified, defaults to 0.0001
-func FusedBatchNormEpsilon(value float32) FusedBatchNormAttr {
-	return func(m optionalAttr) {
-		m["epsilon"] = value
-	}
-}
-
-// FusedBatchNormDataFormat sets the optional data_format attribute to value.
-//
-// value: The data format for x and y. Either "NHWC" (default) or "NCHW".
-// If not specified, defaults to "NHWC"
-func FusedBatchNormDataFormat(value string) FusedBatchNormAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// FusedBatchNormIsTraining sets the optional is_training attribute to value.
-//
-// value: A bool value to indicate the operation is for training (default)
-// or inference.
-// If not specified, defaults to true
-func FusedBatchNormIsTraining(value bool) FusedBatchNormAttr {
-	return func(m optionalAttr) {
-		m["is_training"] = value
-	}
-}
-
-// Batch normalization.
-//
-// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-// The size of 1D Tensors matches the dimension C of the 4D Tensors.
-//
-// Arguments:
-//	x: A 4D Tensor for input data.
-//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
-//	offset: A 1D Tensor for offset, to shift to the normalized x.
-//	mean: A 1D Tensor for population mean. Used for inference only;
-// must be empty for training.
-//	variance: A 1D Tensor for population variance. Used for inference only;
-// must be empty for training.
-//
-// Returns A 4D Tensor for output data.A 1D Tensor for the computed batch mean, to be used by TensorFlow
-// to compute the running mean.A 1D Tensor for the computed batch variance, to be used by
-// TensorFlow to compute the running variance.A 1D Tensor for the computed batch mean, to be reused
-// in the gradient computation.A 1D Tensor for the computed batch variance (inverted variance
-// in the cuDNN case), to be reused in the gradient computation.
-func FusedBatchNorm(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormAttr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "FusedBatchNorm",
-		Input: []tf.Input{
-			x, scale, offset, mean, variance,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
-}
-
 // RandomStandardNormalAttr is an optional argument to RandomStandardNormal.
 type RandomStandardNormalAttr func(optionalAttr)
 
@@ -16929,216 +17082,6 @@
 	return op.Output(0)
 }
 
-// SkipgramAttr is an optional argument to Skipgram.
-type SkipgramAttr func(optionalAttr)
-
-// SkipgramWindowSize sets the optional window_size attribute to value.
-//
-// value: The number of words to predict to the left and right of the target.
-// If not specified, defaults to 5
-func SkipgramWindowSize(value int64) SkipgramAttr {
-	return func(m optionalAttr) {
-		m["window_size"] = value
-	}
-}
-
-// SkipgramMinCount sets the optional min_count attribute to value.
-//
-// value: The minimum number of word occurrences for it to be included in the
-// vocabulary.
-// If not specified, defaults to 5
-func SkipgramMinCount(value int64) SkipgramAttr {
-	return func(m optionalAttr) {
-		m["min_count"] = value
-	}
-}
-
-// SkipgramSubsample sets the optional subsample attribute to value.
-//
-// value: Threshold for word occurrence. Words that appear with higher
-// frequency will be randomly down-sampled. Set to 0 to disable.
-// If not specified, defaults to 0.001
-func SkipgramSubsample(value float32) SkipgramAttr {
-	return func(m optionalAttr) {
-		m["subsample"] = value
-	}
-}
-
-// Parses a text file and creates a batch of examples.
-//
-// DEPRECATED at GraphDef version 19: Moving word2vec into tensorflow_models/tutorials and deprecating its ops here as a result
-//
-// Arguments:
-//	filename: The corpus's text file name.
-//	batch_size: The size of produced batch.
-//
-// Returns A vector of words in the corpus.Frequencies of words. Sorted in the non-ascending order.Number of words per epoch in the data file.The current epoch number.The total number of words processed so far.A vector of word ids.A vector of word ids.
-func Skipgram(scope *Scope, filename string, batch_size int64, optional ...SkipgramAttr) (vocab_word tf.Output, vocab_freq tf.Output, words_per_epoch tf.Output, current_epoch tf.Output, total_words_processed tf.Output, examples tf.Output, labels tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"filename": filename, "batch_size": batch_size}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Skipgram",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4), op.Output(5), op.Output(6)
-}
-
-// StringToNumberAttr is an optional argument to StringToNumber.
-type StringToNumberAttr func(optionalAttr)
-
-// StringToNumberOutType sets the optional out_type attribute to value.
-//
-// value: The numeric type to interpret each string in `string_tensor` as.
-// If not specified, defaults to DT_FLOAT
-func StringToNumberOutType(value tf.DataType) StringToNumberAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
-}
-
-// Converts each string in the input Tensor to the specified numeric type.
-//
-// (Note that int32 overflow results in an error while float overflow
-// results in a rounded value.)
-//
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToNumber(scope *Scope, string_tensor tf.Output, optional ...StringToNumberAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "StringToNumber",
-		Input: []tf.Input{
-			string_tensor,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResourceApplyFtrlV2Attr is an optional argument to ResourceApplyFtrlV2.
-type ResourceApplyFtrlV2Attr func(optionalAttr)
-
-// ResourceApplyFtrlV2UseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyFtrlV2UseLocking(value bool) ResourceApplyFtrlV2Attr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the Ftrl-proximal scheme.
-//
-// grad_with_shrinkage = grad + 2 * l2_shrinkage * var
-// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
-// linear += grad_with_shrinkage +
-//     (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-// accum = accum_new
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	linear: Should be from a Variable().
-//	grad: The gradient.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regulariation. Must be a scalar.
-//	l2: L2 shrinkage regulariation. Must be a scalar.
-//
-//	lr_power: Scaling factor. Must be a scalar.
-//
-// Returns the created operation.
-func ResourceApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlV2Attr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceApplyFtrlV2",
-		Input: []tf.Input{
-			var_, accum, linear, grad, lr, l1, l2, l2_shrinkage, lr_power,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// TruncatedNormalAttr is an optional argument to TruncatedNormal.
-type TruncatedNormalAttr func(optionalAttr)
-
-// TruncatedNormalSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func TruncatedNormalSeed(value int64) TruncatedNormalAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// TruncatedNormalSeed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func TruncatedNormalSeed2(value int64) TruncatedNormalAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Outputs random values from a truncated normal distribution.
-//
-// The generated values follow a normal distribution with mean 0 and standard
-// deviation 1, except that values whose magnitude is more than 2 standard
-// deviations from the mean are dropped and re-picked.
-//
-// Arguments:
-//	shape: The shape of the output tensor.
-//	dtype: The type of the output.
-//
-// Returns A tensor of the specified shape filled with random truncated normal
-// values.
-func TruncatedNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...TruncatedNormalAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "TruncatedNormal",
-		Input: []tf.Input{
-			shape,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // MutableDenseHashTableV2Attr is an optional argument to MutableDenseHashTableV2.
 type MutableDenseHashTableV2Attr func(optionalAttr)
 
@@ -19393,6 +19336,70 @@
 	return op.Output(0), op.Output(1)
 }
 
+// Copy a tensor setting everything outside a central band in each innermost matrix
+//
+// to zero.
+//
+// The `band` part is computed as follows:
+// Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a
+// tensor with the same shape where
+//
+// `band[i, j, k, ..., m, n] = in_band(m, n) * input[i, j, k, ..., m, n]`.
+//
+// The indicator function
+//
+// `in_band(m, n) = (num_lower < 0 || (m-n) <= num_lower) &&
+//                  (num_upper < 0 || (n-m) <= num_upper)`.
+//
+// For example:
+//
+// ```
+// # if 'input' is [[ 0,  1,  2, 3]
+//                  [-1,  0,  1, 2]
+//                  [-2, -1,  0, 1]
+//                  [-3, -2, -1, 0]],
+//
+// tf.matrix_band_part(input, 1, -1) ==> [[ 0,  1,  2, 3]
+//                                        [-1,  0,  1, 2]
+//                                        [ 0, -1,  0, 1]
+//                                        [ 0,  0, -1, 0]],
+//
+// tf.matrix_band_part(input, 2, 1) ==> [[ 0,  1,  0, 0]
+//                                       [-1,  0,  1, 0]
+//                                       [-2, -1,  0, 1]
+//                                       [ 0, -2, -1, 0]]
+// ```
+//
+// Useful special cases:
+//
+// ```
+//  tf.matrix_band_part(input, 0, -1) ==> Upper triangular part.
+//  tf.matrix_band_part(input, -1, 0) ==> Lower triangular part.
+//  tf.matrix_band_part(input, 0, 0) ==> Diagonal.
+// ```
+//
+// Arguments:
+//	input: Rank `k` tensor.
+//	num_lower: 0-D tensor. Number of subdiagonals to keep. If negative, keep entire
+// lower triangle.
+//	num_upper: 0-D tensor. Number of superdiagonals to keep. If negative, keep
+// entire upper triangle.
+//
+// Returns Rank `k` tensor of the same shape as input. The extracted banded tensor.
+func MatrixBandPart(scope *Scope, input tf.Output, num_lower tf.Output, num_upper tf.Output) (band tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "MatrixBandPart",
+		Input: []tf.Input{
+			input, num_lower, num_upper,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // SumAttr is an optional argument to Sum.
 type SumAttr func(optionalAttr)
 
@@ -20531,6 +20538,94 @@
 	return op.Output(0)
 }
 
+// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
+//
+// N is the size of the segment being reduced.
+//
+// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
+// segments.
+//
+// Arguments:
+//
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SparseSegmentSqrtN(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseSegmentSqrtN",
+		Input: []tf.Input{
+			data, indices, segment_ids,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Compute the upper regularized incomplete Gamma function `Q(a, x)`.
+//
+// The upper regularized incomplete Gamma function is defined as:
+//
+// \\(Q(a, x) = Gamma(a, x) / Gamma(a) = 1 - P(a, x)\\)
+//
+// where
+//
+// \\(Gamma(a, x) = int_{x}^{\infty} t^{a-1} exp(-t) dt\\)
+//
+// is the upper incomplete Gamma function.
+//
+// Note, above `P(a, x)` (`Igamma`) is the lower regularized incomplete
+// Gamma function.
+func Igammac(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Igammac",
+		Input: []tf.Input{
+			a, x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
+//
+// N is the size of the segment being reduced.
+//
+// Like `SparseSegmentSqrtN`, but allows missing ids in `segment_ids`. If an id is
+// missing, the `output` tensor at that position will be zeroed.
+//
+// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
+// segments.
+//
+// Arguments:
+//
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//	num_segments: Should equal the number of distinct segment IDs.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SparseSegmentSqrtNWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseSegmentSqrtNWithNumSegments",
+		Input: []tf.Input{
+			data, indices, segment_ids, num_segments,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Computes gradients for SparseSegmentSqrtN.
 //
 // Returns tensor "output" with same shape as grad, except for dimension 0 whose
@@ -31821,98 +31916,3 @@
 	}
 	return scope.AddOperation(opspec)
 }
-
-// Elementwise computes the bitwise AND of `x` and `y`.
-//
-// The result will have those bits set, that are set in both `x` and `y`. The
-// computation is performed on the underlying representations of `x` and `y`.
-func BitwiseAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "BitwiseAnd",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Elementwise computes the bitwise left-shift of `x` and `y`.
-//
-// If `y` is negative, or greater than or equal to the width of `x` in bits the
-// result is implementation defined.
-func LeftShift(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "LeftShift",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// TensorListStackAttr is an optional argument to TensorListStack.
-type TensorListStackAttr func(optionalAttr)
-
-// TensorListStackNumElements sets the optional num_elements attribute to value.
-// If not specified, defaults to -1
-func TensorListStackNumElements(value int64) TensorListStackAttr {
-	return func(m optionalAttr) {
-		m["num_elements"] = value
-	}
-}
-
-// Stacks all tensors in the list.
-//
-// Requires that all tensors have the same shape.
-//
-// input_handle: the input list
-// tensor: the gathered result
-// num_elements: optional. If not -1, the number of elements in the list.
-//
-func TensorListStack(scope *Scope, input_handle tf.Output, element_dtype tf.DataType, optional ...TensorListStackAttr) (tensor tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"element_dtype": element_dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorListStack",
-		Input: []tf.Input{
-			input_handle,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Elementwise computes the bitwise right-shift of `x` and `y`.
-//
-// Performs a logical shift for unsigned integer types, and an arithmetic shift
-// for signed integer types.
-//
-// If `y` is negative, or greater than or equal to than the width of `x` in bits
-// the result is implementation defined.
-func RightShift(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "RightShift",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
diff --git a/tensorflow/java/BUILD b/tensorflow/java/BUILD
index 87e6107..9dce78b 100644
--- a/tensorflow/java/BUILD
+++ b/tensorflow/java/BUILD
@@ -86,7 +86,10 @@
         "src/gen/cc/op_gen_main.cc",
     ],
     copts = tf_copts(),
-    linkopts = ["-lm"],
+    linkopts = select({
+        "//tensorflow:windows": [],
+        "//conditions:default": ["-lm"],
+    }),
     linkstatic = 1,
     deps = [
         ":java_op_gen_lib",
@@ -368,7 +371,6 @@
             "$(location {})".format(LINKER_EXPORTED_SYMBOLS),
         ],
         "//tensorflow:windows": [],
-        "//tensorflow:windows_msvc": [],
         "//conditions:default": [
             "-z defs",
             "-s",
diff --git a/tensorflow/java/maven/hadoop/pom.xml b/tensorflow/java/maven/hadoop/pom.xml
index 7fa751a..e0409fa4 100644
--- a/tensorflow/java/maven/hadoop/pom.xml
+++ b/tensorflow/java/maven/hadoop/pom.xml
@@ -5,7 +5,7 @@
     <groupId>org.tensorflow</groupId>
     <artifactId>hadoop</artifactId>
     <packaging>jar</packaging>
-    <version>1.10.0-rc1</version>
+    <version>1.10.0</version>
     <name>tensorflow-hadoop</name>
     <url>https://www.tensorflow.org</url>
     <description>TensorFlow TFRecord InputFormat/OutputFormat for Apache Hadoop</description>
diff --git a/tensorflow/java/maven/libtensorflow/pom.xml b/tensorflow/java/maven/libtensorflow/pom.xml
index 8ecabfd..f9093ce 100644
--- a/tensorflow/java/maven/libtensorflow/pom.xml
+++ b/tensorflow/java/maven/libtensorflow/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.10.0-rc1</version>
+    <version>1.10.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow</artifactId>
diff --git a/tensorflow/java/maven/libtensorflow_jni/pom.xml b/tensorflow/java/maven/libtensorflow_jni/pom.xml
index e03ce32..1208956 100644
--- a/tensorflow/java/maven/libtensorflow_jni/pom.xml
+++ b/tensorflow/java/maven/libtensorflow_jni/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.10.0-rc1</version>
+    <version>1.10.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow_jni</artifactId>
diff --git a/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml b/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
index fee840f..755449c 100644
--- a/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
+++ b/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.10.0-rc1</version>
+    <version>1.10.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow_jni_gpu</artifactId>
diff --git a/tensorflow/java/maven/pom.xml b/tensorflow/java/maven/pom.xml
index 0c33819..035077e 100644
--- a/tensorflow/java/maven/pom.xml
+++ b/tensorflow/java/maven/pom.xml
@@ -6,7 +6,7 @@
   <modelVersion>4.0.0</modelVersion>
   <groupId>org.tensorflow</groupId>
   <artifactId>parentpom</artifactId>
-  <version>1.10.0-rc1</version>
+  <version>1.10.0</version>
   <packaging>pom</packaging>
 
   <url>https://www.tensorflow.org</url>
diff --git a/tensorflow/java/maven/proto/pom.xml b/tensorflow/java/maven/proto/pom.xml
index 2af7a5c..b89f042 100644
--- a/tensorflow/java/maven/proto/pom.xml
+++ b/tensorflow/java/maven/proto/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.10.0-rc1</version>
+    <version>1.10.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>proto</artifactId>
diff --git a/tensorflow/java/maven/run_inside_container.sh b/tensorflow/java/maven/run_inside_container.sh
index f4794d6..8c4c9d4 100644
--- a/tensorflow/java/maven/run_inside_container.sh
+++ b/tensorflow/java/maven/run_inside_container.sh
@@ -110,11 +110,17 @@
   cd "${NATIVE_DIR}"
 
   mkdir linux-x86_64
+  mkdir windows-x86_64
 
   curl -L "${RELEASE_URL_PREFIX}/libtensorflow_jni-gpu-linux-x86_64-${TF_VERSION}.tar.gz" | tar -xvz -C linux-x86_64
+  curl -L "${RELEASE_URL_PREFIX}/libtensorflow_jni-gpu-windows-x86_64-${TF_VERSION}.zip" -o /tmp/windows.zip
+
+  unzip /tmp/windows.zip -d windows-x86_64
+  rm -f /tmp/windows.zip
 
   # Updated timestamps seem to be required to get Maven to pick up the file.
   touch linux-x86_64/*
+  touch windows-x86_64/*
   cd "${DIR}"
 }
 
diff --git a/tensorflow/java/maven/spark-connector/pom.xml b/tensorflow/java/maven/spark-connector/pom.xml
index 27d9b54..31e39c5 100644
--- a/tensorflow/java/maven/spark-connector/pom.xml
+++ b/tensorflow/java/maven/spark-connector/pom.xml
@@ -6,7 +6,7 @@
     <groupId>org.tensorflow</groupId>
     <artifactId>spark-connector_2.11</artifactId>
     <packaging>jar</packaging>
-    <version>1.10.0-rc1</version>
+    <version>1.10.0</version>
     <name>spark-tensorflow-connector</name>
     <url>https://www.tensorflow.org</url>
     <description>TensorFlow TFRecord connector for Apache Spark DataFrames</description>
diff --git a/tensorflow/java/maven/tensorflow/pom.xml b/tensorflow/java/maven/tensorflow/pom.xml
index c952545..0de9024 100644
--- a/tensorflow/java/maven/tensorflow/pom.xml
+++ b/tensorflow/java/maven/tensorflow/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.10.0-rc1</version>
+    <version>1.10.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>tensorflow</artifactId>
diff --git a/tensorflow/java/src/main/java/org/tensorflow/types/UInt8.java b/tensorflow/java/src/main/java/org/tensorflow/types/UInt8.java
index 0c751ae..824f7fb 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/types/UInt8.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/types/UInt8.java
@@ -16,6 +16,33 @@
 package org.tensorflow.types;
 
 /** Represents an 8-bit unsigned integer. */
-public class UInt8 {
+public class UInt8 extends Number {
+
+  private static final long serialVersionUID = 1L;
+  
+  // This class is only used for generic parameterization and is not instantiable. Thus,
+  // it is safe to implement the Number abstract methods with all zeros, as they will
+  // never be invoked.
+
+  @Override
+  public double doubleValue() {
+    return 0.0;
+  }
+
+  @Override
+  public float floatValue() {
+    return 0.0f;
+  }
+
+  @Override
+  public int intValue() {
+    return 0;
+  }
+
+  @Override
+  public long longValue() {
+    return 0L;
+  }
+
   private UInt8() {}
 }
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 2e6fb11..91c7fd1 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -1870,6 +1870,7 @@
         ":framework_for_generated_wrappers",
         ":math_ops",
         ":nn_ops_gen",
+        ":numerics",
         "@six_archive//:six",
     ],
 )
@@ -1883,7 +1884,6 @@
         ":client_testlib",
         ":clip_ops",
         ":framework_for_generated_wrappers",
-        ":numerics",
         "//third_party/py/numpy",
     ],
 )
@@ -3265,6 +3265,7 @@
         "@six_archive//:six",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/distribute:distribute_coordinator_context",
         "//tensorflow/python/eager:backprop",
         "//tensorflow/python/eager:context",
         # `layers` dependency only exists due to the use of a small utility.
@@ -3340,7 +3341,10 @@
 
 py_library(
     name = "distribute",
-    srcs = ["training/distribute.py"],
+    srcs = [
+        "training/distribute.py",
+        "training/distribution_strategy_context.py",
+    ],
     srcs_version = "PY2AND3",
     deps = [
         ":array_ops",
@@ -4205,7 +4209,6 @@
         ":math_ops",
         "//tensorflow/core:protos_all_py",
     ],
-    tags = ["no_windows"],
 )
 
 cuda_py_test(
@@ -4499,7 +4502,6 @@
     srcs = ["training/saver_large_partitioned_variable_test.py"],
     srcs_version = "PY2AND3",
     tags = [
-        "no_windows",
         "noasan",  # http://b/30782289
         "notsan",  # http://b/30782289
     ],
@@ -4657,7 +4659,10 @@
     size = "medium",
     srcs = ["training/monitored_session_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["notsan"],  # b/67945581
+    tags = [
+        "no_pip",
+        "notsan",  # b/67945581
+    ],
     deps = [
         ":array_ops",
         ":checkpoint_management",
@@ -4675,6 +4680,7 @@
         "//tensorflow/contrib/framework:framework_py",
         "//tensorflow/contrib/testing:testing_py",
         "//tensorflow/core:protos_all_py",
+        "//tensorflow/python/distribute:distribute_coordinator",
     ],
 )
 
diff --git a/tensorflow/python/client/client_lib.py b/tensorflow/python/client/client_lib.py
index c94767a..80a256b 100644
--- a/tensorflow/python/client/client_lib.py
+++ b/tensorflow/python/client/client_lib.py
@@ -15,7 +15,7 @@
 
 """Support for launching graphs and executing operations.
 
-See the @{$python/client} guide.
+See the [Client](https://tensorflow.org/api_guides/python/client) guide.
 """
 
 from __future__ import absolute_import
diff --git a/tensorflow/python/client/session.py b/tensorflow/python/client/session.py
index 58a002c..1841dd9 100644
--- a/tensorflow/python/client/session.py
+++ b/tensorflow/python/client/session.py
@@ -724,7 +724,7 @@
     """Returns a context manager that makes this object the default session.
 
     Use with the `with` keyword to specify that calls to
-    @{tf.Operation.run} or @{tf.Tensor.eval} should be executed in
+    `tf.Operation.run` or `tf.Tensor.eval` should be executed in
     this session.
 
     ```python
@@ -736,7 +736,7 @@
       print(c.eval())
     ```
 
-    To get the current default session, use @{tf.get_default_session}.
+    To get the current default session, use `tf.get_default_session`.
 
     *N.B.* The `as_default` context manager *does not* close the
     session when you exit the context, and you must close the session
@@ -765,7 +765,7 @@
 
     *N.B.* Entering a `with sess.as_default():` block does not affect
     the current default graph. If you are using multiple graphs, and
-    `sess.graph` is different from the value of @{tf.get_default_graph},
+    `sess.graph` is different from the value of `tf.get_default_graph`,
     you must explicitly enter a `with sess.graph.as_default():` block
     to make `sess.graph` the default graph.
 
@@ -786,14 +786,14 @@
     nested list, tuple, namedtuple, dict, or OrderedDict containing graph
     elements at its leaves.  A graph element can be one of the following types:
 
-    * An @{tf.Operation}.
+    * An `tf.Operation`.
       The corresponding fetched value will be `None`.
-    * A @{tf.Tensor}.
+    * A `tf.Tensor`.
       The corresponding fetched value will be a numpy ndarray containing the
       value of that tensor.
-    * A @{tf.SparseTensor}.
+    * A `tf.SparseTensor`.
       The corresponding fetched value will be a
-      @{tf.SparseTensorValue}
+      `tf.SparseTensorValue`
       containing the value of that sparse tensor.
     * A `get_tensor_handle` op.  The corresponding fetched value will be a
       numpy ndarray containing the handle of that tensor.
@@ -829,16 +829,16 @@
     the value of tensors in the graph. Each key in `feed_dict` can be
     one of the following types:
 
-    * If the key is a @{tf.Tensor}, the
+    * If the key is a `tf.Tensor`, the
       value may be a Python scalar, string, list, or numpy ndarray
       that can be converted to the same `dtype` as that
       tensor. Additionally, if the key is a
-      @{tf.placeholder}, the shape of
+      `tf.placeholder`, the shape of
       the value will be checked for compatibility with the placeholder.
     * If the key is a
-      @{tf.SparseTensor},
+      `tf.SparseTensor`,
       the value should be a
-      @{tf.SparseTensorValue}.
+      `tf.SparseTensorValue`.
     * If the key is a nested tuple of `Tensor`s or `SparseTensor`s, the value
       should be a nested tuple with the same structure that maps to their
       corresponding values as above.
@@ -1120,7 +1120,7 @@
     For example, if element `i` of `feed_list` is a `tf.Tensor`, the `i`th
     argument to the returned callable must be a numpy ndarray (or something
     convertible to an ndarray) with matching element type and shape. See
-    @{tf.Session.run} for details of the allowable feed key and value types.
+    `tf.Session.run` for details of the allowable feed key and value types.
 
     The returned callable will have the same return type as
     `tf.Session.run(fetches, ...)`. For example, if `fetches` is a `tf.Tensor`,
@@ -1128,14 +1128,14 @@
     it will return `None`.
 
     Args:
-      fetches: A value or list of values to fetch. See @{tf.Session.run}
+      fetches: A value or list of values to fetch. See `tf.Session.run`
         for details of the allowable fetch types.
       feed_list: (Optional.) A list of `feed_dict` keys. See
-        @{tf.Session.run} for details of the allowable feed key types.
+        `tf.Session.run` for details of the allowable feed key types.
       accept_options: (Optional.) Iff `True`, the returned `Callable` will be
-        able to accept @{tf.RunOptions} and @{tf.RunMetadata} as optional
+        able to accept `tf.RunOptions` and `tf.RunMetadata` as optional
         keyword arguments `options` and `run_metadata`, respectively, with
-        the same syntax and semantics as @{tf.Session.run}, which is useful
+        the same syntax and semantics as `tf.Session.run`, which is useful
         for certain use cases (profiling and debugging) but will result in
         measurable slowdown of the `Callable`'s performance. Default: `False`.
 
@@ -1145,7 +1145,7 @@
 
     Raises:
       TypeError: If `fetches` or `feed_list` cannot be interpreted
-        as arguments to @{tf.Session.run}.
+        as arguments to `tf.Session.run`.
     """
     if feed_list is not None:
       if not isinstance(feed_list, (list, tuple)):
@@ -1453,10 +1453,10 @@
   ```
 
   A session may own resources, such as
-  @{tf.Variable}, @{tf.QueueBase},
-  and @{tf.ReaderBase}. It is important to release
+  `tf.Variable`, `tf.QueueBase`,
+  and `tf.ReaderBase`. It is important to release
   these resources when they are no longer required. To do this, either
-  invoke the @{tf.Session.close} method on the session, or use
+  invoke the `tf.Session.close` method on the session, or use
   the session as a context manager. The following two examples are
   equivalent:
 
@@ -1500,7 +1500,7 @@
     Args:
       target: (Optional.) The execution engine to connect to.
         Defaults to using an in-process engine. See
-        @{$distributed$Distributed TensorFlow}
+        [Distributed TensorFlow](https://tensorflow.org/deploy/distributed)
         for more examples.
       graph: (Optional.) The `Graph` to be launched (described above).
       config: (Optional.) A
@@ -1592,8 +1592,8 @@
 
   The only difference with a regular `Session` is that an `InteractiveSession`
   installs itself as the default session on construction.
-  The methods @{tf.Tensor.eval}
-  and @{tf.Operation.run}
+  The methods `tf.Tensor.eval`
+  and `tf.Operation.run`
   will use that session to run ops.
 
   This is convenient in interactive shells and [IPython
diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index b8ead6b..b0f41b8 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -14,8 +14,8 @@
 # ==============================================================================
 """Utilities for API compatibility between TensorFlow release versions.
 
-See
-@{$guide/version_compat#backward_and_partial_forward_compatibility}
+See [Version
+Compatibility](https://tensorflow.org/guide/version_compat#backward_forward)
 """
 
 from __future__ import absolute_import
@@ -26,14 +26,15 @@
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util.tf_export import tf_export
 
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 8, 7)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 8, 17)
 
 
 @tf_export("compat.forward_compatible")
 def forward_compatible(year, month, day):
   """Return true if the forward compatibility window has expired.
 
-  See @{$guide/version_compat#backward_and_partial_forward_compatibility}.
+  See [Version
+  compatibility](https://tensorflow.org/guide/version_compat#backward_forward).
 
   Forward-compatibility refers to scenarios where the producer of a TensorFlow
   model (a GraphDef or SavedModel) is compiled against a version of the
@@ -91,7 +92,8 @@
 def forward_compatibility_horizon(year, month, day):
   """Context manager for testing forward compatibility of generated graphs.
 
-  See @{$guide/version_compat#backward_and_partial_forward_compatibility}.
+  See [Version
+  compatibility](https://tensorflow.org/guide/version_compat#backward_forward).
 
   To ensure forward compatibility of generated graphs (see `forward_compatible`)
   with older binaries, new features can be gated with:
diff --git a/tensorflow/python/data/__init__.py b/tensorflow/python/data/__init__.py
index 3b9bf24..f8b5612 100644
--- a/tensorflow/python/data/__init__.py
+++ b/tensorflow/python/data/__init__.py
@@ -14,7 +14,7 @@
 # ==============================================================================
 """`tf.data.Dataset` API for input pipelines.
 
-See @{$guide/datasets$Importing Data} for an overview.
+See [Importing Data](https://tensorflow.org/guide/datasets) for an overview.
 """
 
 from __future__ import absolute_import
diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py
index 6cda2a7..fdab8ab 100644
--- a/tensorflow/python/data/ops/dataset_ops.py
+++ b/tensorflow/python/data/ops/dataset_ops.py
@@ -222,10 +222,10 @@
 
     Note that if `tensors` contains a NumPy array, and eager execution is not
     enabled, the values will be embedded in the graph as one or more
-    @{tf.constant} operations. For large datasets (> 1 GB), this can waste
+    `tf.constant` operations. For large datasets (> 1 GB), this can waste
     memory and run into byte limits of graph serialization.  If tensors contains
     one or more large NumPy arrays, consider the alternative described in
-    @{$guide/datasets#consuming_numpy_arrays$this guide}.
+    [this guide](https://tensorflow.org/guide/datasets#consuming_numpy_arrays).
 
     Args:
       tensors: A nested structure of tensors.
@@ -241,10 +241,10 @@
 
     Note that if `tensors` contains a NumPy array, and eager execution is not
     enabled, the values will be embedded in the graph as one or more
-    @{tf.constant} operations. For large datasets (> 1 GB), this can waste
+    `tf.constant` operations. For large datasets (> 1 GB), this can waste
     memory and run into byte limits of graph serialization.  If tensors contains
     one or more large NumPy arrays, consider the alternative described in
-    @{$guide/datasets#consuming_numpy_arrays$this guide}.
+    [this guide](https://tensorflow.org/guide/datasets#consuming_numpy_arrays).
 
     Args:
       tensors: A nested structure of tensors, each having the same size in the
@@ -331,7 +331,7 @@
     ```
 
     NOTE: The current implementation of `Dataset.from_generator()` uses
-    @{tf.py_func} and inherits the same constraints. In particular, it
+    `tf.py_func` and inherits the same constraints. In particular, it
     requires the `Dataset`- and `Iterator`-related operations to be placed
     on a device in the same process as the Python program that called
     `Dataset.from_generator()`. The body of `generator` will not be
@@ -641,7 +641,7 @@
         Defaults to `True`.
       seed: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the
         random seed that will be used to create the distribution. See
-        @{tf.set_random_seed} for behavior.
+        `tf.set_random_seed` for behavior.
 
     Returns:
      Dataset: A `Dataset` of strings corresponding to file names.
@@ -706,7 +706,7 @@
         dataset will sample.
       seed: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the
         random seed that will be used to create the distribution. See
-        @{tf.set_random_seed} for behavior.
+        `tf.set_random_seed` for behavior.
       reshuffle_each_iteration: (Optional.) A boolean, which if true indicates
         that the dataset should be pseudorandomly reshuffled each time it is
         iterated over. (Defaults to `True`.)
@@ -863,7 +863,7 @@
     This transformation combines multiple consecutive elements of the input
     dataset into a single element.
 
-    Like @{tf.data.Dataset.batch}, the tensors in the resulting element will
+    Like `tf.data.Dataset.batch`, the tensors in the resulting element will
     have an additional outer dimension, which will be `batch_size` (or
     `N % batch_size` for the last element if `batch_size` does not divide the
     number of input elements `N` evenly and `drop_remainder` is `False`). If
@@ -871,7 +871,7 @@
     should set the `drop_remainder` argument to `True` to prevent the smaller
     batch from being produced.
 
-    Unlike @{tf.data.Dataset.batch}, the input elements to be batched may have
+    Unlike `tf.data.Dataset.batch`, the input elements to be batched may have
     different shapes, and this transformation will pad each component to the
     respective shape in `padding_shapes`. The `padding_shapes` argument
     determines the resulting shape for each dimension of each component in an
@@ -883,8 +883,8 @@
       will be padded out to the maximum length of all elements in that
       dimension.
 
-    See also @{tf.contrib.data.dense_to_sparse_batch}, which combines elements
-    that may have different shapes into a @{tf.SparseTensor}.
+    See also `tf.contrib.data.dense_to_sparse_batch`, which combines elements
+    that may have different shapes into a `tf.SparseTensor`.
 
     Args:
       batch_size: A `tf.int64` scalar `tf.Tensor`, representing the number of
@@ -1039,7 +1039,7 @@
     elements are produced. `cycle_length` controls the number of input elements
     that are processed concurrently. If you set `cycle_length` to 1, this
     transformation will handle one input element at a time, and will produce
-    identical results = to @{tf.data.Dataset.flat_map}. In general,
+    identical results = to `tf.data.Dataset.flat_map`. In general,
     this transformation will apply `map_func` to `cycle_length` input elements,
     open iterators on the returned `Dataset` objects, and cycle through them
     producing `block_length` consecutive elements from each iterator, and
@@ -1306,7 +1306,7 @@
 
 
 class _VariantDataset(Dataset):
-  """A Dataset wrapper around a @{tf.variant}-typed function argument."""
+  """A Dataset wrapper around a `tf.variant`-typed function argument."""
 
   def __init__(self, dataset_variant, structure):
     super(_VariantDataset, self).__init__()
@@ -1342,20 +1342,20 @@
       func: A function from a nested structure to another nested structure.
       transformation_name: Human-readable name of the transformation in which
         this function is being instantiated, for error messages.
-      dataset: (Optional.) A @{tf.data.Dataset}. If given, the structure of this
+      dataset: (Optional.) A `tf.data.Dataset`. If given, the structure of this
         dataset will be assumed as the structure for `func` arguments; otherwise
         `input_classes`, `input_shapes`, and `input_types` must be defined.
       input_classes: (Optional.) A nested structure of `type`. If given, this
         argument defines the Python types for `func` arguments.
-      input_shapes: (Optional.) A nested structure of @{tf.TensorShape}. If
+      input_shapes: (Optional.) A nested structure of `tf.TensorShape`. If
         given, this argument defines the shapes and structure for `func`
         arguments.
-      input_types: (Optional.) A nested structure of @{tf.DType}. If given, this
+      input_types: (Optional.) A nested structure of `tf.DType`. If given, this
         argument defines the element types and structure for `func` arguments.
       add_to_graph: (Optional.) If `True`, the function will be added to the
         default graph.
       experimental_nested_dataset_support: (Optional.) If `True`, the function
-        will support @{tf.data.Dataset} objects as arguments and return values.
+        will support `tf.data.Dataset` objects as arguments and return values.
 
     Raises:
       ValueError: If an invalid combination of `dataset`, `input_classes`,
@@ -1478,7 +1478,7 @@
       self._function._create_definition_if_needed()  # pylint: disable=protected-access
 
   def _defun_args(self):
-    """Returns a flat list of @{tf.DType} for the input element structure."""
+    """Returns a flat list of `tf.DType` for the input element structure."""
     ret = []
     for input_type, input_class in zip(nest.flatten(self._input_types),
                                        nest.flatten(self._input_classes)):
@@ -1523,7 +1523,7 @@
   `**flat_structure(self)` to the op constructor.
 
   Args:
-    dataset: A @{tf.data.Dataset}.
+    dataset: A `tf.data.Dataset`.
 
   Returns:
     A dictionary of keyword arguments that can be passed to many Dataset op
@@ -1846,7 +1846,7 @@
         dataset will sample.
       seed: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the
         random seed that will be used to create the distribution. See
-        @{tf.set_random_seed} for behavior.
+        `tf.set_random_seed` for behavior.
       reshuffle_each_iteration: (Optional.) A boolean, which if true indicates
         that the dataset should be pseudorandomly reshuffled each time it is
         iterated over. (Defaults to `True`.)
diff --git a/tensorflow/python/data/ops/iterator_ops.py b/tensorflow/python/data/ops/iterator_ops.py
index 83c541c..8f8e026 100644
--- a/tensorflow/python/data/ops/iterator_ops.py
+++ b/tensorflow/python/data/ops/iterator_ops.py
@@ -220,9 +220,9 @@
     """Creates a new, uninitialized `Iterator` based on the given handle.
 
     This method allows you to define a "feedable" iterator where you can choose
-    between concrete iterators by feeding a value in a @{tf.Session.run} call.
-    In that case, `string_handle` would a @{tf.placeholder}, and you would feed
-    it with the value of @{tf.data.Iterator.string_handle} in each step.
+    between concrete iterators by feeding a value in a `tf.Session.run` call.
+    In that case, `string_handle` would be a `tf.placeholder`, and you would
+    feed it with the value of `tf.data.Iterator.string_handle` in each step.
 
     For example, if you had two iterators that marked the current position in
     a training dataset and a test dataset, you could choose which to use in
@@ -362,9 +362,9 @@
 
     In graph mode, you should typically call this method *once* and use its
     result as the input to another computation. A typical loop will then call
-    @{tf.Session.run} on the result of that computation. The loop will terminate
+    `tf.Session.run` on the result of that computation. The loop will terminate
     when the `Iterator.get_next()` operation raises
-    @{tf.errors.OutOfRangeError}. The following skeleton shows how to use
+    `tf.errors.OutOfRangeError`. The following skeleton shows how to use
     this method when building a training loop:
 
     ```python
diff --git a/tensorflow/python/data/ops/optional_ops.py b/tensorflow/python/data/ops/optional_ops.py
index 1d3007e..b75b98d 100644
--- a/tensorflow/python/data/ops/optional_ops.py
+++ b/tensorflow/python/data/ops/optional_ops.py
@@ -33,8 +33,8 @@
 
   An `Optional` can represent the result of an operation that may fail as a
   value, rather than raising an exception and halting execution. For example,
-  @{tf.contrib.data.get_next_as_optional} returns an `Optional` that either
-  contains the next value from a @{tf.data.Iterator} if one exists, or a "none"
+  `tf.contrib.data.get_next_as_optional` returns an `Optional` that either
+  contains the next value from a `tf.data.Iterator` if one exists, or a "none"
   value that indicates the end of the sequence has been reached.
   """
 
@@ -55,7 +55,7 @@
     """Returns a nested structure of values wrapped by this optional.
 
     If this optional does not have a value (i.e. `self.has_value()` evaluates
-    to `False`), this operation will raise @{tf.errors.InvalidArgumentError}
+    to `False`), this operation will raise `tf.errors.InvalidArgumentError`
     at runtime.
 
     Args:
diff --git a/tensorflow/python/data/util/convert.py b/tensorflow/python/data/util/convert.py
index 746b3d6..ba29790 100644
--- a/tensorflow/python/data/util/convert.py
+++ b/tensorflow/python/data/util/convert.py
@@ -36,11 +36,11 @@
 
 
 def partial_shape_to_tensor(shape_like):
-  """Returns a @{tf.Tensor} that represents the given shape.
+  """Returns a `tf.Tensor` that represents the given shape.
 
   Args:
-    shape_like: A value that can be converted to a @{tf.TensorShape} or a
-      @{tf.Tensor}.
+    shape_like: A value that can be converted to a `tf.TensorShape` or a
+      `tf.Tensor`.
 
   Returns:
     A 1-D `tf.Tensor` of `tf.int64` elements representing the given shape, where
diff --git a/tensorflow/python/data/util/random_seed.py b/tensorflow/python/data/util/random_seed.py
index e2c9d86..d5169f7 100644
--- a/tensorflow/python/data/util/random_seed.py
+++ b/tensorflow/python/data/util/random_seed.py
@@ -29,14 +29,14 @@
 def get_seed(seed):
   """Returns the local seeds an operation should use given an op-specific seed.
 
-  See @{tf.get_seed} for more details. This wrapper adds support for the case
+  See `tf.get_seed` for more details. This wrapper adds support for the case
   where `seed` may be a tensor.
 
   Args:
-    seed: An integer or a @{tf.int64} scalar tensor.
+    seed: An integer or a `tf.int64` scalar tensor.
 
   Returns:
-    A tuple of two @{tf.int64} scalar tensors that should be used for the local
+    A tuple of two `tf.int64` scalar tensors that should be used for the local
     seed of the calling dataset.
   """
   seed, seed2 = random_seed.get_seed(seed)
diff --git a/tensorflow/python/debug/BUILD b/tensorflow/python/debug/BUILD
index 27b8ebd..8a4ac6a 100644
--- a/tensorflow/python/debug/BUILD
+++ b/tensorflow/python/debug/BUILD
@@ -936,7 +936,6 @@
     size = "small",
     srcs = ["cli/profile_analyzer_cli_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],
     deps = [
         ":debugger_cli_common",
         ":profile_analyzer_cli",
diff --git a/tensorflow/python/debug/__init__.py b/tensorflow/python/debug/__init__.py
index 34da44b..242215d 100644
--- a/tensorflow/python/debug/__init__.py
+++ b/tensorflow/python/debug/__init__.py
@@ -14,7 +14,7 @@
 # ==============================================================================
 """Public Python API of TensorFlow Debugger (tfdbg).
 
-See the @{$python/tfdbg} guide.
+See the [TFDBG](https://tensorflow.org/api_guides/python/tfdbg) guide.
 
 @@add_debug_tensor_watch
 @@watch_graph
diff --git a/tensorflow/python/debug/lib/debug_gradients.py b/tensorflow/python/debug/lib/debug_gradients.py
index 589a13d..5e95bcb 100644
--- a/tensorflow/python/debug/lib/debug_gradients.py
+++ b/tensorflow/python/debug/lib/debug_gradients.py
@@ -69,7 +69,7 @@
   """Gradients Debugger.
 
   Allows retrieval of gradient tensors created by TensorFlow's automatic
-  differentiation algorithm, i.e., @{tf.gradients} and optimizer classes that
+  differentiation algorithm, i.e., `tf.gradients` and optimizer classes that
   use it.
   """
   # TODO(cais): Add examples code in the doc string?
@@ -142,8 +142,8 @@
     Args:
       input_tensor: the input `tf.Tensor` object whose related gradient tensors
         are to be reigstered with this `GradientsDebugger` instance when they
-        are created, e.g., during @{tf.gradients} calls or the construction
-        of optimization (training) op that uses @{tf.gradients}.
+        are created, e.g., during `tf.gradients` calls or the construction
+        of optimization (training) op that uses `tf.gradients`.
 
     Returns:
       A forwarded identity of `input_tensor`, as a `tf.Tensor`.
diff --git a/tensorflow/python/debug/wrappers/dumping_wrapper.py b/tensorflow/python/debug/wrappers/dumping_wrapper.py
index 3fac2e5..c02d5f6 100644
--- a/tensorflow/python/debug/wrappers/dumping_wrapper.py
+++ b/tensorflow/python/debug/wrappers/dumping_wrapper.py
@@ -45,7 +45,7 @@
       session_root: (`str`) Path to the session root directory. Must be a
         directory that does not exist or an empty directory. If the directory
         does not exist, it will be created by the debugger core during debug
-        @{tf.Session.run}
+        `tf.Session.run`
         calls.
         As the `run()` calls occur, subdirectories will be added to
         `session_root`. The subdirectories' names has the following pattern:
diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD
index 2bd0b43..98ef9bf 100644
--- a/tensorflow/python/distribute/BUILD
+++ b/tensorflow/python/distribute/BUILD
@@ -22,7 +22,7 @@
 
 py_test(
     name = "distribute_coordinator_test",
-    size = "small",
+    size = "large",
     srcs = ["distribute_coordinator_test.py"],
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
@@ -41,3 +41,43 @@
         "//tensorflow/python:variables",
     ],
 )
+
+py_library(
+    name = "distribute_coordinator_context",
+    srcs = [
+        "distribute_coordinator_context.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [],
+)
+
+py_library(
+    name = "multi_worker_util",
+    srcs = [
+        "multi_worker_util.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:training",
+    ],
+)
+
+py_test(
+    name = "multi_worker_util_test",
+    srcs = ["multi_worker_util_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        ":multi_worker_util",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python/eager:test",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
diff --git a/tensorflow/python/distribute/distribute_coordinator.py b/tensorflow/python/distribute/distribute_coordinator.py
index dab1ed4..eb081b6 100644
--- a/tensorflow/python/distribute/distribute_coordinator.py
+++ b/tensorflow/python/distribute/distribute_coordinator.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""A unified and split coordinator for distributed TensorFlow."""
+"""A component for running distributed TensorFlow."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -24,6 +24,8 @@
 import threading
 
 from tensorflow.core.protobuf import cluster_pb2
+from tensorflow.python.distribute import distribute_coordinator_context
+from tensorflow.python.training import monitored_session
 from tensorflow.python.training import server_lib
 
 
@@ -32,17 +34,23 @@
   WORKER = "worker"
   CHIEF = "chief"
   EVALUATOR = "evaluator"
+  CLIENT = "client"
 
 
-_worker_context = threading.local()
+# TODO(yuefengz): support another mode where the client colocates with one
+# worker.
+class CoordinatorMode(object):
+  """Specify how distribute coordinator runs."""
+  # The default mode where distribute coordinator will run as a standalone
+  # client and connects to remote servers for training. Each remote server can
+  # run the distribute coordinator binary with task_type set correctly, which
+  # will then turn it into a standard server.
+  STANDALONE_CLIENT = "standalone_client"
 
-
-def get_current_worker_context():
-  """Returns the current task context."""
-  try:
-    return _worker_context.current
-  except AttributeError:
-    return None
+  # The distribute coordinator runs on each worker. It will run a standard
+  # server on each worker and optionally run the `worker_fn` that is configured
+  # to talk to its standard server.
+  INDEPENDENT_WORKER = "independent_worker"
 
 
 class _Barrier(object):
@@ -96,70 +104,72 @@
   """
 
   def __init__(self,
+               strategy,
                cluster_spec,
                task_type,
                task_id,
-               between_graph=False,
+               session_config=None,
                rpc_layer="grpc",
                worker_barrier=None):
     """Initialize the worker context object.
 
     Args:
+      strategy: a `DistributionStrategy` object.
       cluster_spec: a ClusterSpec object. It can be empty or None in the local
         training case.
       task_type: a string indicating the role of the corresponding task, such as
-        "worker" or "ps". It can be None if it is local training or
-        `between_graph` is False.
+        "worker" or "ps". It can be None if it is local training or in-graph
+        replicated training.
       task_id: an integer indicating id of the corresponding task. It can be
-        None if it is local training or `between_graph` is False.
-      between_graph: whether it is between-graph replication or not.
+        None if it is local training or in-graph replicated training.
+      session_config: an optional `tf.ConfigProto` object.
       rpc_layer: optional string specifying the RPC protocol for communication
         with worker masters. If None or empty, hosts in the `cluster_spec` will
         be used directly.
       worker_barrier: optional, the barrier object for worker synchronization.
-
-    Raises:
-      ValueError: if task_type or task_id is Node or empty and it is distributed
-        between-graph replicated training.
     """
-    if cluster_spec and between_graph:
-      if not task_type or task_id is None:
-        raise ValueError("`task_type` and `task_id` must be set in the "
-                         "distributed between-graph replicated training.")
-      if task_type not in cluster_spec.jobs:
-        raise ValueError("`task_type` %r not found in the `cluster_spec` %r" %
-                         (task_type, cluster_spec))
+    self._strategy = strategy
     self._cluster_spec = cluster_spec
     self._task_type = task_type
     self._task_id = task_id
+    self._session_config = session_config
     self._worker_barrier = worker_barrier
     self._rpc_layer = rpc_layer
     self._master_target = self._get_master_target()
     self._num_workers = _get_num_workers(cluster_spec)
     self._is_chief_node = self._is_chief()
 
+  def _debug_message(self):
+    if self._cluster_spec:
+      return "[cluster_spec: %r, task_type: %r, task_id: %r]" % (
+          self._cluster_spec, self.task_type, self.task_id)
+    else:
+      return "[local]"
+
   def __enter__(self):
-    old_context = get_current_worker_context()
+    old_context = distribute_coordinator_context.get_current_worker_context()
     if old_context:
       raise ValueError(
-          "You cannot run distribute coordinator in a `worker_fn`.")
-    _worker_context.current = self
+          "You cannot run distribute coordinator in a `worker_fn`.\t" +
+          self._debug_message())
+    # pylint: disable=protected-access
+    distribute_coordinator_context._worker_context.current = self
 
   def __exit__(self, unused_exception_type, unused_exception_value,
                unused_traceback):
-    _worker_context.current = None
+    # pylint: disable=protected-access
+    distribute_coordinator_context._worker_context.current = None
 
   def _get_master_target(self):
     """Return the master target for a task."""
     # If cluster_spec is None or empty, we use local master.
     if not self._cluster_spec:
-      return "local"
+      return ""
 
     # If task_type is None, then it is in-graph replicated training. In this
     # case we use the chief or first worker's master target.
     if not self._task_type:
       if _TaskType.CHIEF in self._cluster_spec.jobs:
-        assert not self.between_graph
         task_type = _TaskType.CHIEF
         task_id = 0
       else:
@@ -177,7 +187,8 @@
 
   def _is_chief(self):
     """Return whether the task is the chief worker."""
-    if (not self._cluster_spec or self._task_type in [_TaskType.CHIEF, None]):
+    if (not self._cluster_spec or
+        self._task_type in [_TaskType.CHIEF, _TaskType.EVALUATOR, None]):
       return True
 
     # If not local and chief not in the cluster_spec, use the first worker as
@@ -194,14 +205,60 @@
       ValueError: if `worker_barrier` is not passed to the __init__ method.
     """
     if not self._worker_barrier:
-      raise ValueError(
-          "`worker_barrier is not set in the worker context.`")
+      raise ValueError("`worker_barrier is not set in the worker context.` \t" +
+                       self._debug_message())
     self._worker_barrier.wait()
 
+  def session_creator(self,
+                      scaffold=None,
+                      config=None,
+                      checkpoint_dir=None,
+                      checkpoint_filename_with_path=None,
+                      max_wait_secs=7200):
+    """Returns a session creator.
+
+    The returned session creator will be configured with the correct master
+    target and session configs. It will also run either init ops or ready ops
+    by querying the `strategy` object when `create_session` is called on it.
+
+    Args:
+      scaffold: A `Scaffold` used for gathering or building supportive ops. If
+        not specified a default one is created. It's used to finalize the graph.
+      config: `ConfigProto` proto used to configure the session.
+      checkpoint_dir: A string. Optional path to a directory where to restore
+        variables.
+      checkpoint_filename_with_path: Full file name path to the checkpoint file.
+        Only one of `checkpoint_dir` or `checkpoint_filename_with_path` can be
+        specified.
+      max_wait_secs: Maximum time to wait for the session to become available.
+
+    Returns:
+      a descendant of SessionCreator.
+    """
+    # TODO(yuefengz): merge session config.
+    if self._strategy.should_init:
+      return monitored_session.ChiefSessionCreator(
+          scaffold,
+          master=self.master_target,
+          config=config or self._session_config,
+          checkpoint_dir=checkpoint_dir,
+          checkpoint_filename_with_path=checkpoint_filename_with_path)
+    else:
+      return monitored_session.WorkerSessionCreator(
+          scaffold,
+          master=self.master_target,
+          config=config or self._session_config,
+          max_wait_secs=max_wait_secs)
+
+  @property
+  def has_barrier(self):
+    """Whether the barrier is set or not."""
+    return self._worker_barrier is not None
+
   @property
   def distributed_mode(self):
     """Whether it is distributed training or not."""
-    return bool(self._cluster_spec)
+    return bool(self._cluster_spec) and self._task_type != _TaskType.EVALUATOR
 
   @property
   def cluster_spec(self):
@@ -233,25 +290,141 @@
     """Returns number of workers in the cluster, including chief."""
     return self._num_workers
 
+  @property
+  def should_checkpoint(self):
+    """Whether to save checkpoint."""
+    return self._strategy.should_checkpoint
 
-def _run(worker_fn, cluster_spec, task_type, task_id, between_graph, rpc_layer,
-         worker_barrier):
-  with _WorkerContext(cluster_spec, task_type, task_id, between_graph,
-                      rpc_layer, worker_barrier):
-    worker_fn()
+  @property
+  def should_save_summary(self):
+    """Whether to save summaries."""
+    return self._strategy.should_save_summary
 
 
+def _run_single_worker(worker_fn,
+                       strategy,
+                       cluster_spec,
+                       task_type,
+                       task_id,
+                       session_config,
+                       rpc_layer="",
+                       worker_barrier=None):
+  """Runs a single worker by calling `worker_fn` under context."""
+  strategy = copy.deepcopy(strategy)
+  strategy.configure(session_config, cluster_spec, task_type, task_id)
+  context = _WorkerContext(
+      strategy,
+      cluster_spec,
+      task_type,
+      task_id,
+      session_config=session_config,
+      rpc_layer=rpc_layer,
+      worker_barrier=worker_barrier)
+  with context:
+    worker_fn(strategy)
+
+
+def _run_std_server(cluster_spec=None,
+                    task_type=None,
+                    task_id=None,
+                    session_config=None,
+                    rpc_layer=None):
+  """Runs a standard server."""
+  server = server_lib.Server(
+      cluster_spec,
+      job_name=task_type,
+      task_index=task_id,
+      config=session_config,
+      protocol=rpc_layer)
+  server.start()
+  return server
+
+
+def _run_between_graph_client(worker_fn, strategy, cluster_spec, session_config,
+                              rpc_layer):
+  """Runs a standalone client for between-graph replication."""
+  eval_thread = None
+  if _TaskType.EVALUATOR in cluster_spec.jobs:
+    eval_thread = threading.Thread(
+        target=_run_single_worker,
+        args=(worker_fn, strategy, cluster_spec, _TaskType.EVALUATOR, 0,
+              session_config),
+        kwargs={
+            "rpc_layer": rpc_layer,
+        })
+    eval_thread.start()
+
+  threads = []
+  worker_barrier = _Barrier(_get_num_workers(cluster_spec))
+  for task_type in [_TaskType.CHIEF, _TaskType.WORKER]:
+    for task_id in range(len(cluster_spec.as_dict().get(task_type, []))):
+      t = threading.Thread(
+          target=_run_single_worker,
+          args=(worker_fn, strategy, cluster_spec, task_type, task_id,
+                session_config),
+          kwargs={
+              "rpc_layer": rpc_layer,
+              "worker_barrier": worker_barrier
+          })
+      t.start()
+      threads.append(t)
+
+  # TODO(yuefengz): wrap threads into thread coordinator?
+  for t in threads:
+    t.join()
+
+  # TODO(yuefengz): is it necessary to join eval thread?
+  if eval_thread:
+    eval_thread.join()
+
+
+def _run_in_graph_client(worker_fn, strategy, cluster_spec, session_config,
+                         rpc_layer):
+  """Runs a standalone client for in-graph replication."""
+  eval_thread = None
+  if _TaskType.EVALUATOR in cluster_spec.jobs:
+    eval_thread = threading.Thread(
+        target=_run_single_worker,
+        args=(worker_fn, strategy, cluster_spec, _TaskType.EVALUATOR, 0,
+              session_config),
+        kwargs={
+            "rpc_layer": rpc_layer,
+        })
+    eval_thread.start()
+
+  _run_single_worker(
+      worker_fn,
+      strategy,
+      cluster_spec,
+      None,
+      None,
+      session_config,
+      rpc_layer=rpc_layer)
+  if eval_thread:
+    eval_thread.join()
+
+# TODO(yuefengz): propagate cluster_spec in the STANDALONE_CLIENT mode.
+# TODO(yuefengz): we may need a smart way to figure out whether the current task
+# is the special task when we support cluster_spec propagation.
 def run_distribute_coordinator(worker_fn,
+                               strategy,
+                               mode=CoordinatorMode.STANDALONE_CLIENT,
                                cluster_spec=None,
-                               between_graph=False,
-                               rpc_layer=None):
-  """Run the coordinator for distributed TensorFlow.
+                               task_type=None,
+                               task_id=None,
+                               session_config=None,
+                               rpc_layer="grpc"):
+  """Runs the coordinator for distributed TensorFlow.
 
-  This function runs a unified and split coordinator for distributed TensorFlow.
-  Given a `cluster_spec` specifying server addresses and their roles in a
-  cluster, this coordinator will figure out how to set them up, give the
-  underlying function the right targets for master sessions and coordinate their
-  training.
+  This function runs a split coordinator for distributed TensorFlow in its
+  default mode, i.e. the STANDALONE_CLIENT mode. Given a `cluster_spec`
+  specifying server addresses and their roles in a cluster, this coordinator
+  will figure out how to set them up, give the underlying function the right
+  targets for master sessions via a scope object and coordinate their training.
+  The cluster consisting of standard servers needs to be brought up either with
+  the standard server binary or with a binary running distribute coordinator
+  with `task_type` set to non-client type which will then turn into standard
+  servers.
 
   In addition to be the distribute coordinator, this is also the source of
   configurations for each job in the distributed training. As there are multiple
@@ -261,9 +434,22 @@
 
   In the between-graph replicated training, this coordinator will create
   multiple threads and each calls the `worker_fn` which is supposed to create
-  its own graph and connect to one worker master given by its coordinator
-  context. In the in-graph replicated training, it has only one thread calling
-  this `worker_fn`.
+  its own graph and connect to one worker master given by its context object. In
+  the in-graph replicated training, it has only one thread calling this
+  `worker_fn`.
+
+  Another mode is the INDEPENDENT_WORKER mode where each server runs a
+  distribute coordinator which will start a standard server and optionally run
+  `worker_fn` depending on whether it is between-graph training or in-graph
+  replicated training.
+
+  The `strategy` object is expected to be a DistributionStrategy object which
+  has implemented methods needed by distributed coordinator such as
+  `configure(session_config, cluster_spec, task_type, task_id)` which configures
+  the strategy object for a specific task and `should_init` property which
+  instructs the distribute coordinator whether to run init ops for a task. The
+  distribute coordinator will make a copy of the `strategy` object, call its
+  `configure` method and pass it to `worker_fn` as an argument.
 
   The `worker_fn` defines the training logic and is called under a its own
   worker context which can be accessed to via `get_current_worker_context`. A
@@ -274,13 +460,14 @@
   `worker_fn` or to define different environment variables for different
   `worker_fn`s.
 
-  The `worker_fn` for the between-graph replication is defined as if there are
-  only one worker corresponding to the `worker_fn` and possibly ps jobs. It
-  assigns variables to parameter servers and all other operations to that
-  worker. In the in-graph replication case, the `worker_fn` has to define
-  operations for all worker jobs. Using a distribution strategy can simplify the
-  `worker_fn` by not having to worry about the replication and device assignment
-  of variables and operations.
+  The `worker_fn` for the between-graph replication is defined as if there is
+  only one worker corresponding to the `worker_fn` and possibly ps jobs. For
+  example, when training with parameter servers, it assigns variables to
+  parameter servers and all other operations to that worker. In the in-graph
+  replication case, the `worker_fn` has to define operations for all worker
+  jobs. Using a distribution strategy can simplify the `worker_fn` by not having
+  to worry about the replication and device assignment of variables and
+  operations.
 
   This method is intended to be invoked by high-level APIs so that users don't
   have to explictly call it to run this coordinator. For those who don't use
@@ -307,22 +494,33 @@
   evaluation.
 
   Args:
-    worker_fn: the function to be called and given the access to a coordinator
-      context object.
+    worker_fn: the function to be called. The function should accept a
+      `strategy` object and will be given access to a context object via a
+      context manager scope.
+    strategy: a DistributionStrategy object which specifies whether it should
+      run between-graph replicated training or not, whether to run init ops,
+      etc. This object will also be configured given `session_config`,
+      `cluster_spec`, `task_type` and `task_id`.
+    mode: in which mode this distribute coordinator runs.
     cluster_spec: a dict, ClusterDef or ClusterSpec specifying servers and roles
       in a cluster. If not set or empty, fall back to local training.
-    between_graph: a boolean. It is only useful when `cluster_spec` is set and
-      not empty. If true, it will use between-graph replicated training;
-      otherwise it will use in-graph replicated training.
+    task_type: the current task type, optional if this is a client.
+    task_id: the current task id, optional if this is a client.
+    session_config: an optional @{tf.ConfigProto} object which will be passed
+      to `strategy`'s `configure` method and used to create a session.
     rpc_layer: optional string, the protocol for RPC, e.g. "grpc".
 
   Raises:
     ValueError: if `cluster_spec` is supplied but not a dict or a ClusterDef or
       a ClusterSpec.
   """
+  tf_config = json.loads(os.environ.get("TF_CONFIG", "{}"))
   if not cluster_spec:
-    tf_config = json.loads(os.environ.get("TF_CONFIG", "{}"))
     cluster_spec = tf_config.get("cluster", {})
+    task_env = tf_config.get("task", {})
+    if task_env:
+      task_type = task_env.get("type", task_type)
+      task_id = int(task_env.get("index", task_id))
 
   if cluster_spec:
     if isinstance(cluster_spec, (dict, cluster_pb2.ClusterDef)):
@@ -333,29 +531,50 @@
           "`tf.train.ClusterDef` object")
     # TODO(yuefengz): validate cluster_spec.
 
-  threads = []
-  if cluster_spec and _TaskType.EVALUATOR in cluster_spec.jobs:
-    t = threading.Thread(
-        target=_run,
-        args=(worker_fn, cluster_spec, _TaskType.EVALUATOR, 0, between_graph,
-              rpc_layer, None))
-    t.start()
-    threads.append(t)
-
-  if cluster_spec and between_graph:
-    worker_barrier = _Barrier(_get_num_workers(cluster_spec))
-    for task_type in [_TaskType.CHIEF, _TaskType.WORKER]:
-      for task_id in range(len(cluster_spec.as_dict().get(task_type, []))):
-        t = threading.Thread(
-            target=_run,
-            args=(worker_fn, cluster_spec, task_type, task_id, between_graph,
-                  rpc_layer, worker_barrier))
-        t.start()
-        threads.append(t)
+  if not cluster_spec:
+    # `mode` is ignored in the local case.
+    _run_single_worker(worker_fn, strategy, None, None, None, session_config,
+                       rpc_layer)
+  elif mode == CoordinatorMode.STANDALONE_CLIENT:
+    # The client must know the cluster but servers in the cluster don't have to
+    # know the client.
+    if task_type in [_TaskType.CLIENT, None]:
+      if strategy.between_graph:
+        _run_between_graph_client(worker_fn, strategy, cluster_spec,
+                                  session_config, rpc_layer)
+      else:
+        _run_in_graph_client(worker_fn, strategy, cluster_spec, session_config,
+                             rpc_layer)
+    else:
+      # If not a client job, run the standard server.
+      server = _run_std_server(
+          cluster_spec=cluster_spec, task_type=task_type, task_id=task_id)
+      server.join()
   else:
-    # Local or in-graph replicated training.
-    _run(worker_fn, cluster_spec, None, None, between_graph, rpc_layer, None)
+    if mode != CoordinatorMode.INDEPENDENT_WORKER:
+      raise ValueError("Unexpected coordinator mode: %r" % mode)
 
-  # TODO(yuefengz): wrapper threads into thread coordinator?
-  for t in threads:
-    t.join()
+    # Every one starts a standard server.
+    server = _run_std_server(
+        cluster_spec=cluster_spec, task_type=task_type, task_id=task_id)
+
+    if task_type in [_TaskType.CHIEF, _TaskType.WORKER]:
+      if strategy.between_graph:
+        # All jobs run `worker_fn` if between-graph.
+        _run_single_worker(worker_fn, strategy, cluster_spec, task_type,
+                           task_id, session_config, rpc_layer)
+      else:
+        # Only one node runs `worker_fn` if in-graph.
+        context = _WorkerContext(strategy, cluster_spec, task_type, task_id)
+        if context.is_chief:
+          _run_single_worker(worker_fn, strategy, cluster_spec, None, None,
+                             session_config, rpc_layer)
+        else:
+          server.join()
+    elif task_type == _TaskType.EVALUATOR:
+      _run_single_worker(worker_fn, strategy, cluster_spec, task_type, task_id,
+                         session_config, rpc_layer)
+    else:
+      if task_type != _TaskType.PS:
+        raise ValueError("Unexpected task_type: %r" % task_type)
+      server.join()
diff --git a/tensorflow/python/distribute/distribute_coordinator_context.py b/tensorflow/python/distribute/distribute_coordinator_context.py
new file mode 100644
index 0000000..dee65ce
--- /dev/null
+++ b/tensorflow/python/distribute/distribute_coordinator_context.py
@@ -0,0 +1,31 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""The context retrieval method for distribute coordinator."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import threading
+
+_worker_context = threading.local()
+
+
+def get_current_worker_context():
+  """Returns the current task context."""
+  try:
+    return _worker_context.current
+  except AttributeError:
+    return None
diff --git a/tensorflow/python/distribute/distribute_coordinator_test.py b/tensorflow/python/distribute/distribute_coordinator_test.py
index d7ffeb5..97c6bdd 100644
--- a/tensorflow/python/distribute/distribute_coordinator_test.py
+++ b/tensorflow/python/distribute/distribute_coordinator_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for distribute coordinator."""
+"""Tests for Distribute Coordinator."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -20,12 +20,24 @@
 
 import contextlib
 import copy
+import os
+import sys
 import threading
 import six
 
+# pylint: disable=invalid-name
+_portpicker_import_error = None
+try:
+  import portpicker  # pylint: disable=g-import-not-at-top
+except ImportError as _error:
+  _portpicker_import_error = _error
+  portpicker = None
+# pylint: enable=invalid-name
+
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.distribute import distribute_coordinator
+from tensorflow.python.distribute import distribute_coordinator_context
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import control_flow_ops
@@ -33,12 +45,17 @@
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
+from tensorflow.python.training import monitored_session
+
 
 CHIEF = distribute_coordinator._TaskType.CHIEF
 WORKER = distribute_coordinator._TaskType.WORKER
 PS = distribute_coordinator._TaskType.PS
 EVALUATOR = distribute_coordinator._TaskType.EVALUATOR
 
+STANDALONE_CLIENT = distribute_coordinator.CoordinatorMode.STANDALONE_CLIENT
+INDEPENDENT_WORKER = distribute_coordinator.CoordinatorMode.INDEPENDENT_WORKER
+
 NUM_WORKERS = 3
 NUM_PS = 2
 
@@ -50,7 +67,80 @@
     return str(maybe_bytes, "utf-8")
 
 
-class DistributeCoordinatorTest(test.TestCase):
+def _strip_protocol(target):
+  # cluster_spec expects "host:port" strings.
+  if "//" in target:
+    return target.split("//")[1]
+  else:
+    return target
+
+
+class MockStrategy(object):
+
+  def __init__(self,
+               between_graph=False,
+               should_init=None,
+               should_checkpoint=None,
+               should_save_summary=None):
+    self._between_graph = between_graph
+    self._should_init = should_init
+    self._should_checkpoint = should_checkpoint
+    self._should_save_summary = should_save_summary
+
+  @property
+  def between_graph(self):
+    return self._between_graph
+
+  def configure(self,
+                session_options=None,
+                cluster_spec=None,
+                task_type=None,
+                task_id=None):
+    del session_options, cluster_spec, task_type
+    if self._should_init is None:
+      if task_id == 0:
+        self._should_init = True
+      else:
+        self._should_init = False
+    if self._should_checkpoint is None:
+      if task_id == 0:
+        self._should_checkpoint = True
+      else:
+        self._should_checkpoint = False
+    if self._should_save_summary is None:
+      if task_id == 0:
+        self._should_save_summary = True
+      else:
+        self._should_save_summary = False
+
+  @property
+  def should_init(self):
+    return self._should_init
+
+  @property
+  def should_checkpoint(self):
+    return self._should_checkpoint
+
+  @property
+  def should_save_summary(self):
+    return self._should_save_summary
+
+
+class MockServer(object):
+
+  def __init__(self):
+    self._joined = False
+
+  def join(self):
+    assert not self._joined
+    self._joined = True
+
+  @property
+  def joined(self):
+    return self._joined
+
+
+class DistributeCoordinatorTestBase(test.TestCase):
 
   @classmethod
   def setUpClass(cls):
@@ -60,14 +150,19 @@
     cls._workers, cls._ps = test_util.create_local_cluster(
         NUM_WORKERS, num_ps=NUM_PS)
     cls._cluster_spec = {
-        WORKER: [_bytes_to_str(w.target) for w in cls._workers],
-        PS: [_bytes_to_str(ps.target) for ps in cls._ps]
+        WORKER: [
+            _strip_protocol(_bytes_to_str(w.target)) for w in cls._workers
+        ],
+        PS: [_strip_protocol(_bytes_to_str(ps.target)) for ps in cls._ps]
     }
 
   def setUp(self):
     self._result_correct = 0
     self._lock = threading.Lock()
     self._worker_context = {}
+    self._strategy_property = {}
+    self._std_servers = {}
+    self._barrier = distribute_coordinator._Barrier(NUM_WORKERS)
 
   @contextlib.contextmanager
   def _test_session(self, target):
@@ -76,8 +171,32 @@
     with session.Session(graph=None, config=config, target=target) as sess:
       yield sess
 
-  def _in_graph_worker_fn(self):
-    context = distribute_coordinator.get_current_worker_context()
+  def _create_cluster_spec(self,
+                           has_chief=False,
+                           num_workers=1,
+                           num_ps=0,
+                           has_eval=False):
+    if _portpicker_import_error:
+      raise _portpicker_import_error  # pylint: disable=raising-bad-type
+
+    cluster_spec = {}
+    if has_chief:
+      cluster_spec[CHIEF] = ["localhost:%s" % portpicker.pick_unused_port()]
+    if num_workers:
+      cluster_spec[WORKER] = [
+          "localhost:%s" % portpicker.pick_unused_port()
+          for _ in range(num_workers)
+      ]
+    if num_ps:
+      cluster_spec[PS] = [
+          "localhost:%s" % portpicker.pick_unused_port() for _ in range(num_ps)
+      ]
+    if has_eval:
+      cluster_spec[EVALUATOR] = ["localhost:%s" % portpicker.pick_unused_port()]
+    return cluster_spec
+
+  def _in_graph_worker_fn(self, strategy):
+    context = distribute_coordinator_context.get_current_worker_context()
     self.assertTrue(context is not None)
     with self._test_session(target=context.master_target) as sess:
       xs = []
@@ -98,16 +217,32 @@
     if result_value == expected:
       self._result_correct += 1
 
-  def testInGraph(self):
-    """Test it runs in-graph replicated training correctly."""
-    distribute_coordinator.run_distribute_coordinator(
-        self._in_graph_worker_fn,
-        cluster_spec=self._cluster_spec,
-        between_graph=False)
-    self.assertEqual(self._result_correct, 1)
+  def _run_coordinator_in_thread(self, worker_fn, strategy, **kwargs):
+    t = threading.Thread(
+        target=distribute_coordinator.run_distribute_coordinator,
+        args=(worker_fn, strategy),
+        kwargs=kwargs)
+    t.start()
+    return t
 
-  def _between_graph_worker_fn(self):
-    context = distribute_coordinator.get_current_worker_context()
+  def _run_multiple_coordinator_in_threads(self, worker_fn, strategy,
+                                           cluster_spec, **kwargs):
+    threads = {}
+    for task_type in cluster_spec.keys():
+      threads[task_type] = []
+      for task_id in range(len(cluster_spec[task_type])):
+        t = self._run_coordinator_in_thread(
+            worker_fn,
+            strategy,
+            cluster_spec=cluster_spec,
+            task_type=task_type,
+            task_id=task_id,
+            **kwargs)
+        threads[task_type].append(t)
+    return threads
+
+  def _between_graph_worker_fn(self, strategy):
+    context = distribute_coordinator_context.get_current_worker_context()
     self.assertTrue(context is not None)
     with self._test_session(target=context.master_target) as sess:
       with ops.device("/job:ps/task:0"):
@@ -127,13 +262,23 @@
         variables.global_variables_initializer().run()
 
       # Synchronize workers after initializaton.
-      context.wait_for_other_workers()
+      if context.has_barrier:
+        context.wait_for_other_workers()
+      else:
+        while True:
+          uninit_vars = sess.run(variables.report_uninitialized_variables())
+          # pylint: disable=g-explicit-length-test
+          if len(uninit_vars) == 0:
+            break
 
       sess.run(train_op)
 
       # Synchronize workers after one step to make sure they all have finished
       # training.
-      context.wait_for_other_workers()
+      if context.has_barrier:
+        context.wait_for_other_workers()
+      else:
+        self._barrier.wait()
 
       x_val, y_val = sess.run([x, y])
 
@@ -143,24 +288,50 @@
         with self._lock:
           self._result_correct += 1
 
-  def testBetweenGraph(self):
-    """Test it runs between-graph replicated training correctly."""
-    distribute_coordinator.run_distribute_coordinator(
-        self._between_graph_worker_fn,
-        cluster_spec=self._cluster_spec,
-        between_graph=True)
+  def _between_graph_with_monitored_session(self, strategy):
+    context = distribute_coordinator_context.get_current_worker_context()
+    self.assertTrue(context is not None)
+    with ops.device("/job:ps/task:0"):
+      # TODO(yuefengz): investigate why not using resource variable will make
+      # the test flaky.
+      x = variable_scope.get_variable("x", initializer=10.0, use_resource=True)
+    with ops.device("/job:ps/task:1"):
+      y = variable_scope.get_variable("y", initializer=20.0, use_resource=True)
 
-    # Each finished worker will increment self._result_correct.
-    self.assertEqual(self._result_correct, NUM_WORKERS)
+    x_add = x.assign_add(2.0)
+    y_sub = y.assign_sub(2.0)
+    train_op = control_flow_ops.group([x_add, y_sub])
 
-  def _dump_worker_context(self):
+    # The monitored session will run init or ready ops.
+    with monitored_session.MonitoredSession() as sess:
+      sess.run(train_op)
+
+      # Synchronize workers after one step to make sure they all have finished
+      # training.
+      if context.has_barrier:
+        context.wait_for_other_workers()
+      else:
+        self._barrier.wait()
+
+      x_val, y_val = sess.run([x, y])
+
+    self.assertEqual(x_val, 16.0)
+    self.assertEqual(y_val, 14.0)
+    if x_val == 16.0 and y_val == 14.0:
+      with self._lock:
+        self._result_correct += 1
+
+  def _dump_worker_context(self, strategy):
     """Dumps the propoerties of each worker context.
 
     It dumps the context properties to a dict mapping from task_type to a list
     of tuples of master_target, num_workers, is_chief and distribute_mode, where
     the list is indexed by the task_id.
+
+    Args:
+      strategy: a `DistributionStrategy` object.
     """
-    context = distribute_coordinator.get_current_worker_context()
+    context = distribute_coordinator_context.get_current_worker_context()
     self.assertTrue(context is not None)
     task_type = str(context.task_type)
     task_id = context.task_id or 0
@@ -174,12 +345,80 @@
                                                   context.is_chief,
                                                   context.distributed_mode)
 
+  def _dump_strategy_property(self, strategy):
+    context = distribute_coordinator_context.get_current_worker_context()
+    self.assertTrue(context is not None)
+
+    self.assertEqual(context._strategy.should_init, strategy.should_init)
+    self.assertEqual(context.should_checkpoint, strategy.should_checkpoint)
+    self.assertEqual(context.should_save_summary, strategy.should_save_summary)
+
+    task_type = str(context.task_type)
+    task_id = context.task_id or 0
+    with self._lock:
+      if task_type not in self._strategy_property:
+        self._strategy_property[task_type] = []
+      while len(self._strategy_property[task_type]) <= task_id:
+        self._strategy_property[task_type].append(None)
+      self._strategy_property[task_type][task_id] = (
+          context._strategy.should_init, context.should_checkpoint,
+          context.should_save_summary)
+
+  def _run_mock_std_server(self,
+                           session_config=None,
+                           cluster_spec=None,
+                           task_type=None,
+                           task_id=None,
+                           rpc_layer=None):
+    task_type = str(task_type)
+    task_id = task_id or 0
+    with self._lock:
+      if task_type not in self._std_servers:
+        self._std_servers[task_type] = []
+      while len(self._std_servers[task_type]) <= task_id:
+        self._std_servers[task_type].append(None)
+
+      server = MockServer()
+      self._std_servers[task_type][task_id] = server
+    return server
+
+
+class DistributeCoordinatorTestStandaloneMode(DistributeCoordinatorTestBase):
+
+  def testInGraphStandaloneMode(self):
+    """Test it runs in-graph replication in standalone client mode."""
+    distribute_coordinator.run_distribute_coordinator(
+        self._in_graph_worker_fn,
+        MockStrategy(between_graph=False),
+        cluster_spec=self._cluster_spec)
+    self.assertEqual(self._result_correct, 1)
+
+  def testBetweenGraph(self):
+    """Test it runs between-graph replication in standalone client mode."""
+    distribute_coordinator.run_distribute_coordinator(
+        self._between_graph_worker_fn,
+        MockStrategy(between_graph=True),
+        cluster_spec=self._cluster_spec)
+
+    # Each finished worker will increment self._result_correct.
+    self.assertEqual(self._result_correct, NUM_WORKERS)
+
+  def testBetweenGraphWithMonitoredSession(self):
+    """Test monitored session in standalone client mode."""
+    distribute_coordinator.run_distribute_coordinator(
+        self._between_graph_with_monitored_session,
+        MockStrategy(between_graph=True),
+        cluster_spec=self._cluster_spec)
+
+    # Each finished worker will increment self._result_correct.
+    self.assertEqual(self._result_correct, NUM_WORKERS)
+
   def testBetweenGraphContext(self):
     # Dumps the task contexts to the self._worker_context dict.
     distribute_coordinator.run_distribute_coordinator(
         self._dump_worker_context,
-        cluster_spec=self._cluster_spec,
-        between_graph=True)
+        MockStrategy(between_graph=True),
+        cluster_spec=self._cluster_spec)
 
     # There is only one type of task and there three such tasks.
     self.assertEqual(len(self._worker_context), 1)
@@ -198,12 +437,30 @@
         self._worker_context[WORKER][2],
         (_bytes_to_str(self._workers[2].target), NUM_WORKERS, False, True))
 
+  def testBetweenGraphStrategyProperties(self):
+    # Dumps properties of the strategy objects.
+    distribute_coordinator.run_distribute_coordinator(
+        self._dump_strategy_property,
+        MockStrategy(between_graph=True, should_init=True),
+        cluster_spec=self._cluster_spec)
+
+    # There is only one type of task and there are three such tasks.
+    self.assertEqual(len(self._strategy_property), 1)
+    self.assertTrue(WORKER in self._strategy_property)
+    self.assertEqual(len(self._strategy_property[WORKER]), NUM_WORKERS)
+
+    # Check whether each task has the right properties of should_init,
+    # should_checkpoint and should_save_summary.
+    self.assertEqual(self._strategy_property[WORKER][0], (True, True, True))
+    self.assertEqual(self._strategy_property[WORKER][1], (True, False, False))
+    self.assertEqual(self._strategy_property[WORKER][2], (True, False, False))
+
   def testInGraphContext(self):
     # Dumps the task contexts to the self._worker_context dict.
     distribute_coordinator.run_distribute_coordinator(
         self._dump_worker_context,
-        cluster_spec=self._cluster_spec,
-        between_graph=False)
+        MockStrategy(between_graph=False),
+        cluster_spec=self._cluster_spec)
 
     # There is only a "None" task in the dumped task context.
     self.assertEqual(len(self._worker_context), 1)
@@ -219,7 +476,9 @@
   def testLocalContext(self):
     # Dumps the task contexts to the self._worker_context dict.
     distribute_coordinator.run_distribute_coordinator(
-        self._dump_worker_context, cluster_spec=None, between_graph=True)
+        self._dump_worker_context,
+        MockStrategy(between_graph=False),
+        cluster_spec=None)
 
     # There is only a "None" task.
     self.assertEqual(len(self._worker_context), 1)
@@ -228,7 +487,7 @@
 
     # Check whether each task has the right master_target, num_workers, is_chief
     # and distributed_mode.
-    self.assertEqual(self._worker_context["None"][0], ("local", 0, True, False))
+    self.assertEqual(self._worker_context["None"][0], ("", 0, True, False))
 
   def testBetweenGraphContextWithChief(self):
     # Adds a chief node, so there are NUM_WORKERS + 1 workers in total.
@@ -238,8 +497,8 @@
     # Dumps the task contexts to the self._worker_context dict.
     distribute_coordinator.run_distribute_coordinator(
         self._dump_worker_context,
+        MockStrategy(between_graph=True),
         cluster_spec=cluster_spec,
-        between_graph=True,
         rpc_layer="grpc")
 
     # There are one CHIEF and three workers.
@@ -253,15 +512,15 @@
     # and distributed_mode.
     self.assertEqual(self._worker_context[CHIEF][0],
                      ("grpc://fake_chief", 4, True, True))
-    self.assertEqual(self._worker_context[WORKER][0],
-                     ("grpc://" + _bytes_to_str(self._workers[0].target),
-                      NUM_WORKERS + 1, False, True))
-    self.assertEqual(self._worker_context[WORKER][1],
-                     ("grpc://" + _bytes_to_str(self._workers[1].target),
-                      NUM_WORKERS + 1, False, True))
-    self.assertEqual(self._worker_context[WORKER][2],
-                     ("grpc://" + _bytes_to_str(self._workers[2].target),
-                      NUM_WORKERS + 1, False, True))
+    self.assertEqual(
+        self._worker_context[WORKER][0],
+        (_bytes_to_str(self._workers[0].target), NUM_WORKERS + 1, False, True))
+    self.assertEqual(
+        self._worker_context[WORKER][1],
+        (_bytes_to_str(self._workers[1].target), NUM_WORKERS + 1, False, True))
+    self.assertEqual(
+        self._worker_context[WORKER][2],
+        (_bytes_to_str(self._workers[2].target), NUM_WORKERS + 1, False, True))
 
   def testInGraphContextWithEval(self):
     # Adds a EVALUATOR job.
@@ -271,8 +530,180 @@
     # Dumps the task contexts to the self._worker_context dict.
     distribute_coordinator.run_distribute_coordinator(
         self._dump_worker_context,
+        MockStrategy(between_graph=False),
         cluster_spec=cluster_spec,
-        between_graph=False)
+        rpc_layer=None)
+
+    # There are one "None" task and one EVALUATOR task.
+    self.assertEqual(len(self._worker_context), 2)
+    self.assertTrue("None" in self._worker_context)
+    self.assertTrue(EVALUATOR in self._worker_context)
+    self.assertEqual(len(self._worker_context["None"]), 1)
+    self.assertEqual(len(self._worker_context[EVALUATOR]), 1)
+
+    # Check whether each task has the right master_target, num_workers, is_chief
+    # and distributed_mode.
+    self.assertEqual(self._worker_context["None"][0], (_strip_protocol(
+        _bytes_to_str(self._workers[0].target)), 3, True, True))
+    self.assertEqual(self._worker_context[EVALUATOR][0],
+                     ("fake_evaluator", 3, True, False))
+
+
+class DistributeCoordinatorTestInpendentWorkerMode(
+    DistributeCoordinatorTestBase):
+
+  def testInGraph(self):
+    cluster_spec = self._create_cluster_spec(num_workers=NUM_WORKERS)
+    threads = self._run_multiple_coordinator_in_threads(
+        self._in_graph_worker_fn,
+        MockStrategy(between_graph=False),
+        cluster_spec,
+        mode=INDEPENDENT_WORKER)
+    threads[WORKER][0].join()
+    self.assertEqual(self._result_correct, 1)
+
+  def testBetweenGraph(self):
+    cluster_spec = self._create_cluster_spec(
+        num_workers=NUM_WORKERS, num_ps=NUM_PS)
+    threads = self._run_multiple_coordinator_in_threads(
+        self._between_graph_worker_fn,
+        MockStrategy(between_graph=True),
+        cluster_spec,
+        mode=INDEPENDENT_WORKER)
+    for task_id in range(NUM_WORKERS):
+      threads[WORKER][task_id].join()
+
+    # Each finished worker will increment self._result_correct.
+    self.assertEqual(self._result_correct, NUM_WORKERS)
+
+  def testBetweenGraphWithMonitoredSession(self):
+    cluster_spec = self._create_cluster_spec(
+        num_workers=NUM_WORKERS, num_ps=NUM_PS)
+    threads = self._run_multiple_coordinator_in_threads(
+        self._between_graph_with_monitored_session,
+        MockStrategy(between_graph=True),
+        cluster_spec,
+        mode=INDEPENDENT_WORKER)
+    for task_id in range(NUM_WORKERS):
+      threads[WORKER][task_id].join()
+
+    # Each finished worker will increment self._result_correct.
+    self.assertEqual(self._result_correct, NUM_WORKERS)
+
+  def testBetweenGraphContext(self):
+    cluster_spec = self._create_cluster_spec(num_workers=NUM_WORKERS)
+    # Dumps the task contexts and std server arguments.
+    with test.mock.patch.object(distribute_coordinator, "_run_std_server",
+                                self._run_mock_std_server):
+      threads = self._run_multiple_coordinator_in_threads(
+          self._dump_worker_context,
+          MockStrategy(between_graph=True),
+          cluster_spec,
+          mode=INDEPENDENT_WORKER,
+          rpc_layer=None)
+      for task_id in range(NUM_WORKERS):
+        threads[WORKER][task_id].join()
+
+    # There is only one type of task and three such tasks.
+    self.assertEqual(len(self._worker_context), 1)
+    self.assertTrue(WORKER in self._worker_context)
+    self.assertEqual(len(self._worker_context[WORKER]), NUM_WORKERS)
+
+    # Check whether each task has the right master_target, num_workers, is_chief
+    # and distributed_mode.
+    self.assertEqual(
+        self._worker_context[WORKER][0],
+        (_bytes_to_str(cluster_spec[WORKER][0]), NUM_WORKERS, True, True))
+    self.assertEqual(
+        self._worker_context[WORKER][1],
+        (_bytes_to_str(cluster_spec[WORKER][1]), NUM_WORKERS, False, True))
+    self.assertEqual(
+        self._worker_context[WORKER][2],
+        (_bytes_to_str(cluster_spec[WORKER][2]), NUM_WORKERS, False, True))
+
+    # Make sure each worker runs a std server.
+    self.assertEqual(len(self._std_servers), 1)
+    self.assertTrue(WORKER in self._std_servers)
+    self.assertEqual(len(self._std_servers[WORKER]), 3)
+    self.assertFalse(self._std_servers[WORKER][0].joined)
+    self.assertFalse(self._std_servers[WORKER][1].joined)
+    self.assertFalse(self._std_servers[WORKER][2].joined)
+
+  def testBetweenGraphStrategyProperties(self):
+    cluster_spec = self._create_cluster_spec(num_workers=NUM_WORKERS)
+    # Dumps properties of the strategy objects.
+    with test.mock.patch.object(distribute_coordinator, "_run_std_server",
+                                self._run_mock_std_server):
+      threads = self._run_multiple_coordinator_in_threads(
+          self._dump_strategy_property,
+          MockStrategy(between_graph=True, should_init=True),
+          cluster_spec,
+          mode=INDEPENDENT_WORKER,
+          rpc_layer=None)
+      for task_id in range(NUM_WORKERS):
+        threads[WORKER][task_id].join()
+
+    # There is only one type of task and there are three such tasks.
+    self.assertEqual(len(self._strategy_property), 1)
+    self.assertTrue(WORKER in self._strategy_property)
+    self.assertEqual(len(self._strategy_property[WORKER]), NUM_WORKERS)
+
+    # Check whether each task has the right properties of should_init,
+    # should_checkpoint and should_save_summary.
+    self.assertEqual(self._strategy_property[WORKER][0], (True, True, True))
+    self.assertEqual(self._strategy_property[WORKER][1], (True, False, False))
+    self.assertEqual(self._strategy_property[WORKER][2], (True, False, False))
+
+  def testInGraphContext(self):
+    cluster_spec = self._create_cluster_spec(num_workers=NUM_WORKERS)
+    # Dumps the task contexts and std server arguments.
+    with test.mock.patch.object(distribute_coordinator, "_run_std_server",
+                                self._run_mock_std_server):
+      threads = self._run_multiple_coordinator_in_threads(
+          self._dump_worker_context,
+          MockStrategy(between_graph=False),
+          cluster_spec,
+          mode=INDEPENDENT_WORKER,
+          rpc_layer=None)
+      for task_id in range(NUM_WORKERS):
+        threads[WORKER][task_id].join()
+
+    # There is only a "None" task in the dumped task context.
+    self.assertEqual(len(self._worker_context), 1)
+    self.assertTrue("None" in self._worker_context)
+    self.assertEqual(len(self._worker_context["None"]), 1)
+
+    # Check whether each task has the right master_target, num_workers, is_chief
+    # and distributed_mode.
+    self.assertEqual(
+        self._worker_context["None"][0],
+        (_bytes_to_str(cluster_spec[WORKER][0]), NUM_WORKERS, True, True))
+
+    # Make sure each worker runs a std server.
+    self.assertEqual(len(self._std_servers), 1)
+    self.assertTrue(WORKER in self._std_servers)
+    self.assertEqual(len(self._std_servers[WORKER]), 3)
+    self.assertFalse(self._std_servers[WORKER][0].joined)
+    self.assertTrue(self._std_servers[WORKER][1].joined)
+    self.assertTrue(self._std_servers[WORKER][2].joined)
+
+  def testInGraphContextWithEval(self):
+    # Adds an EVALUATOR job.
+    cluster_spec = self._create_cluster_spec(
+        num_workers=NUM_WORKERS, has_eval=True)
+
+    # Dumps the task contexts and std server arguments.
+    with test.mock.patch.object(distribute_coordinator, "_run_std_server",
+                                self._run_mock_std_server):
+      threads = self._run_multiple_coordinator_in_threads(
+          self._dump_worker_context,
+          MockStrategy(between_graph=False),
+          cluster_spec,
+          mode=INDEPENDENT_WORKER,
+          rpc_layer=None)
+      for task_id in range(NUM_WORKERS):
+        threads[WORKER][task_id].join()
+      threads[EVALUATOR][0].join()
 
     # There are one "None" task and one EVALUATOR task.
     self.assertEqual(len(self._worker_context), 2)
@@ -284,10 +715,23 @@
     # Check whether each task has the right master_target, num_workers, is_chief
     # and distributed_mode.
     self.assertEqual(self._worker_context["None"][0],
-                     (_bytes_to_str(self._workers[0].target), 3, True, True))
+                     (_bytes_to_str(cluster_spec[WORKER][0]), 3, True, True))
     self.assertEqual(self._worker_context[EVALUATOR][0],
-                     ("fake_evaluator", 3, False, True))
+                     (cluster_spec[EVALUATOR][0], 3, True, False))
+
+    # Make sure each worker runs a std server.
+    self.assertEqual(len(self._std_servers), 2)
+    self.assertTrue(WORKER in self._std_servers)
+    self.assertTrue(EVALUATOR in self._std_servers)
+    self.assertEqual(len(self._std_servers[WORKER]), 3)
+    self.assertEqual(len(self._std_servers[EVALUATOR]), 1)
+    self.assertFalse(self._std_servers[WORKER][0].joined)
+    self.assertTrue(self._std_servers[WORKER][1].joined)
+    self.assertTrue(self._std_servers[WORKER][2].joined)
+    self.assertFalse(self._std_servers[EVALUATOR][0].joined)
 
 
 if __name__ == "__main__":
-  test.main()
+  # TODO(yuefengz): find a smart way to terminate std server threads.
+  with test.mock.patch.object(sys, "exit", os._exit):
+    test.main()
diff --git a/tensorflow/python/distribute/multi_worker_util.py b/tensorflow/python/distribute/multi_worker_util.py
new file mode 100644
index 0000000..360733e
--- /dev/null
+++ b/tensorflow/python/distribute/multi_worker_util.py
@@ -0,0 +1,80 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for multi-worker distribution strategies."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.core.protobuf import cluster_pb2
+from tensorflow.python.training import server_lib
+
+
+def normalize_cluster_spec(cluster_spec):
+  """Makes `cluster_spec` into a `ClusterSpec` object.
+
+  Args:
+    cluster_spec: a dict, ClusterDef or ClusterSpec object specifying the
+      cluster configurations.
+
+  Returns:
+    a `ClusterSpec` object.
+
+  Raises:
+    ValueError: if `cluster_spec` is not a dict or a `ClusterSpec` or a
+      `ClusterDef`.
+  """
+  if isinstance(cluster_spec, (dict, cluster_pb2.ClusterDef)):
+    return server_lib.ClusterSpec(cluster_spec)
+  elif not isinstance(cluster_spec, server_lib.ClusterSpec):
+    raise ValueError(
+        "`cluster_spec' should be dict or a `tf.train.ClusterSpec` or a "
+        "`tf.train.ClusterDef` object")
+  return cluster_spec
+
+
+def is_chief(cluster_spec, task_type, task_id):
+  """Returns whether the given task is chief in the cluster.
+
+  Args:
+    cluster_spec: a dict, `ClusterDef` or `ClusterSpec` object specifying the
+      cluster configurations.
+    task_type: the task type in the cluster.
+    task_id: the task id in the cluster.
+
+  Returns:
+    a boolean indicating whether the given task is chief.
+
+  Raises:
+    ValueError: if `task_type` is not in the `cluster_spec` or `task_id` exceeds
+      the maximum id of the `task_type`.
+  """
+  cluster_spec = normalize_cluster_spec(cluster_spec)
+  if task_type not in cluster_spec.jobs:
+    raise ValueError(
+        "The task_type \"%s\" is not in the `cluster_spec`." % task_type)
+  if task_id >= cluster_spec.num_tasks(task_type):
+    raise ValueError("The `task_id` %d exceeds the maximum id of %s." % (
+        task_id, task_type))
+
+  if task_type == "chief":
+    return True
+
+  # If chief is not in the cluster_spec, use the first worker as chief. This is
+  # common in CollectiveAllReduceStrategy.
+  if ("chief" not in cluster_spec.jobs and task_type == "worker" and
+      task_id == 0):
+    return True
+  return False
diff --git a/tensorflow/python/distribute/multi_worker_util_test.py b/tensorflow/python/distribute/multi_worker_util_test.py
new file mode 100644
index 0000000..bdc4972
--- /dev/null
+++ b/tensorflow/python/distribute/multi_worker_util_test.py
@@ -0,0 +1,107 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for multi_worker_util."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.core.protobuf import cluster_pb2
+from tensorflow.python.distribute import multi_worker_util
+from tensorflow.python.eager import test
+from tensorflow.python.training import server_lib
+
+
+class NormalizeClusterSpecTest(test.TestCase):
+
+  def assert_same_cluster(self, lhs, rhs):
+    self.assertEqual(
+        server_lib.ClusterSpec(lhs).as_dict(),
+        server_lib.ClusterSpec(rhs).as_dict())
+
+  def testDictAsInput(self):
+    cluster_spec = {
+        "chief": ["127.0.0.1:1234"],
+        "worker": ["127.0.0.1:8964", "127.0.0.1:2333"],
+        "ps": ["127.0.0.1:1926", "127.0.0.1:3141"]
+    }
+    self.assert_same_cluster(
+        cluster_spec, multi_worker_util.normalize_cluster_spec(cluster_spec))
+
+  def testClusterDefAsInput(self):
+    cluster_def = cluster_pb2.ClusterDef()
+    job = cluster_def.job.add()
+    job.name = "chief"
+    job.tasks[0] = "127.0.0.1:1234"
+
+    job = cluster_def.job.add()
+    job.name = "worker"
+    job.tasks[0] = "127.0.0.1:8964"
+    job.tasks[1] = "127.0.0.1:2333"
+
+    job = cluster_def.job.add()
+    job.name = "ps"
+    job.tasks[0] = "127.0.0.1:1926"
+    job.tasks[1] = "127.0.0.1:3141"
+
+    self.assert_same_cluster(
+        cluster_def, multi_worker_util.normalize_cluster_spec(cluster_def))
+
+  def testClusterSpecAsInput(self):
+    cluster_spec = server_lib.ClusterSpec({
+        "chief": ["127.0.0.1:1234"],
+        "worker": ["127.0.0.1:8964", "127.0.0.1:2333"],
+        "ps": ["127.0.0.1:1926", "127.0.0.1:3141"]
+    })
+    self.assert_same_cluster(
+        cluster_spec, multi_worker_util.normalize_cluster_spec(cluster_spec))
+
+  def testUnexpectedInput(self):
+    cluster_spec = ["127.0.0.1:8964", "127.0.0.1:2333"]
+
+    with self.assertRaisesRegexp(
+        ValueError,
+        "`cluster_spec' should be dict or a `tf.train.ClusterSpec` or a "
+        "`tf.train.ClusterDef` object"):
+      multi_worker_util.normalize_cluster_spec(cluster_spec)
+
+
+class IsChiefTest(test.TestCase):
+
+  def testClusterWithChief(self):
+    cluster_spec = {
+        "chief": ["127.0.0.1:1234"],
+        "worker": ["127.0.0.1:8964", "127.0.0.1:2333"],
+        "ps": ["127.0.0.1:1926", "127.0.0.1:3141"]
+    }
+    self.assertTrue(multi_worker_util.is_chief(cluster_spec, "chief", 0))
+    self.assertFalse(multi_worker_util.is_chief(cluster_spec, "worker", 0))
+
+  def testClusterWithoutChief(self):
+    cluster_spec = {"worker": ["127.0.0.1:8964", "127.0.0.1:2333"]}
+    self.assertTrue(multi_worker_util.is_chief(cluster_spec, "worker", 0))
+    self.assertFalse(multi_worker_util.is_chief(cluster_spec, "worker", 1))
+
+    with self.assertRaisesRegexp(
+        ValueError, "The task_type \"chief\" is not in the `cluster_spec`."):
+      multi_worker_util.is_chief(cluster_spec, "chief", 0)
+
+    with self.assertRaisesRegexp(
+        ValueError, "The `task_id` 2 exceeds the maximum id of worker."):
+      multi_worker_util.is_chief(cluster_spec, "worker", 2)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/eager/backprop.py b/tensorflow/python/eager/backprop.py
index 5f60f62..553f761 100644
--- a/tensorflow/python/eager/backprop.py
+++ b/tensorflow/python/eager/backprop.py
@@ -646,7 +646,7 @@
   Operations are recorded if they are executed within this context manager and
   at least one of their inputs is being "watched".
 
-  Trainable variables (created by `tf.Variable` or @{tf.get_variable},
+  Trainable variables (created by `tf.Variable` or `tf.get_variable`,
   trainable=True is default in both cases) are automatically watched. Tensors
   can be manually watched by invoking the `watch` method on this context
   manager.
@@ -705,6 +705,7 @@
     self._tape = None
     self._persistent = persistent
     self._recording = False
+    context.context().start_step()
 
   def __enter__(self):
     """Enters a context inside which operations are recorded on this tape."""
@@ -733,6 +734,9 @@
     tape.pop_tape(self._tape)
     self._recording = False
 
+  def __del__(self):
+    context.context().end_step()
+
   def watch(self, tensor):
     """Ensures that `tensor` is being traced by this tape.
 
diff --git a/tensorflow/python/eager/benchmarks_test.py b/tensorflow/python/eager/benchmarks_test.py
index 1a78559..e2b1890 100644
--- a/tensorflow/python/eager/benchmarks_test.py
+++ b/tensorflow/python/eager/benchmarks_test.py
@@ -77,19 +77,54 @@
 
   def __init__(self):
     super(SubclassedKerasModel, self).__init__()
-    self.layer = keras.layers.Dense(
+    self.layer_a = keras.layers.Dense(
+        64, kernel_initializer="ones", bias_initializer="zeros")
+    self.layer_b = keras.layers.Dense(
+        128, kernel_initializer="ones", bias_initializer="zeros")
+    self.layer_c = keras.layers.Dense(
+        256, kernel_initializer="ones", bias_initializer="zeros")
+    self.layer_d = keras.layers.Dense(
+        256, kernel_initializer="ones", bias_initializer="zeros")
+    self.layer_e = keras.layers.Dense(
         10, kernel_initializer="ones", bias_initializer="zeros")
 
   def call(self, x):
-    return self.layer(x)
+    x = self.layer_a(x)
+    x = self.layer_b(x)
+    x = self.layer_c(x)
+    x = self.layer_d(x)
+    return self.layer_e(x)
 
 
 def make_keras_model():
-  x = keras.Input(shape=(10,))
-  y = keras.layers.Dense(
-      10, kernel_initializer="ones", bias_initializer="zeros")(
-          x)
-  return keras.Model(inputs=x, outputs=y)
+  model_input = keras.Input(shape=(10,))
+  x = keras.layers.Dense(
+      64, kernel_initializer="ones", bias_initializer="zeros")(model_input)
+  x = keras.layers.Dense(
+      128, kernel_initializer="ones", bias_initializer="zeros")(x)
+  x = keras.layers.Dense(
+      256, kernel_initializer="ones", bias_initializer="zeros")(x)
+  x = keras.layers.Dense(
+      256, kernel_initializer="ones", bias_initializer="zeros")(x)
+  x = keras.layers.Dense(
+      10, kernel_initializer="ones", bias_initializer="zeros")(x)
+  return keras.Model(inputs=model_input, outputs=x)
+
+
+def make_sequential_keras_model():
+  model = keras.models.Sequential()
+  model.add(keras.layers.Dense(
+      64, kernel_initializer="ones", bias_initializer="zeros",
+      input_shape=(10,)))
+  model.add(keras.layers.Dense(
+      128, kernel_initializer="ones", bias_initializer="zeros"))
+  model.add(keras.layers.Dense(
+      256, kernel_initializer="ones", bias_initializer="zeros"))
+  model.add(keras.layers.Dense(
+      256, kernel_initializer="ones", bias_initializer="zeros"))
+  model.add(keras.layers.Dense(
+      10, kernel_initializer="ones", bias_initializer="zeros"))
+  return model
 
 
 class MicroBenchmarks(test.Benchmark):
@@ -638,6 +673,15 @@
     assert np.equal(func(), SubclassedKerasModel()(data)).all()
     self._run(func, 30000)
 
+  def benchmark_keras_model_sequential(self):
+    model = make_sequential_keras_model()
+    data = random_ops.random_uniform((10, 10))
+    func = lambda: model(data)
+    # Symmetry with benchmark_keras_model_functional
+    func()
+    assert np.equal(func(), make_keras_model()(data)).all()
+    self._run(func, 30000)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py
index c792948..6a327bd 100644
--- a/tensorflow/python/eager/context.py
+++ b/tensorflow/python/eager/context.py
@@ -265,7 +265,7 @@
         pywrap_tensorflow.TFE_DeleteContextOptions(opts)
       if self._server_def is not None:
         server_def_str = self._server_def.SerializeToString()
-        pywrap_tensorflow.TFE_ContextSetServerDef(self._context_handle,
+        pywrap_tensorflow.TFE_ContextSetServerDef(self._context_handle, 600,
                                                   server_def_str)
 
       self._initialize_devices()
@@ -275,7 +275,7 @@
     self.ones_rank_cache().flush()
     self.zeros_cache().flush()
 
-  def set_server_def(self, server_def):
+  def set_server_def(self, server_def, keep_alive_secs=600):
     """Allow setting a server_def on the context.
 
     When a server def is replaced, it effectively clears a bunch of caches
@@ -285,6 +285,11 @@
     Args:
       server_def: A tensorflow::ServerDef proto.
         Enables execution on remote devices.
+      keep_alive_secs: Num. seconds after which the remote end will hang up.
+        As long as the client is still alive, the server state for the context
+        will be kept alive. If the client is killed (or there is some failure),
+        the server will clean up its context keep_alive_secs after the final RPC
+        it receives.
 
     Raises:
       ValueError: if server_def is None.
@@ -296,7 +301,7 @@
     else:
       server_def_str = server_def.SerializeToString()
       pywrap_tensorflow.TFE_ContextSetServerDef(self._context_handle,
-                                                server_def_str)
+                                                keep_alive_secs, server_def_str)
 
       # Clear all the caches in case there are remote tensors in them.
       self._clear_caches()
@@ -603,6 +608,12 @@
     """Returns a stack of context switches."""
     return self._context_switches
 
+  def start_step(self):
+    pywrap_tensorflow.TFE_ContextStartStep(self._handle)
+
+  def end_step(self):
+    pywrap_tensorflow.TFE_ContextEndStep(self._handle)
+
 _context = None
 _context_lock = threading.Lock()
 
@@ -652,7 +663,7 @@
 def executing_eagerly():
   """Returns True if the current thread has eager execution enabled.
 
-  Eager execution is typically enabled via @{tf.enable_eager_execution},
+  Eager execution is typically enabled via `tf.enable_eager_execution`,
   but may also be enabled within the context of a Python function via
   tf.contrib.eager.py_func.
   """
diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index f315fa2..189eb80 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -42,7 +42,8 @@
 from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.training import distribute
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.util import compat
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_decorator
@@ -83,26 +84,30 @@
 
 def capture_value(tensor_map, value, dtype, name):
   """Capture a value from outside the function, to pass in as an extra arg."""
-  captured_tuple = tensor_map.get(ops.tensor_id(value), None)
-  if captured_tuple is None:
+  captured_value = tensor_map.get(value, None)
+  if captured_value is None:
     captured_value = create_substitute_placeholder(value, name=name,
                                                    dtype=dtype)
-    tensor_map[ops.tensor_id(value)] = (value, captured_value)
-  else:
-    captured_value = captured_tuple[1]
+    tensor_map[value] = captured_value
   tape.record_operation("captured_value", [captured_value], [value],
                         lambda x: [x])
   return captured_value
 
 
 class CapturingGraph(ops.Graph):
-  """Graph used when constructing eager functions."""
+  """Graph that can capture tensors from other graphs.
+
+  Attributes:
+    captures: Maps external tensor -> internal tensor (e.g. input placeholder).
+      The entries are in the order they were captured.
+  """
 
   def __init__(self):
     super(CapturingGraph, self).__init__()
+
+    self.captures = collections.OrderedDict()
     self._building_function = True
-    # Maps external tensor id -> internal tensor (e.g. input placeholder).
-    self.captures = {}
+
     # Map from resource tensor name to last op (in program order) which uses
     # this tensor. Used to enforce that execution order matches program order
     # for resource tensors.
@@ -115,7 +120,22 @@
   def clear_resource_control_flow_state(self):
     self._last_op_using_resource_tensor = {}
 
+  # TODO(skyewm): get rid of name and use the name of `tensor`.
   def capture(self, tensor, name=None):
+    """Capture `tensor` if it's external to this graph.
+
+    If `tensor` is from a different graph, returns a placeholder for it.
+    `tensor` and the placeholder will also appear in self.captures. Multiple
+    calls to this method with the same `tensor` argument will return the same
+    placeholder. If `tensor` is from this graph, returns `tensor`.
+
+    Args:
+      tensor: Tensor. May be from this FuncGraph or a different graph.
+      name: Optional name if a placeholder is created.
+
+    Returns:
+      Tensor from this FuncGraph.
+    """
     if isinstance(tensor, ops.EagerTensor):
       if name is None:
         name = str(ops.uid())
@@ -137,6 +157,7 @@
       op_def=None,
       compute_shapes=True,
       compute_device=True):
+      """Captures external inputs before calling Graph.create_op."""
     # This capturing logic interacts poorly with control flow contexts which
     # want to replace inputs of ops far too late in the process. This can lead
     # the context to get confused and try to create an Enter for an Enter. We
@@ -159,76 +180,71 @@
         compute_device=compute_device)
 
 
-# pylint: disable=invalid-name
-class HelperContext(object):
-  """ControlFlowContext with a customizable AddOp method."""
+class FuncGraph(CapturingGraph):
+  """Graph representing a function body.
 
-  def __init__(self, add_op_internal):
-    self._add_op_internal = add_op_internal
-    self._values = set()  # control flow code sometimes updates this.
+  Attributes:
+    name: The name of the function.
 
-  def _AddOpInternal(self, op):
-    self._add_op_internal(op)
+    inputs: Placeholder tensors representing the inputs to this function. The
+      tensors are in this FuncGraph. This represents "regular" inputs as well as
+      captured inputs (i.e. the values of self.captures), with the regular
+      inputs coming first.
+    outputs: Tensors that will be returned by this function. The tensors are in
+      this FuncGraph.
+    structured_outputs: A possibly-nested python object which will be returned
+      by this function. The Tensors in this structure are the same as those of
+      self.outputs. Note that this structure might contain Python `None`s.
+    variables: Variables that should be watched during function execution.
+    seed: The graph-level random seed.
+  """
 
-  @property
-  def outer_context(self):
-    return self._outer_context
+  def __init__(self, name, graph=None):
+    """Construct a new FuncGraph.
 
-  def GetWhileContext(self):
-    if self._outer_context:
-      return self._outer_context.GetWhileContext()
+    Args:
+      name: the name of the function.
+      graph: if specified, this FuncGraph will inherit its graph key,
+        collections, and seed from `graph`.
+    """
+    super(FuncGraph, self).__init__()
 
-  def IsWhileContext(self):
-    return False
+    self.name = name
+    self.inputs = []
+    self.outputs = []
+    self.structured_outputs = None
+    self.variables = []
 
-  def IsCondContext(self):
-    return False
+    if graph is not None:
+      # Inherit the graph key, since this is used for matching variables in
+      # optimizers.
+      self._graph_key = graph._graph_key  # pylint: disable=protected-access
 
-  def IsXLAContext(self):
-    return False
+      # Copy the graph collections to ensure summaries and other things work.
+      # This lets the function access (but not mutate) collections of the
+      # containing graph, such as the global step and the summary writer
+      # collections.
+      for collection in graph.collections:
+        self.get_collection_ref(collection)[:] = graph.get_collection(
+            collection)
 
-  def AddOp(self, op):  # pylint: disable=invalid-name
-    self._AddOpInternal(op)
-    if self._outer_context:
-      self._outer_context.AddOp(op)
+      # Copy distribution strategy scope from the containing graph as well.
+      self._distribution_strategy_stack = graph._distribution_strategy_stack  # pylint: disable=protected-access
 
-  def AddName(self, _):
-    pass
+      if context.executing_eagerly():
+        self.seed = context.global_seed()
+      else:
+        self.seed = graph.seed
 
-  def AddInnerOp(self, op):
-    self._AddOpInternal(op)
-    if self._outer_context:
-      self._outer_context.AddInnerOp(op)
+  def capture(self, tensor, name=None):
+    """Calls CapturingGraph.capture and updates self.inputs if necessary."""
+    new_capture = tensor not in self.captures
+    internal_tensor = super(FuncGraph, self).capture(tensor, name)
 
-  def AddValue(self, val):
-    if self._outer_context:
-      return self._outer_context.AddValue(val)
-    else:
-      return val
+    if new_capture and tensor is not internal_tensor:
+      self.inputs.append(internal_tensor)
 
-  def EnterGradientColocation(self, op, gradient_uid):
-    """Start building a gradient colocated with an op."""
-    if self._outer_context:
-      self._outer_context.EnterGradientColocation(op, gradient_uid)
-
-  def ExitGradientColocation(self, op, gradient_uid):
-    """Start building a gradient colocated with an op."""
-    if self._outer_context:
-      self._outer_context.ExitGradientColocation(op, gradient_uid)
-
-  def __enter__(self):
-    # pylint: disable=protected-access
-    self._g = ops.get_default_graph()
-    self._outer_context = self._g._get_control_flow_context()
-    self._g._set_control_flow_context(self)
-    self._nested_contexts = (
-        self._outer_context._nested_contexts
-        if self._outer_context is not None else None)
-    # pylint: enable=protected-access
-
-  def __exit__(self, *_):
-    self._g._set_control_flow_context(self._outer_context)  # pylint: disable=protected-access
-# pylint: enable=invalid-name
+    return internal_tensor
 
 
 def _forward_name(n):
@@ -395,11 +411,6 @@
       return outputs
 
 
-def _map_sequence_obj_to_idx(sequence):
-  """Maps objs in the sequence from id(obj) to sequence index."""
-  return {id(x): i for i, x in enumerate(sequence)}
-
-
 def _flatten(sequence):
   """A wrapper around `nest.flatten` that also unpacks `IndexedSlices`."""
   # TODO(akshayka): Support `SparseTensor` in a similar fashion.
@@ -474,6 +485,7 @@
     self._func_name = name
     self._function_def = defined_function
     self._num_outputs = len(defined_function.signature.output_arg)
+    self._outputs = outputs
     self._python_func_outputs = python_func_outputs
     self._python_returns = [python_func_outputs] if isinstance(
         python_func_outputs,
@@ -484,7 +496,7 @@
     # Find the variables that are components of something distributed and
     # put them into a {handle_tensor -> distributed variable object} map.
     self._distributed_variables = {}
-    strategy = distribute.get_distribution_strategy()
+    strategy = distribution_strategy_context.get_distribution_strategy()
     for variable in self._variables:
       # If variable is not distributed, unwrap returns [variable].
       component_variables = strategy.unwrap(variable)
@@ -501,59 +513,52 @@
 
   def _construct_backprop_function(self):
     """Constructs the backprop function object for this function."""
-    filtered_outputs = [x for x in self._python_returns if x is not None]
-    backwards_graph = CapturingGraph()
-    backwards_graph._graph_key = self._graph._graph_key  # pylint: disable=protected-access
-    for collection in self._graph.collections:
-      backwards_graph.get_collection_ref(
-          collection)[:] = self._graph.get_collection(collection)
-    backwards_graph.seed = self._graph.seed
+    backwards_graph = FuncGraph(_backward_name(self._func_name), self._graph)
     with backwards_graph.as_default():
-      self._out_grad_placeholders = [
-          graph_placeholder(x.dtype, x.shape) for x in filtered_outputs]
+      out_grad_placeholders = [
+          graph_placeholder(x.dtype, x.shape) for x in self._outputs]
       in_gradients = gradients_impl._GradientsHelper(  # pylint: disable=protected-access
-          filtered_outputs,
+          self._outputs,
           self._input_placeholders,
-          grad_ys=self._out_grad_placeholders,
+          grad_ys=out_grad_placeholders,
           src_graph=self._graph)
 
-    backward_outputs = tuple(
-        grad for grad in _flatten(in_gradients) if grad is not None)
-    output_shapes = tuple(grad.shape for grad in backward_outputs)
-
-    captures = backwards_graph.captures
-    ids = list(sorted(captures.keys()))
-    if ids:
-      extra_inputs, extra_placeholders = zip(*[captures[x] for x in ids])
-    else:
-      extra_inputs = []
-      extra_placeholders = []
-
-    forward_name = _forward_name(self._func_name)
     # Note: we cannot have placeholder ops in the graph or the TPU compilation
     # pass fails.
     placeholder_ops = set([y.op for y in self._input_placeholders])
     function_ops = [x for x in self._graph.get_operations()
                     if x not in placeholder_ops]
     self._forward_fdef = _EagerDefinedFunction(
-        forward_name, self._graph, function_ops,
-        self._input_placeholders, filtered_outputs + list(extra_inputs),
-        self._attrs)
-    all_inputs = self._out_grad_placeholders + list(extra_placeholders)
+        _forward_name(self._func_name), self._graph, function_ops,
+        self._input_placeholders,
+        self._outputs + list(backwards_graph.captures.keys()), self._attrs)
+
+    # The ordering of `backwards_graph.inputs` is important: inputs of
+    # `self._backward_function` correspond to outputs of `self._forward_fdef`.
+    backwards_graph.inputs = out_grad_placeholders + list(
+        backwards_graph.captures.values())
+    backwards_graph.outputs.extend(
+        grad for grad in _flatten(in_gradients) if grad is not None)
+    backwards_graph.structured_outputs = in_gradients
+    output_shapes = tuple(grad.shape for grad in backwards_graph.outputs)
+
     # Excluding input ops from the body as we do not intend to execute these
     # operations when the function is executed.
-    all_ignored_ops = frozenset(x.op for x in all_inputs)
-    # Enforce a deterministic order of operations in the generated graph. This
-    # means rerunning the function-defining code will always define the same
-    # function, which is useful if we serialize this etc.
-    function_def_ops = tuple(x
-                             for x in sorted(backwards_graph.get_operations(),
-                                             key=lambda x: x.name)
-                             if x not in all_ignored_ops)
-    bname = _backward_name(self._func_name)
+    ignored_ops = frozenset(x.op for x in backwards_graph.inputs)
+    # `get_operations` enforces a deterministic order on operations.
+    operations = tuple(op for op in backwards_graph.get_operations()
+                       if op not in ignored_ops)
+
     self._backward_function = GraphModeFunction(
-        bname, all_inputs, [], backwards_graph, function_def_ops,
-        backward_outputs, in_gradients, output_shapes, attrs=self._attrs)
+        backwards_graph.name,
+        backwards_graph.inputs,
+        backwards_graph.variables,
+        backwards_graph,
+        operations,
+        backwards_graph.outputs,
+        backwards_graph.structured_outputs,
+        output_shapes,
+        attrs=self._attrs)
 
   def _backprop_call(self, args):
     """Calls the wrapped function and records the result on a tape.
@@ -744,24 +749,16 @@
 
   Returns:
     A GraphModeFunction.
+
+  Raises:
+    TypeError: If any of `python_func`'s return values is neither `None` nor a
+      `Tensor`.
   """
-  graph_key = ops.get_default_graph()._graph_key  # pylint: disable=protected-access
-  func_graph = CapturingGraph()
-  # Inherit the graph key, since this is used for matching variables in
-  # optimizers.
-  func_graph._graph_key = graph_key  # pylint: disable=protected-access
-  # Copy the graph collections to ensure summaries and other things work. This
-  # lets the function access (but not mutate) collections of the containing
-  # graph, such as the global step and the summary writer collections.
-  curr_graph = ops.get_default_graph()
-  for collection in curr_graph.collections:
-    func_graph.get_collection_ref(collection)[:] = curr_graph.get_collection(
-        collection)
-  if context.executing_eagerly():
-    func_graph.seed = context.global_seed()
-  else:
-    func_graph.seed = curr_graph.seed
+  func_graph = FuncGraph(_inference_name(name), graph=ops.get_default_graph())
+
   with func_graph.as_default(), AutomaticControlDependencies() as a:
+    variable_scope.get_variable_scope().set_use_resource(True)
+
     if signature is None:
       func_args = _get_defun_inputs_from_args(args)
       func_kwds = _get_defun_inputs_from_args(kwds)
@@ -769,15 +766,29 @@
       func_args = _get_defun_inputs_from_signature(signature)
       func_kwds = {}
 
+    # Note: `nest.flatten` sorts by keys, as does `_deterministic_dict_values`.
+    func_graph.inputs.extend(
+        x for x in nest.flatten(func_args) + nest.flatten(func_kwds)
+        if isinstance(x, ops.Tensor)
+    )
+
     # Variables to help check whether mutation happens in calling the function
     # Copy the recursive list, tuple and map structure, but not base objects
     func_args_before = nest.pack_sequence_as(func_args, nest.flatten(func_args))
     func_kwds_before = nest.pack_sequence_as(func_kwds, nest.flatten(func_kwds))
 
     def convert(x):
+      """Converts an argument to a Tensor."""
       if x is None:
         return None
-      x = ops.convert_to_tensor_or_indexed_slices(x)
+      try:
+        x = ops.convert_to_tensor_or_indexed_slices(x)
+      except (ValueError, TypeError):
+        raise TypeError(
+            "To be compatible with tf.contrib.eager.defun, Python functions "
+            "must return zero or more Tensors; in compilation of %s, found "
+            "return value of type %s, which is not a Tensor." %
+            (str(python_func), type(x)))
       x = a.mark_as_return(x)
       return x
 
@@ -806,6 +817,7 @@
 
     finally:
       tape.pop_tape(this_tape)
+    func_graph.structured_outputs = func_outputs
     variables = list(this_tape.watched_variables())
 
     # Some variables captured by the tape can come from a DistributedValue.
@@ -813,38 +825,25 @@
     # the function is run on a different device). Thus, instead of storing
     # the specific captured variable, we replace it with its distributed
     # container.
-    strategy = distribute.get_distribution_strategy()
+    strategy = distribution_strategy_context.get_distribution_strategy()
     for i, variable in enumerate(variables):
       # If variable is not distributed value_container returns itself.
       variables[i] = strategy.value_container(variable)
 
+    func_graph.variables = variables
+
     # Returning a closed-over tensor as an output does not trigger a
     # call to convert_to_tensor, so we manually capture all such tensors.
-    outputs_list = _flatten(func_outputs)
-    func_def_outputs = [
-        func_graph.capture(x) for x in outputs_list
+    func_graph.outputs.extend(
+        func_graph.capture(x) for x in _flatten(func_graph.structured_outputs)
         if x is not None
-    ]
+    )
 
-    captures = func_graph.captures
-    ids = list(sorted(captures.keys()))
-    if ids:
-      extra_inputs, extra_placeholders = zip(* [captures[x] for x in ids])
-    else:
-      extra_inputs = []
-      extra_placeholders = []
     output_shapes = tuple(
         x.shape if isinstance(x, ops.Tensor) else None
-        for x in func_def_outputs)
+        for x in func_graph.outputs)
 
-  # Note: `nest.flatten` sorts by keys, as does `_deterministic_dict_values`.
-  flat_inputs = [
-      x for x in nest.flatten(func_args) + nest.flatten(func_kwds)
-      if isinstance(x, ops.Tensor)
-  ]
-  all_inputs = flat_inputs + list(extra_placeholders)
-  all_ignored_ops = frozenset(x.op for x in all_inputs)
-  fname = _inference_name(name)
+  all_ignored_ops = frozenset(x.op for x in func_graph.inputs)
   operations = tuple(x for x in func_graph.get_operations()
                      if x not in all_ignored_ops)
   # Register any other functions defined in the graph
@@ -859,8 +858,9 @@
     attrs[_xla_compile_attr] = attr_value_pb2.AttrValue(b=True)
 
   return GraphModeFunction(
-      fname, all_inputs, extra_inputs, func_graph, operations, func_def_outputs,
-      func_outputs, output_shapes, variables, attrs)
+      func_graph.name, func_graph.inputs, func_graph.captures.keys(),
+      func_graph, operations, func_graph.outputs, func_graph.structured_outputs,
+      output_shapes, func_graph.variables, attrs)
 
 
 _TensorType = collections.namedtuple("_TensorType", ["dtype", "shape"])
@@ -1148,7 +1148,7 @@
   """Compiles a Python function into a callable TensorFlow graph.
 
   `defun` (short for "define function") trace-compiles a Python function
-  composed of TensorFlow operations into a callable that executes a @{tf.Graph}
+  composed of TensorFlow operations into a callable that executes a `tf.Graph`
   containing those operations. The callable produced by `defun` contains only
   the subgraph of TensorFlow operations that were executed when the Python
   function was called with a particular input signature, defined as a list
@@ -1171,9 +1171,9 @@
   For a Python function to be compatible with `defun`, all of its arguments must
   be hashable Python objects or lists thereof. The function itself may not
   modify the list/map structure of its arguments. Additionally, it must return
-  zero or more @{tf.Tensor} objects. If the Python function returns
-  a @{tf.Variable}, its compiled version will return the value of that variable
-  as a @{tf.Tensor}.
+  zero or more `tf.Tensor` objects. If the Python function returns
+  a `tf.Variable`, its compiled version will return the value of that variable
+  as a `tf.Tensor`.
 
   Executing a graph generated by `defun` respects device annotations (i.e.,
   all `with tf.device` directives present in a Python function will also be
@@ -1242,25 +1242,67 @@
 
   When using `defun`, there are subtleties regarding inputs, Python control
   flow, and variable creation that one should be aware of. For concreteness, let
-  `f` be a Python function that returns zero or more @{tf.Tensor} objects and
+  `f` be a Python function that returns zero or more `tf.Tensor` objects and
   let `F = defun(f)`. `F` builds a graph for each unique input signature it
   sees, Python control flow is baked into graphs, and operations related to
   variable initialization are automatically lifted out of the graphs that `F`
   generates and placed in the eager context if executing eagerly or into an
   outer graph otherwise.
 
-  _Tracing and Input Signatures_.
-  The signature of inputs supplied to `F` is defined to be a tuple of the shapes
-  and dtypes of Tensor-typed arguments and the values of non-Tensor arguments,
-  where "arguments" includes both args and kwargs. Every time `F` is invoked,
-  the signature of its inputs are inferred. The first time `F(*args, **kwargs)`
-  is invoked with a particular signature, `f(*args, **kwargs)` is executed and
-  all the TensorFlow operations that `f` executes, along with the Tensors that
-  flow between them, are recorded in a TensorFlow graph. `F` caches this graph
-  and binds it to the inputs' signature; every subsequent invocation of `F` with
-  inputs conforming to this signature will immediately retrieve the cached graph
-  and pass it to the TensorFlow runtime for execution.
+  _Input Signatures_
+  By default, `F = tf.contrib.eager.defun(f)` instantiates a separate graph
+  for every unique sequence of the shapes and dtypes of Tensor arguments and
+  the values of Python objects it is invoked with. For example, calling
+  `F(tf.random_uniform([2])` will execute a different graph than
+  `F(tf.random_uniform([3])` because the two inputs have different shapes.
+  The first time that `F(*args, **kwargs)` is called with a particular sequence
+  of Tensor shapes and dtypes and Python values, it constructs a graph by
+  tracing the execution of `f(*args, **kwargs)`; this graph is bound to an
+  input signature inferred from `(*args, **kwargs)` and cached for future reuse.
 
+  `tf.contrib.eager.defun` caches graphs for your convenience, letting you
+  define TensorFlow functions without explicitly specifying their signatures.
+  However, this policy is conservative and potentially expensive; for example,
+  when different invocations of your function have differently-shaped Tensor
+  inputs, this policy might generate more graph functions than necessary. To
+  eliminate such costs, `tf.contrib.eager.defun` allows you to supply an
+  optional `input_signature` argument specifying the shapes and dtypes of the
+  inputs. In particular, the shapes may be partially unspecified, with `None`s
+  in the unknown dimensions.  When an input signature is provided,
+  `tf.contrib.eager.defun` will only instantiate a single graph for the
+  decorated Python function. The following is an example:
+
+  ```python
+  import tensorflow as tf
+
+  # The first `TensorSpec` below describes the shape and dtype of `words`,
+  # and the second describes the shape and dtype of `another_tensor`. Note that
+  # the last dimension of the `words` `TensorSpec` is left unspecified.
+  @tf.contrib.eager.defun(input_signature=[
+    tf.contrib.eager.TensorSpec(shape=[50, 300, None], dtype=tf.float32),
+    tf.contrib.eager.TensorSpec(shape=[300, 100], dtype=tf.float32)
+  ])
+  def my_sequence_model(words, another_tensor):
+    ...
+
+  # Note how the third dimension of the first input can vary freely.
+  words = tf.random_uniform(([50, 300, 10])
+  second_input = tf.random_uniform([300, 100])
+  my_sequence_model(words, second_input)
+
+  words = tf.random_uniform(([50, 300, 20])
+  my_sequence_model(words, second_input)
+
+  # Passing an input with an incompatible shape will raise an error.
+  words = tf.random_uniform(([50, 100, 20])
+  my_sequence_model(words, second_input)  # <---- This will raise an error.
+
+  ```
+
+  Python functions that are compiled with an `input_signature` must only accept
+  Tensors as arguments and must not take unnamed keyword arguments (**kwargs).
+
+  _Tracing_
   Be aware that because `F` only logs TensorFlow operations, all the other
   Python code that `f` executes will only shape the _construction_ of the graphs
   that `F` executes: the Python code won't be executed when the graphs
@@ -1325,10 +1367,10 @@
   On the other hand, because `defun` generates graphs by tracing and not by
   source code analysis, it fully unrolls Python `for` and `while` loops,
   potentially creating large graphs. If your Python function has native loops
-  that run for many iterations, consider replacing them with @{tf.while_loop}
+  that run for many iterations, consider replacing them with `tf.while_loop`
   operations.
 
-  When constructing graphs, @{tf.Tensor} objects cannot be used as Python
+  When constructing graphs, `tf.Tensor` objects cannot be used as Python
   `bool` objects. This means, for example, that you should replace code in `f`
   resembling
 
@@ -1347,7 +1389,7 @@
   automatically lifted out of the graphs generated by `defun`. In practice, this
   implies that variable creation and initialization only happen the first time
   `F` is called, and that variables are reused every time thereafter. Many
-  TensorFlow APIs, like @{tf.keras.layers.Layer} objects, create variables the
+  TensorFlow APIs, like `tf.keras.layers.Layer` objects, create variables the
   first time they are called and reuse them thereafter. Automatic variable
   lifting makes it possible to compile these APIs without extra effort, at the
   cost of introducing a discrepancy between the semantics of executing Python
@@ -1386,7 +1428,7 @@
   to reference the same set of variables, add logic to your Python function that
   ensures that variables are only created the first time it is called and are
   reused for every subsequent invocation; note that this is precisely what
-  @{tf.keras.layers.Layer} objects do, so we recommend using them to represent
+  `tf.keras.layers.Layer` objects do, so we recommend using them to represent
   variable-bearing computations whenever possible.
 
   Args:
diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py
index b7c9334..380bcf7 100644
--- a/tensorflow/python/eager/function_test.py
+++ b/tensorflow/python/eager/function_test.py
@@ -19,6 +19,7 @@
 
 import collections
 import functools
+from multiprocessing.pool import ThreadPool
 import sys
 
 from tensorflow.core.protobuf import config_pb2
@@ -143,6 +144,61 @@
     out = sq_op(t)
     self.assertAllEqual(out, math_ops.matmul(t, t).numpy())
 
+  def testExecutingStatelessDefunConcurrently(self):
+
+    @function.defun
+    def stateless(x):
+      return math_ops.multiply(2.0, x)
+
+    pool = ThreadPool()
+    inputs = [constant_op.constant(1.0 * x) for x in range(100)]
+    outputs = [float(out) for out in pool.map(stateless, inputs)]
+    expected = [float(2.0 * x) for x in inputs]
+    self.assertSequenceEqual(outputs, expected)
+
+  def testExecutingManyStatelessDefunsConcurrently(self):
+
+    @function.defun
+    def stateless(x):
+      del x
+      return math_ops.multiply(2.0, 2.0)
+
+    pool = ThreadPool()
+    # `pool.map` below instantiates 100 functions, one for each object.
+    outputs = [
+        float(out)
+        for out in pool.map(stateless, [object() for _ in range(100)])
+    ]
+    expected = [4.0] * 100
+    self.assertSequenceEqual(outputs, expected)
+
+  def testExecutingStatefulDefunConcurrently(self):
+
+    v = resource_variable_ops.ResourceVariable(1.0)
+
+    @function.defun
+    def stateful(x):
+      v.assign(x)
+
+    pool = ThreadPool()
+    inputs = [constant_op.constant(0.0)] * 100
+    pool.map(stateful, inputs)
+    self.assertEqual(float(v.read_value()), 0.0)
+
+  def testExecutingManyStatefulDefunsConcurrently(self):
+
+    v = resource_variable_ops.ResourceVariable(1.0)
+
+    @function.defun
+    def stateful(x):
+      del x
+      return v.assign(0.0)
+
+    pool = ThreadPool()
+    # `pool.map` below instantiates 100 functions, one for each object.
+    pool.map(stateful, [object() for _ in range(100)])
+    self.assertEqual(float(v.read_value()), 0.0)
+
   def disabled_testRandomSeed(self):
 
     @function.defun
@@ -232,8 +288,6 @@
 
   @test_util.run_in_graph_and_eager_modes()
   def testGraphLoopGradient(self):
-    if context.executing_eagerly():
-      self.skipTest('TODO(apassos): support loops in defuns in eager')
 
     @function.defun
     def f(x):
@@ -343,6 +397,18 @@
       compiled = function.defun(f)
       compiled()
 
+  @test_util.run_in_graph_and_eager_modes
+  def testDefunForcesResourceVariables(self):
+
+    def variable_creator():
+      return variables.Variable(0.0).read_value()
+
+    defined = function.defun(variable_creator)
+    defined()  # Create the variable.
+    self.assertEqual(len(defined.variables), 1)
+    self.assertIsInstance(
+        defined.variables[0], resource_variable_ops.ResourceVariable)
+
   def testDefunDifferentiable(self):
     v = resource_variable_ops.ResourceVariable(1.0)
 
@@ -380,6 +446,22 @@
       op = call()
       self.assertAllEqual(sess.run(op), 2.0)
 
+  def testSymbolicGradientVariableZerosLike(self):
+    with ops.Graph().as_default():
+      v = resource_variable_ops.ResourceVariable(1.0)
+
+      @function.defun
+      def f(x, v):
+        v.read_value()
+        return x * x
+
+      x = constant_op.constant(1.0)
+      l = f(x, v)
+      _, dv = gradients_impl.gradients(l, [x, v])
+      with self.test_session():
+        v.initializer.run()
+        self.assertAllEqual(dv.eval(), 0.0)
+
   def testGraphModeManyFunctions(self):
     with context.graph_mode(), self.test_session():
 
@@ -878,9 +960,12 @@
     y = model(x)
     self.assertAllEqual([[[[4.0]]]], y.numpy())
 
+  # Note: The ConfigProto below unfortunately only configures graph
+  # construction. Eager's configuration is controlled in `__main__`.
   @test_util.run_in_graph_and_eager_modes(
-      config=config_pb2.ConfigProto(device_count={'CPU': 3}))
+      config=config_pb2.ConfigProto(device_count={'CPU': 4}))
   def testDeviceAnnotationsRespected(self):
+
     @function.defun
     def multi_device_fn():
       with ops.device('/cpu:0'):
@@ -892,12 +977,28 @@
       with ops.device('/cpu:2'):
         s3 = iterator_ops.Iterator.from_structure(
             (dtypes.float32,)).string_handle()
-      return s1, s2, s3
+      with ops.device(''):
+        # TODO(akshayka): This is unfortunate and brittle. It prevents
+        # `Iterator.from_structure` from assigning the iterator op to 'cpu:0'.
+        #  Remove this hack once we have a way of obtaining metadata about
+        #  function execution.
+        s4 = iterator_ops.Iterator.from_structure(
+            (dtypes.float32,)).string_handle()
+      return s1, s2, s3, s4
 
-    outputs = multi_device_fn()
-    self.assertTrue(compat.as_bytes('CPU:0') in self.evaluate(outputs[0]))
-    self.assertTrue(compat.as_bytes('CPU:1') in self.evaluate(outputs[1]))
-    self.assertTrue(compat.as_bytes('CPU:2') in self.evaluate(outputs[2]))
+    with ops.device('/cpu:3'):
+      outputs = self.evaluate(multi_device_fn())
+    self.assertIn(compat.as_bytes('CPU:0'), outputs[0])
+    self.assertIn(compat.as_bytes('CPU:1'), outputs[1])
+    self.assertIn(compat.as_bytes('CPU:2'), outputs[2])
+    self.assertIn(compat.as_bytes('CPU:3'), outputs[3])
+
+    with ops.device('/cpu:0'):
+      outputs = self.evaluate(multi_device_fn())
+    self.assertIn(compat.as_bytes('CPU:0'), outputs[0])
+    self.assertIn(compat.as_bytes('CPU:1'), outputs[1])
+    self.assertIn(compat.as_bytes('CPU:2'), outputs[2])
+    self.assertIn(compat.as_bytes('CPU:0'), outputs[3])
 
   def testVariablesAreTracked(self):
     v = resource_variable_ops.ResourceVariable(1.0)
@@ -1464,6 +1565,18 @@
     value = train()
     self.assertEqual(value.numpy(), -1.0)
 
+  def testReturningNonTensorRaisesError(self):
+    optimizer = momentum.MomentumOptimizer(learning_rate=1.0, momentum=1.0)
+    optimizer.apply_gradients = function.defun(optimizer.apply_gradients)
+    v = resource_variable_ops.ResourceVariable(1.0)
+    grad = backprop.implicit_grad(lambda v: v**2)(v)
+
+    with self.assertRaisesRegexp(TypeError,
+                                 '.*must return zero or more Tensors.*'):
+      # TODO(akshayka): We might want to allow defun-ing Python functions
+      # that return operations (and just execute the op instead of running it).
+      optimizer.apply_gradients(grad)
+
   # TODO(b/111663004): This should work when the outer context is graph
   # building.
   def testOptimizerNonSlotVarsInDefunNoError(self):
@@ -1667,5 +1780,5 @@
 
 if __name__ == '__main__':
   ops.enable_eager_execution(
-      config=config_pb2.ConfigProto(device_count={'CPU': 3}))
+      config=config_pb2.ConfigProto(device_count={'CPU': 4}))
   test.main()
diff --git a/tensorflow/python/eager/graph_callable.py b/tensorflow/python/eager/graph_callable.py
index 9200396..7105d2e 100644
--- a/tensorflow/python/eager/graph_callable.py
+++ b/tensorflow/python/eager/graph_callable.py
@@ -330,13 +330,9 @@
 
   sorted_variables = sorted(variable_captures.variables.values(),
                             key=lambda x: x.name)
-  captures = tmp_graph.captures
-  ids = list(sorted(captures.keys()))
-  if ids:
-    extra_inputs, extra_placeholders = zip(*[captures[x] for x in ids])
-  else:
-    extra_inputs = []
-    extra_placeholders = []
+
+  extra_inputs = tmp_graph.captures.keys()
+  extra_placeholders = tmp_graph.captures.values()
 
   flat_inputs = [x for x in nest.flatten(func_inputs)
                  if isinstance(x, tf_ops.Tensor)]
diff --git a/tensorflow/python/estimator/canned/boosted_trees.py b/tensorflow/python/estimator/canned/boosted_trees.py
index 8b423f7..16928ca 100644
--- a/tensorflow/python/estimator/canned/boosted_trees.py
+++ b/tensorflow/python/estimator/canned/boosted_trees.py
@@ -703,9 +703,30 @@
     global_step = training_util.get_or_create_global_step()
     bucket_size_list, feature_ids_list = _group_features_by_num_buckets(
         sorted_feature_columns)
+    # Create Ensemble resources.
+    tree_ensemble = boosted_trees_ops.TreeEnsemble(name=name)
+
+    # Create logits.
+    if mode != model_fn.ModeKeys.TRAIN:
+      input_feature_list = _get_transformed_features(features,
+                                                     sorted_feature_columns)
+      logits = boosted_trees_ops.predict(
+          # For non-TRAIN mode, ensemble doesn't change after initialization,
+          # so no local copy is needed; using tree_ensemble directly.
+          tree_ensemble_handle=tree_ensemble.resource_handle,
+          bucketized_features=input_feature_list,
+          logits_dimension=head.logits_dimension)
+      return head.create_estimator_spec(
+          features=features,
+          mode=mode,
+          labels=labels,
+          train_op_fn=control_flow_ops.no_op,
+          logits=logits)
+
+    # ============== Training graph ==============
     # Extract input features and set up cache for training.
     training_state_cache = None
-    if mode == model_fn.ModeKeys.TRAIN and train_in_memory:
+    if train_in_memory:
       # cache transformed features as well for in-memory training.
       batch_size = array_ops.shape(labels)[0]
       input_feature_list, input_cache_op = (
@@ -717,63 +738,51 @@
     else:
       input_feature_list = _get_transformed_features(features,
                                                      sorted_feature_columns)
-      if mode == model_fn.ModeKeys.TRAIN and example_id_column_name:
+      if example_id_column_name:
         example_ids = features[example_id_column_name]
         training_state_cache = _CacheTrainingStatesUsingHashTable(
             example_ids, head.logits_dimension)
 
-    # Create Ensemble resources.
-    tree_ensemble = boosted_trees_ops.TreeEnsemble(name=name)
     # Variable that determines whether bias centering is needed.
     center_bias_var = variable_scope.variable(
         initial_value=center_bias, name='center_bias_needed', trainable=False)
-    # Create logits.
-    if mode != model_fn.ModeKeys.TRAIN:
-      logits = boosted_trees_ops.predict(
-          # For non-TRAIN mode, ensemble doesn't change after initialization,
-          # so no local copy is needed; using tree_ensemble directly.
-          tree_ensemble_handle=tree_ensemble.resource_handle,
+    if is_single_machine:
+      local_tree_ensemble = tree_ensemble
+      ensemble_reload = control_flow_ops.no_op()
+    else:
+      # Have a local copy of ensemble for the distributed setting.
+      with ops.device(worker_device):
+        local_tree_ensemble = boosted_trees_ops.TreeEnsemble(
+            name=name + '_local', is_local=True)
+      # TODO(soroush): Do partial updates if this becomes a bottleneck.
+      ensemble_reload = local_tree_ensemble.deserialize(
+          *tree_ensemble.serialize())
+
+    if training_state_cache:
+      cached_tree_ids, cached_node_ids, cached_logits = (
+          training_state_cache.lookup())
+    else:
+      # Always start from the beginning when no cache is set up.
+      batch_size = array_ops.shape(labels)[0]
+      cached_tree_ids, cached_node_ids, cached_logits = (
+          array_ops.zeros([batch_size], dtype=dtypes.int32),
+          _DUMMY_NODE_ID * array_ops.ones([batch_size], dtype=dtypes.int32),
+          array_ops.zeros(
+              [batch_size, head.logits_dimension], dtype=dtypes.float32))
+
+    with ops.control_dependencies([ensemble_reload]):
+      (stamp_token, num_trees, num_finalized_trees, num_attempted_layers,
+       last_layer_nodes_range) = local_tree_ensemble.get_states()
+      summary.scalar('ensemble/num_trees', num_trees)
+      summary.scalar('ensemble/num_finalized_trees', num_finalized_trees)
+      summary.scalar('ensemble/num_attempted_layers', num_attempted_layers)
+
+      partial_logits, tree_ids, node_ids = boosted_trees_ops.training_predict(
+          tree_ensemble_handle=local_tree_ensemble.resource_handle,
+          cached_tree_ids=cached_tree_ids,
+          cached_node_ids=cached_node_ids,
           bucketized_features=input_feature_list,
           logits_dimension=head.logits_dimension)
-    else:
-      if is_single_machine:
-        local_tree_ensemble = tree_ensemble
-        ensemble_reload = control_flow_ops.no_op()
-      else:
-        # Have a local copy of ensemble for the distributed setting.
-        with ops.device(worker_device):
-          local_tree_ensemble = boosted_trees_ops.TreeEnsemble(
-              name=name + '_local', is_local=True)
-        # TODO(soroush): Do partial updates if this becomes a bottleneck.
-        ensemble_reload = local_tree_ensemble.deserialize(
-            *tree_ensemble.serialize())
-
-      if training_state_cache:
-        cached_tree_ids, cached_node_ids, cached_logits = (
-            training_state_cache.lookup())
-      else:
-        # Always start from the beginning when no cache is set up.
-        batch_size = array_ops.shape(labels)[0]
-        cached_tree_ids, cached_node_ids, cached_logits = (
-            array_ops.zeros([batch_size], dtype=dtypes.int32),
-            _DUMMY_NODE_ID * array_ops.ones([batch_size], dtype=dtypes.int32),
-            array_ops.zeros(
-                [batch_size, head.logits_dimension], dtype=dtypes.float32))
-
-      with ops.control_dependencies([ensemble_reload]):
-        (stamp_token, num_trees, num_finalized_trees, num_attempted_layers,
-         last_layer_nodes_range) = local_tree_ensemble.get_states()
-        summary.scalar('ensemble/num_trees', num_trees)
-        summary.scalar('ensemble/num_finalized_trees', num_finalized_trees)
-        summary.scalar('ensemble/num_attempted_layers', num_attempted_layers)
-
-        partial_logits, tree_ids, node_ids = boosted_trees_ops.training_predict(
-            tree_ensemble_handle=local_tree_ensemble.resource_handle,
-            cached_tree_ids=cached_tree_ids,
-            cached_node_ids=cached_node_ids,
-            bucketized_features=input_feature_list,
-            logits_dimension=head.logits_dimension)
-
       logits = cached_logits + partial_logits
 
     # Create training graph.
@@ -846,12 +855,11 @@
       labels=labels,
       train_op_fn=_train_op_fn,
       logits=logits)
-  if mode == model_fn.ModeKeys.TRAIN:
-    # Add an early stop hook.
-    estimator_spec = estimator_spec._replace(
-        training_hooks=estimator_spec.training_hooks +
-        (_StopAtAttemptsHook(num_finalized_trees, num_attempted_layers,
-                             tree_hparams.n_trees, tree_hparams.max_depth),))
+  # Add an early stop hook.
+  estimator_spec = estimator_spec._replace(
+      training_hooks=estimator_spec.training_hooks +
+      (_StopAtAttemptsHook(num_finalized_trees, num_attempted_layers,
+                           tree_hparams.n_trees, tree_hparams.max_depth),))
   return estimator_spec
 
 
diff --git a/tensorflow/python/estimator/canned/dnn_linear_combined.py b/tensorflow/python/estimator/canned/dnn_linear_combined.py
index efa7812..4945c3b 100644
--- a/tensorflow/python/estimator/canned/dnn_linear_combined.py
+++ b/tensorflow/python/estimator/canned/dnn_linear_combined.py
@@ -388,7 +388,7 @@
         if a categorical column is multivalent.  One of "mean", "sqrtn", and
         "sum" -- these are effectively different ways to do example-level
         normalization, which can be useful for bag-of-words features.  For more
-        details, see @{tf.feature_column.linear_model$linear_model}.
+        details, see `tf.feature_column.linear_model`.
 
     Raises:
       ValueError: If both linear_feature_columns and dnn_features_columns are
@@ -586,7 +586,7 @@
         if a categorical column is multivalent.  One of "mean", "sqrtn", and
         "sum" -- these are effectively different ways to do example-level
         normalization, which can be useful for bag-of-words features.  For more
-        details, see @{tf.feature_column.linear_model$linear_model}.
+        details, see `tf.feature_column.linear_model`.
 
     Raises:
       ValueError: If both linear_feature_columns and dnn_features_columns are
diff --git a/tensorflow/python/estimator/canned/linear.py b/tensorflow/python/estimator/canned/linear.py
index 58a7160..115dd18 100644
--- a/tensorflow/python/estimator/canned/linear.py
+++ b/tensorflow/python/estimator/canned/linear.py
@@ -306,7 +306,7 @@
         is multivalent.  One of "mean", "sqrtn", and "sum" -- these are
         effectively different ways to do example-level normalization, which can
         be useful for bag-of-words features. for more details, see
-        @{tf.feature_column.linear_model$linear_model}.
+        `tf.feature_column.linear_model`.
 
     Returns:
       A `LinearClassifier` estimator.
@@ -472,7 +472,7 @@
         is multivalent.  One of "mean", "sqrtn", and "sum" -- these are
         effectively different ways to do example-level normalization, which can
         be useful for bag-of-words features. for more details, see
-        @{tf.feature_column.linear_model$linear_model}.
+        `tf.feature_column.linear_model`.
     """
     head = head_lib._regression_head(  # pylint: disable=protected-access
         label_dimension=label_dimension, weight_column=weight_column,
diff --git a/tensorflow/python/estimator/canned/prediction_keys.py b/tensorflow/python/estimator/canned/prediction_keys.py
index 16890ec..daa275b 100644
--- a/tensorflow/python/estimator/canned/prediction_keys.py
+++ b/tensorflow/python/estimator/canned/prediction_keys.py
@@ -32,3 +32,4 @@
   LOGITS = 'logits'
   PREDICTIONS = 'predictions'
   PROBABILITIES = 'probabilities'
+  TOP_K = 'top_k'
diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py
index 2fe44bc..2d0675d 100644
--- a/tensorflow/python/estimator/estimator.py
+++ b/tensorflow/python/estimator/estimator.py
@@ -50,9 +50,10 @@
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.saved_model import builder as saved_model_builder
-from tensorflow.python.saved_model import constants
+from tensorflow.python.saved_model import utils_impl as saved_model_utils
 from tensorflow.python.summary import summary
 from tensorflow.python.summary.writer import writer_cache
+from tensorflow.python.training import basic_session_run_hooks
 from tensorflow.python.training import checkpoint_management
 from tensorflow.python.training import device_setter
 from tensorflow.python.training import distribute as distribute_lib
@@ -85,14 +86,15 @@
   subdirectory thereof. If `model_dir` is not set, a temporary directory is
   used.
 
-  The `config` argument can be passed `RunConfig` object containing information
-  about the execution environment. It is passed on to the `model_fn`, if the
-  `model_fn` has a parameter named "config" (and input functions in the same
-  manner). If the `config` parameter is not passed, it is instantiated by the
-  `Estimator`. Not passing config means that defaults useful for local execution
-  are used. `Estimator` makes config available to the model (for instance, to
-  allow specialization based on the number of workers available), and also uses
-  some of its fields to control internals, especially regarding checkpointing.
+  The `config` argument can be passed `tf.estimator.RunConfig` object containing
+  information about the execution environment. It is passed on to the
+  `model_fn`, if the `model_fn` has a parameter named "config" (and input
+  functions in the same manner). If the `config` parameter is not passed, it is
+  instantiated by the `Estimator`. Not passing config means that defaults useful
+  for local execution are used. `Estimator` makes config available to the model
+  (for instance, to allow specialization based on the number of workers
+  available), and also uses some of its fields to control internals, especially
+  regarding checkpointing.
 
   The `params` argument contains hyperparameters. It is passed to the
   `model_fn`, if the `model_fn` has a parameter named "params", and to the input
@@ -118,7 +120,8 @@
                warm_start_from=None):
     """Constructs an `Estimator` instance.
 
-    See @{$estimators} for more information. To warm-start an `Estimator`:
+    See [estimators](https://tensorflow.org/guide/estimators) for more information.
+    To warm-start an `Estimator`:
 
     ```python
     estimator = tf.estimator.DNNClassifier(
@@ -128,7 +131,7 @@
     ```
 
     For more details on warm-start configuration, see
-    @{tf.estimator.WarmStartSettings$WarmStartSettings}.
+    `tf.estimator.WarmStartSettings`.
 
     Args:
       model_fn: Model function. Follows the signature:
@@ -137,15 +140,16 @@
 
           * `features`: This is the first item returned from the `input_fn`
                  passed to `train`, `evaluate`, and `predict`. This should be a
-                 single `Tensor` or `dict` of same.
+                 single `tf.Tensor` or `dict` of same.
           * `labels`: This is the second item returned from the `input_fn`
                  passed to `train`, `evaluate`, and `predict`. This should be a
-                 single `Tensor` or `dict` of same (for multi-head models). If
-                 mode is `ModeKeys.PREDICT`, `labels=None` will be passed. If
-                 the `model_fn`'s signature does not accept `mode`, the
-                 `model_fn` must still be able to handle `labels=None`.
+                 single `tf.Tensor` or `dict` of same (for multi-head models).
+                 If mode is `tf.estimator.ModeKeys.PREDICT`, `labels=None` will
+                 be passed. If the `model_fn`'s signature does not accept
+                 `mode`, the `model_fn` must still be able to handle
+                 `labels=None`.
           * `mode`: Optional. Specifies if this training, evaluation or
-                 prediction. See `ModeKeys`.
+                 prediction. See `tf.estimator.ModeKeys`.
           * `params`: Optional `dict` of hyperparameters.  Will receive what
                  is passed to Estimator in `params` parameter. This allows
                  to configure Estimators from hyper parameter tuning.
@@ -155,10 +159,10 @@
                  configuration such as `num_ps_replicas`, or `model_dir`.
 
         * Returns:
-          `EstimatorSpec`
+          `tf.estimator.EstimatorSpec`
 
       model_dir: Directory to save model parameters, graph and etc. This can
-        also be used to load checkpoints from the directory into a estimator to
+        also be used to load checkpoints from the directory into an estimator to
         continue training a previously saved model. If `PathLike` object, the
         path will be resolved. If `None`, the model_dir in `config` will be used
         if set. If both are set, they must be same. If both are `None`, a
@@ -169,9 +173,10 @@
       warm_start_from: Optional string filepath to a checkpoint or SavedModel to
                        warm-start from, or a `tf.estimator.WarmStartSettings`
                        object to fully configure warm-starting.  If the string
-                       filepath is provided instead of a `WarmStartSettings`,
-                       then all variables are warm-started, and it is assumed
-                       that vocabularies and Tensor names are unchanged.
+                       filepath is provided instead of a
+                       `tf.estimator.WarmStartSettings`, then all variables are
+                       warm-started, and it is assumed that vocabularies
+                       and `tf.Tensor` names are unchanged.
 
     Raises:
       ValueError: parameters of `model_fn` don't match `params`.
@@ -219,10 +224,10 @@
 
   @property
   def model_fn(self):
-    """Returns the model_fn which is bound to self.params.
+    """Returns the `model_fn` which is bound to `self.params`.
 
     Returns:
-      The model_fn with following signature:
+      The `model_fn` with following signature:
         `def model_fn(features, labels, mode, config)`
     """
 
@@ -242,7 +247,7 @@
       Numpy array - value of the tensor.
 
     Raises:
-      ValueError: If the Estimator has not produced a checkpoint yet.
+      ValueError: If the `Estimator` has not produced a checkpoint yet.
     """
     _check_checkpoint_available(self.model_dir)
     with context.graph_mode():
@@ -255,14 +260,14 @@
       List of names.
 
     Raises:
-      ValueError: If the Estimator has not produced a checkpoint yet.
+      ValueError: If the `Estimator` has not produced a checkpoint yet.
     """
     _check_checkpoint_available(self.model_dir)
     with context.graph_mode():
       return [name for name, _ in training.list_variables(self.model_dir)]
 
   def latest_checkpoint(self):
-    """Finds the filename of latest saved checkpoint file in `model_dir`.
+    """Finds the filename of the latest saved checkpoint file in `model_dir`.
 
     Returns:
       The full path to the latest checkpoint or `None` if no checkpoint was
@@ -277,40 +282,38 @@
             steps=None,
             max_steps=None,
             saving_listeners=None):
-    """Trains a model given training data input_fn.
+    """Trains a model given training data `input_fn`.
 
     Args:
       input_fn: A function that provides input data for training as minibatches.
-        See @{$premade_estimators#create_input_functions} for more
-        information. The function should construct and return one of
-        the following:
-
-          * A 'tf.data.Dataset' object: Outputs of `Dataset` object must be a
-            tuple (features, labels) with same constraints as below.
-          * A tuple (features, labels): Where `features` is a `Tensor` or a
-            dictionary of string feature name to `Tensor` and `labels` is a
-            `Tensor` or a dictionary of string label name to `Tensor`. Both
-            `features` and `labels` are consumed by `model_fn`. They should
-            satisfy the expectation of `model_fn` from inputs.
-
-      hooks: List of `SessionRunHook` subclass instances. Used for callbacks
-        inside the training loop.
-      steps: Number of steps for which to train model. If `None`, train forever
-        or train until input_fn generates the `OutOfRange` error or
-        `StopIteration` exception. 'steps' works incrementally. If you call two
-        times train(steps=10) then training occurs in total 20 steps. If
-        `OutOfRange` or `StopIteration` occurs in the middle, training stops
+        See [Premade
+        Estimators](https://tensorflow.org/guide/premade_estimators#create_input_functions)
+        for more information. The function should construct and return one of
+        the following:  * A
+        `tf.data.Dataset` object: Outputs of `Dataset` object must be a tuple
+        `(features, labels)` with same constraints as below. * A tuple
+        `(features, labels)`: Where `features` is a `tf.Tensor` or a dictionary
+        of string feature name to `Tensor` and `labels` is a `Tensor` or a
+        dictionary of string label name to `Tensor`. Both `features` and
+        `labels` are consumed by `model_fn`. They should satisfy the expectation
+        of `model_fn` from inputs.
+      hooks: List of `tf.train.SessionRunHook` subclass instances. Used for
+        callbacks inside the training loop.
+      steps: Number of steps for which to train the model. If `None`, train
+        forever or train until `input_fn` generates the `tf.errors.OutOfRange`
+        error or `StopIteration` exception. `steps` works incrementally. If you
+        call two times `train(steps=10)` then training occurs in total 20 steps.
+        If `OutOfRange` or `StopIteration` occurs in the middle, training stops
         before 20 steps. If you don't want to have incremental behavior please
         set `max_steps` instead. If set, `max_steps` must be `None`.
       max_steps: Number of total steps for which to train model. If `None`,
-        train forever or train until input_fn generates the `OutOfRange` error
-        or `StopIteration` exception. If set, `steps` must be `None`. If
-        `OutOfRange` or `StopIteration` occurs in the middle, training stops
-        before `max_steps` steps.
-        Two calls to `train(steps=100)` means 200 training
-        iterations. On the other hand, two calls to `train(max_steps=100)` means
-        that the second call will not do any iteration since first call did
-        all 100 steps.
+        train forever or train until `input_fn` generates the
+        `tf.errors.OutOfRange` error or `StopIteration` exception. If set,
+        `steps` must be `None`. If `OutOfRange` or `StopIteration` occurs in the
+        middle, training stops before `max_steps` steps. Two calls to
+        `train(steps=100)` means 200 training iterations. On the other hand, two
+        calls to `train(max_steps=100)` means that the second call will not do
+        any iteration since first call did all 100 steps.
       saving_listeners: list of `CheckpointSaverListener` objects. Used for
         callbacks that run immediately before or after checkpoint savings.
 
@@ -319,8 +322,16 @@
 
     Raises:
       ValueError: If both `steps` and `max_steps` are not `None`.
-      ValueError: If either `steps` or `max_steps` is <= 0.
+      ValueError: If either `steps` or `max_steps` is `<= 0`.
     """
+    if self.config.task_type in (run_config.TaskType.EVALUATOR,
+                                 run_config.TaskType.PS):
+      raise ValueError(
+          'Train has been called with wrong configuration. Please use '
+          'tf.estimator.train_and_evaluate which calls proper API according '
+          'to given configuration. Current configuration: {}.'.format(
+              self.config))
+
     with context.graph_mode():
       if (steps is not None) and (max_steps is not None):
         raise ValueError('Can not provide both steps and max_steps.')
@@ -345,13 +356,29 @@
       return self
 
   def _convert_train_steps_to_hooks(self, steps, max_steps):
+    """Create hooks to run correct number of steps in training.
+
+    Args:
+      steps: number of steps to run during training.
+      max_steps: maximum number of steps to be run during training. It'll be
+        the maximum number of steps the model will train to after restoring
+        from checkpoint even across multiple estimator.train calls.
+
+    Returns:
+      List of hooks to be passed to the estimator.
+    """
     if steps is not None or max_steps is not None:
+      if self._train_distribution:
+        steps_per_run = getattr(self._train_distribution, 'steps_per_run', 1)
+        if steps_per_run > 1:
+          return [basic_session_run_hooks._MultiStepStopAtStepHook(  # pylint: disable=protected-access
+              steps, max_steps, steps_per_run)]
       return [training.StopAtStepHook(steps, max_steps)]
     else:
       return []
 
   def eval_dir(self, name=None):
-    """Shows directory name where evaluation metrics are dumped.
+    """Shows the directory name where evaluation metrics are dumped.
 
     Args:
       name: Name of the evaluation if user needs to run multiple evaluations on
@@ -367,36 +394,35 @@
 
   def evaluate(self, input_fn, steps=None, hooks=None, checkpoint_path=None,
                name=None):
-    """Evaluates the model given evaluation data input_fn.
+    """Evaluates the model given evaluation data `input_fn`.
 
     For each step, calls `input_fn`, which returns one batch of data.
     Evaluates until:
     - `steps` batches are processed, or
-    - `input_fn` raises an end-of-input exception (`OutOfRangeError` or
+    - `input_fn` raises an end-of-input exception (`tf.errors.OutOfRangeError`
+    or
     `StopIteration`).
 
     Args:
-      input_fn: A function that constructs the input data for evaluation.
-        See @{$premade_estimators#create_input_functions} for more
-        information. The function should construct and return one of
-        the following:
-
-          * A 'tf.data.Dataset' object: Outputs of `Dataset` object must be a
-            tuple (features, labels) with same constraints as below.
-          * A tuple (features, labels): Where `features` is a `Tensor` or a
-            dictionary of string feature name to `Tensor` and `labels` is a
-            `Tensor` or a dictionary of string label name to `Tensor`. Both
-            `features` and `labels` are consumed by `model_fn`. They should
-            satisfy the expectation of `model_fn` from inputs.
-
+      input_fn: A function that constructs the input data for evaluation. See
+        [Premade Estimators](https://tensorflow.org/guide/premade_estimators#create_input_functions)
+        for more information. The
+        function should construct and return one of the following:  * A
+        `tf.data.Dataset` object: Outputs of `Dataset` object must be a tuple
+        `(features, labels)` with same constraints as below. * A tuple
+        `(features, labels)`: Where `features` is a `tf.Tensor` or a dictionary
+        of string feature name to `Tensor` and `labels` is a `Tensor` or a
+        dictionary of string label name to `Tensor`. Both `features` and
+        `labels` are consumed by `model_fn`. They should satisfy the expectation
+        of `model_fn` from inputs.
       steps: Number of steps for which to evaluate model. If `None`, evaluates
         until `input_fn` raises an end-of-input exception.
-      hooks: List of `SessionRunHook` subclass instances. Used for callbacks
-        inside the evaluation call.
+      hooks: List of `tf.train.SessionRunHook` subclass instances. Used for
+        callbacks inside the evaluation call.
       checkpoint_path: Path of a specific checkpoint to evaluate. If `None`, the
         latest checkpoint in `model_dir` is used.  If there are no checkpoints
         in `model_dir`, evaluation is run with newly initialized `Variables`
-        instead of restored from checkpoint.
+        instead of ones restored from checkpoint.
       name: Name of the evaluation if user needs to run multiple evaluations on
         different data sets, such as on training data vs test data. Metrics for
         different evaluations are saved in separate folders, and appear
@@ -462,33 +488,34 @@
 
     Args:
       input_fn: A function that constructs the features. Prediction continues
-        until `input_fn` raises an end-of-input exception (`OutOfRangeError` or
-        `StopIteration`).
-        See @{$premade_estimators#create_input_functions} for more
-        information. The function should construct and return one of
+        until `input_fn` raises an end-of-input exception
+        (`tf.errors.OutOfRangeError` or `StopIteration`).
+        See [Premade
+        Estimators](https://tensorflow.org/guide/premade_estimators#create_input_functions)
+        for more information. The function should construct and return one of
         the following:
 
-          * A 'tf.data.Dataset' object: Outputs of `Dataset` object must have
+          * A `tf.data.Dataset` object: Outputs of `Dataset` object must have
             same constraints as below.
-          * features: A `Tensor` or a dictionary of string feature name to
+          * features: A `tf.Tensor` or a dictionary of string feature name to
             `Tensor`. features are consumed by `model_fn`. They should satisfy
             the expectation of `model_fn` from inputs.
           * A tuple, in which case the first item is extracted as features.
 
       predict_keys: list of `str`, name of the keys to predict. It is used if
-        the `EstimatorSpec.predictions` is a `dict`. If `predict_keys` is used
-        then rest of the predictions will be filtered from the dictionary. If
-        `None`, returns all.
-      hooks: List of `SessionRunHook` subclass instances. Used for callbacks
-        inside the prediction call.
+        the `tf.estimator.EstimatorSpec.predictions` is a `dict`. If
+        `predict_keys` is used then rest of the predictions will be filtered
+        from the dictionary. If `None`, returns all.
+      hooks: List of `tf.train.SessionRunHook` subclass instances. Used for
+        callbacks inside the prediction call.
       checkpoint_path: Path of a specific checkpoint to predict. If `None`, the
         latest checkpoint in `model_dir` is used.  If there are no checkpoints
         in `model_dir`, prediction is run with newly initialized `Variables`
-        instead of restored from checkpoint.
-      yield_single_examples: If False, yield the whole batch as returned by the
-        `model_fn` instead of decomposing the batch into individual elements.
-        This is useful if `model_fn` returns some tensors whose first dimension
-        is not equal to the batch size.
+        instead of ones restored from checkpoint.
+      yield_single_examples: If `False`, yields the whole batch as returned by
+        the `model_fn` instead of decomposing the batch into individual
+        elements. This is useful if `model_fn` returns some tensors whose first
+        dimension is not equal to the batch size.
 
     Yields:
       Evaluated values of `predictions` tensors.
@@ -496,10 +523,10 @@
     Raises:
       ValueError: Could not find a trained model in `model_dir`.
       ValueError: If batch length of predictions is not the same and
-        `yield_single_examples` is True.
+        `yield_single_examples` is `True`.
       ValueError: If there is a conflict between `predict_keys` and
         `predictions`. For example if `predict_keys` is not `None` but
-        `EstimatorSpec.predictions` is not a `dict`.
+        `tf.estimator.EstimatorSpec.predictions` is not a `dict`.
     """
     with context.graph_mode():
       hooks = _check_hooks_type(hooks)
@@ -554,14 +581,10 @@
       return
 
     allowed_overrides = set([
-        '_call_input_fn', '_call_model_fn',
-        '_convert_train_steps_to_hooks', '_convert_eval_steps_to_hooks',
-        '_create_global_step', '_create_and_assert_global_step',
+        '_create_and_assert_global_step',
         '_tf_api_names', '_tf_api_names_v1', '_estimator_api_names',
         '_estimator_api_names_v1', '_estimator_api_constants',
         '_estimator_api_constants_v1',
-        '_validate_features_in_predict_input',
-        '_add_meta_graph_for_mode'
     ])
     estimator_members = set([m for m in Estimator.__dict__.keys()
                              if not m.startswith('__')])
@@ -582,30 +605,33 @@
       checkpoint_path=None,
       strip_default_attrs=False):
     # pylint: disable=line-too-long
-    """Exports inference graph as a SavedModel into given dir.
+    """Exports inference graph as a `SavedModel` into the given dir.
 
     For a detailed guide, see
-    @{$saved_model#using_savedmodel_with_estimators$Using SavedModel with Estimators}.
+    [Using SavedModel with Estimators](https://tensorflow.org/guide/saved_model#using_savedmodel_with_estimators).
 
     This method builds a new graph by first calling the
-    serving_input_receiver_fn to obtain feature `Tensor`s, and then calling
-    this `Estimator`'s model_fn to generate the model graph based on those
+    `serving_input_receiver_fn` to obtain feature `Tensor`s, and then calling
+    this `Estimator`'s `model_fn` to generate the model graph based on those
     features. It restores the given checkpoint (or, lacking that, the most
     recent checkpoint) into this graph in a fresh session.  Finally it creates
-    a timestamped export directory below the given export_dir_base, and writes
-    a `SavedModel` into it containing a single `MetaGraphDef` saved from this
+    a timestamped export directory below the given `export_dir_base`, and writes
+    a `SavedModel` into it containing a single `tf.MetaGraphDef` saved from this
     session.
 
     The exported `MetaGraphDef` will provide one `SignatureDef` for each
-    element of the export_outputs dict returned from the model_fn, named using
+    element of the `export_outputs` dict returned from the `model_fn`, named
+    using
     the same keys.  One of these keys is always
-    signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY, indicating which
+    `tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY`,
+    indicating which
     signature will be served when a serving request does not specify one.
     For each signature, the outputs are provided by the corresponding
-    `ExportOutput`s, and the inputs are always the input receivers provided by
-    the serving_input_receiver_fn.
+    `tf.estimator.export.ExportOutput`s, and the inputs are always the input
+    receivers provided by
+    the `serving_input_receiver_fn`.
 
-    Extra assets may be written into the SavedModel via the assets_extra
+    Extra assets may be written into the `SavedModel` via the `assets_extra`
     argument.  This should be a dict, where each key gives a destination path
     (including the filename) relative to the assets.extra directory.  The
     corresponding value gives the full path of the source file to be copied.
@@ -614,23 +640,27 @@
 
     Args:
       export_dir_base: A string containing a directory in which to create
-        timestamped subdirectories containing exported SavedModels.
-      serving_input_receiver_fn: A function that takes no argument and
-        returns a `ServingInputReceiver` or `TensorServingInputReceiver`.
+        timestamped subdirectories containing exported `SavedModel`s.
+      serving_input_receiver_fn: A function that takes no argument and returns a
+        `tf.estimator.export.ServingInputReceiver` or
+        `tf.estimator.export.TensorServingInputReceiver`.
       assets_extra: A dict specifying how to populate the assets.extra directory
-        within the exported SavedModel, or `None` if no extra assets are needed.
-      as_text: whether to write the SavedModel proto in text format.
+        within the exported `SavedModel`, or `None` if no extra assets are
+        needed.
+      as_text: whether to write the `SavedModel` proto in text format.
       checkpoint_path: The checkpoint path to export.  If `None` (the default),
         the most recent checkpoint found within the model directory is chosen.
       strip_default_attrs: Boolean. If `True`, default-valued attributes will be
-        removed from the NodeDefs. For a detailed guide, see
-        [Stripping Default-Valued Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes).
+        removed from the `NodeDef`s. For a detailed guide, see [Stripping
+        Default-Valued
+        Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes).
 
     Returns:
       The string path to the exported directory.
 
     Raises:
-      ValueError: if no serving_input_receiver_fn is provided, no export_outputs
+      ValueError: if no `serving_input_receiver_fn` is provided, no
+      `export_outputs`
           are provided, or no checkpoint can be found.
     """
     # pylint: enable=line-too-long
@@ -651,35 +681,37 @@
       strip_default_attrs=False,
       mode=model_fn_lib.ModeKeys.PREDICT):
     # pylint: disable=line-too-long
-    """Exports a single train/eval/predict graph as a SavedModel.
+    """Exports a single train/eval/predict graph as a `SavedModel`.
 
-    This method is a wrapper for _export_all_saved_models, and wraps a raw
-    input_receiver_fn in a dictionary to pass in to that function.
-    See _export_all_saved_models for full docs.
+    This method is a wrapper for `_export_all_saved_models`, and wraps a raw
+    `input_receiver_fn` in a dictionary to pass in to that function.
+    See `_export_all_saved_models` for full docs.
 
-    See tf.contrib.estimator.export_saved_model_for_mode for the currently
+    See `tf.contrib.estimator.export_saved_model_for_mode` for the currently
     exposed version of this function.
 
     Args:
       export_dir_base: A string containing a directory in which to create
-        timestamped subdirectories containing exported SavedModels.
-      input_receiver_fn: a function that takes no argument and
-        returns the appropriate subclass of `InputReceiver`.
+        timestamped subdirectories containing exported `SavedModel`s.
+      input_receiver_fn: a function that takes no argument and returns the
+        appropriate subclass of `InputReceiver`.
       assets_extra: A dict specifying how to populate the assets.extra directory
-        within the exported SavedModel, or `None` if no extra assets are needed.
-      as_text: whether to write the SavedModel proto in text format.
+        within the exported `SavedModel`, or `None` if no extra assets are
+        needed.
+      as_text: whether to write the `SavedModel` proto in text format.
       checkpoint_path: The checkpoint path to export.  If `None` (the default),
         the most recent checkpoint found within the model directory is chosen.
       strip_default_attrs: Boolean. If `True`, default-valued attributes will be
-        removed from the NodeDefs. For a detailed guide, see
-        [Stripping Default-Valued Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes).
-      mode: tf.estimator.ModeKeys value indicating with mode will be exported.
+        removed from the `NodeDef`s. For a detailed guide, see [Stripping
+        Default-Valued
+        Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes).
+      mode: `tf.estimator.ModeKeys` value indicating which mode will be exported.
 
     Returns:
       The string path to the exported directory.
 
     Raises:
-      ValueError: if input_receiver_fn is None, no export_outputs
+      ValueError: if `input_receiver_fn` is `None`, no `export_outputs`
         are provided, or no checkpoint can be found.
     """
     # pylint: enable=line-too-long
@@ -703,40 +735,46 @@
       checkpoint_path=None,
       strip_default_attrs=False):
     # pylint: disable=line-too-long
-    """Exports a SavedModel containing MetaGraphDefs for each requested mode.
+    """Exports a `SavedModel` containing `tf.MetaGraphDefs` for each requested mode.
 
-    See tf.contrib.estimator.export_all_saved_models for the currently
+    See `tf.contrib.estimator.export_all_saved_models` for the currently
     exposed version of this function.
 
-    For each mode passed in via the input_receiver_fn_map,
-    this method builds a new graph by calling the input_receiver_fn to obtain
+    For each mode passed in via the `input_receiver_fn_map`,
+    this method builds a new graph by calling the `input_receiver_fn` to obtain
     feature and label `Tensor`s. Next, this method calls the `Estimator`'s
-    model_fn in the passed mode to generate the model graph based on
+    `model_fn` in the passed mode to generate the model graph based on
     those features and labels, and restores the given checkpoint
     (or, lacking that, the most recent checkpoint) into the graph.
-    Only one of the modes is used for saving variables to the SavedModel
-    (order of preference: TRAIN, EVAL, then PREDICT), such that up to three
-    MetaGraphDefs are saved with a single set of variables in a single
-    SavedModel directory.
+    Only one of the modes is used for saving variables to the `SavedModel`
+    (order of preference: @{tf.estimator.ModeKeys#TRAIN$TRAIN},
+    @{tf.estimator.ModeKeys#EVAL$EVAL}, then
+    @{tf.estimator.ModeKeys#PREDICT$PREDICT}), such that up to three
+    `tf.MetaGraphDefs` are saved with a single set of variables in a single
+    `SavedModel` directory.
 
-    For the variables and MetaGraphDefs, a timestamped export directory below
-    export_dir_base, and writes a `SavedModel` into it containing
-    the `MetaGraphDef` for the given mode and its associated signatures.
+    For the variables and `tf.MetaGraphDefs`, a timestamped export directory
+    below
+    `export_dir_base`, and writes a `SavedModel` into it containing
+    the `tf.MetaGraphDef` for the given mode and its associated signatures.
 
     For prediction, the exported `MetaGraphDef` will provide one `SignatureDef`
-    for each element of the export_outputs dict returned from the model_fn,
+    for each element of the `export_outputs` dict returned from the `model_fn`,
     named using the same keys.  One of these keys is always
-    signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY, indicating which
+    `tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY`,
+    indicating which
     signature will be served when a serving request does not specify one.
     For each signature, the outputs are provided by the corresponding
-    `ExportOutput`s, and the inputs are always the input receivers provided by
-    the serving_input_receiver_fn.
+    `tf.estimator.export.ExportOutput`s, and the inputs are always the input
+    receivers provided by
+    the `serving_input_receiver_fn`.
 
-    For training and evaluation, the train_op is stored in an extra collection,
-    and loss, metrics, and predictions are included in a SignatureDef for the
+    For training and evaluation, the `train_op` is stored in an extra
+    collection,
+    and loss, metrics, and predictions are included in a `SignatureDef` for the
     mode in question.
 
-    Extra assets may be written into the SavedModel via the assets_extra
+    Extra assets may be written into the `SavedModel` via the `assets_extra`
     argument.  This should be a dict, where each key gives a destination path
     (including the filename) relative to the assets.extra directory.  The
     corresponding value gives the full path of the source file to be copied.
@@ -745,25 +783,28 @@
 
     Args:
       export_dir_base: A string containing a directory in which to create
-        timestamped subdirectories containing exported SavedModels.
-      input_receiver_fn_map: dict of tf.estimator.ModeKeys to input_receiver_fn
-        mappings, where the input_receiver_fn is a function that takes no
-        argument and returns the appropriate subclass of `InputReceiver`.
+        timestamped subdirectories containing exported `SavedModel`s.
+      input_receiver_fn_map: dict of `tf.estimator.ModeKeys` to
+        `input_receiver_fn` mappings, where the `input_receiver_fn` is a
+        function that takes no arguments and returns the appropriate subclass of
+        `InputReceiver`.
       assets_extra: A dict specifying how to populate the assets.extra directory
-        within the exported SavedModel, or `None` if no extra assets are needed.
-      as_text: whether to write the SavedModel proto in text format.
+        within the exported `SavedModel`, or `None` if no extra assets are
+        needed.
+      as_text: whether to write the `SavedModel` proto in text format.
       checkpoint_path: The checkpoint path to export.  If `None` (the default),
         the most recent checkpoint found within the model directory is chosen.
       strip_default_attrs: Boolean. If `True`, default-valued attributes will be
-        removed from the NodeDefs. For a detailed guide, see
-        [Stripping Default-Valued Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes).
+        removed from the `NodeDef`s. For a detailed guide, see [Stripping
+        Default-Valued
+        Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes).
 
     Returns:
-      A dict of tf.estimator.ModeKeys value to string path for each exported
+      A dict of `tf.estimator.ModeKeys` value to string path for each exported
       directory.
 
     Raises:
-      ValueError: if any input_receiver_fn is None, no export_outputs
+      ValueError: if any `input_receiver_fn` is `None`, no `export_outputs`
         are provided, or no checkpoint can be found.
     """
     # pylint: enable=line-too-long
@@ -836,25 +877,29 @@
                                export_tags=None,
                                check_variables=True):
     # pylint: disable=line-too-long
-    """Loads variables and adds them along with a MetaGraphDef for saving.
+    """Loads variables and adds them along with a `tf.MetaGraphDef` for saving.
 
     Args:
-      builder: instance of SavedModelBuilder that will be used for saving.
-      input_receiver_fn_map: dict of tf.estimator.ModeKeys to input_receiver_fn
-        mappings, where the input_receiver_fn is a function that takes no
-        argument and returns the appropriate subclass of `InputReceiver`.
+      builder: instance of `tf.saved_model.builder.SavedModelBuilder` that will
+        be used for saving.
+      input_receiver_fn_map: dict of `tf.estimator.ModeKeys` to
+        `input_receiver_fn` mappings, where the `input_receiver_fn` is a
+        function that takes no argument and returns the appropriate subclass of
+        `InputReceiver`.
       checkpoint_path: The checkpoint path to export.  If `None` (the default),
         the most recent checkpoint found within the model directory is chosen.
       strip_default_attrs: Boolean. If `True`, default-valued attributes will be
-        removed from the NodeDefs. For a detailed guide, see
-        [Stripping Default-Valued Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes).
-      save_variables: bool, whether variables should be saved. If False, just
-        the MetaGraphDef will be saved. Note that save_variables should only be
-        True for the first call to this function, and the SavedModelBuilder will
-        raise an error if that is not the case.
-      mode: tf.estimator.ModeKeys value indicating which mode will be exported.
-      export_tags: The set of tags with which to save `MetaGraphDef`. If None,
-        a default set will be selected to matched the passed mode.
+        removed from the `NodeDef`s. For a detailed guide, see [Stripping
+        Default-Valued
+        Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes).
+      save_variables: bool, whether variables should be saved. If `False`, just
+        the `tf.MetaGraphDef` will be saved. Note that `save_variables` should
+        only be `True` for the first call to this function, and the
+        `SavedModelBuilder` will raise an error if that is not the case.
+      mode: `tf.estimator.ModeKeys` value indicating which mode will be
+        exported.
+      export_tags: The set of tags with which to save `tf.MetaGraphDef`. If
+        `None`, a default set will be selected to match the passed mode.
       check_variables: bool, whether to check the checkpoint has all variables.
 
     Raises:
@@ -936,21 +981,23 @@
           builder.add_meta_graph(**meta_graph_kwargs)
 
   def _get_export_outputs_for_spec(self, estimator_spec):
-    """Given an EstimatorSpec, determine what our export outputs should be.
+    """Given an `EstimatorSpec`, determine what our export outputs should be.
 
-    EstimatorSpecs contain export_outputs that are used for serving, but for
+    `EstimatorSpecs` contain `export_outputs` that are used for serving, but
+    for
     training and eval graphs, we must wrap the tensors of interest in
-    appropriate ExportOutput objects.
+    appropriate `tf.estimator.export.ExportOutput` objects.
 
     Args:
-      estimator_spec: EstimatorSpec object that will be exported.
+      estimator_spec: `tf.estimator.EstimatorSpec` object that will be exported.
 
     Returns:
-      a dict mapping export_output_name to ExportOutput object.
+      a dict mapping `export_output_name` to `tf.estimator.export.ExportOutput`
+      object.
 
     Raises:
-      ValueError: if an appropriate ExportOutput cannot be found for the
-        passed EstimatorSpec.mode
+      ValueError: if an appropriate `ExportOutput` cannot be found for the
+        passed `EstimatorSpec.mode`
     """
     mode = estimator_spec.mode
     if mode == model_fn_lib.ModeKeys.PREDICT:
@@ -988,7 +1035,7 @@
   def _get_features_and_labels_from_input_fn(self, input_fn, mode,
                                              distribution=None):
     """Extracts the `features` and labels from return values of `input_fn`."""
-    if distribution is not None and mode == model_fn_lib.ModeKeys.TRAIN:
+    if distribution is not None:
       result = distribution.distribute_dataset(
           lambda: self._call_input_fn(input_fn, mode))
     else:
@@ -1027,13 +1074,13 @@
     """Creates the global step tensor in graph.
 
     The global step tensor must be an integer type with name 'global_step' and
-    be added to the collection @{tf.GraphKeys.GLOBAL_STEP}.
+    be added to the collection @{tf.GraphKeys#GLOBAL_STEP$GLOBAL_STEP}.
 
     Args:
       graph: The graph in which to create the global step tensor.
 
     Returns:
-      The global step `Tensor`.
+      The global step `tf.Tensor`.
     """
     return training.create_global_step(graph)
 
@@ -1044,7 +1091,7 @@
       graph: The graph in which to create the global step tensor.
 
     Returns:
-      The global step `Tensor`.
+      The global step `tf.Tensor`.
     """
     step = self._create_global_step(graph)
     assert step == training.get_global_step()
@@ -1056,21 +1103,21 @@
 
     Args:
       input_fn: The input function.
-      mode: ModeKeys
+      mode: `tf.estimator.ModeKeys`
 
     Returns:
-      The return value of the passed input_fn, which should be one of:
+      The return value of the passed `input_fn`, which should be one of:
 
         * A 'tf.data.Dataset' object: Outputs of `Dataset` object must be a
-            tuple (features, labels) with same constraints as below.
-        * A tuple (features, labels): Where `features` is a `Tensor` or a
+            tuple `(features, labels)` with same constraints as below.
+        * A tuple `(features, labels)`: Where `features` is a `Tensor` or a
           dictionary of string feature name to `Tensor` and `labels` is a
           `Tensor` or a dictionary of string label name to `Tensor`. Both
           `features` and `labels` are consumed by `model_fn`. They should
           satisfy the expectation of `model_fn` from inputs.
 
     Raises:
-      ValueError: if input_fn takes invalid arguments.
+      ValueError: if `input_fn` takes invalid arguments.
     """
     input_fn_args = function_utils.fn_args(input_fn)
     kwargs = {}
@@ -1089,14 +1136,14 @@
     Args:
       features: features dict.
       labels: labels dict.
-      mode: ModeKeys
-      config: RunConfig
+      mode: `tf.estimator.ModeKeys`
+      config: `tf.estimator.RunConfig`
 
     Returns:
-      An `EstimatorSpec` object.
+      A `tf.estimator.EstimatorSpec` object.
 
     Raises:
-      ValueError: if model_fn returns invalid objects.
+      ValueError: if `model_fn` returns invalid objects.
     """
     model_fn_args = function_utils.fn_args(self._model_fn)
     kwargs = {}
@@ -1129,14 +1176,14 @@
       return self._train_model_default(input_fn, hooks, saving_listeners)
 
   def _train_model_default(self, input_fn, hooks, saving_listeners):
-    """Initiate training with input_fn, without DistributionStrategies.
+    """Initiate training with `input_fn`, without `DistributionStrategies`.
 
     Args:
       input_fn: A function that provides input data for training as minibatches.
-      hooks: List of `SessionRunHook` subclass instances. Used for callbacks
-        inside the training loop.
-      saving_listeners: list of `CheckpointSaverListener` objects. Used for
-        callbacks that run immediately before or after checkpoint savings.
+      hooks: List of `tf.train.SessionRunHook` subclass instances. Used for
+        callbacks inside the training loop.
+      saving_listeners: list of `tf.train.CheckpointSaverListener` objects. Used
+        for callbacks that run immediately before or after checkpoint savings.
 
     Returns:
       Loss from training
@@ -1163,14 +1210,14 @@
                                              saving_listeners)
 
   def _train_model_distributed(self, input_fn, hooks, saving_listeners):
-    """Initiate training with input_fn, using DistributionStrategies.
+    """Initiate training with `input_fn`, using `DistributionStrategies`.
 
     Args:
       input_fn: A function that provides input data for training as minibatches.
-      hooks: List of `SessionRunHook` subclass instances. Used for callbacks
-        inside the training loop.
-      saving_listeners: list of `CheckpointSaverListener` objects. Used for
-        callbacks that run immediately before or after checkpoint savings.
+      hooks: List of `tf.train.SessionRunHook` subclass instances. Used for
+        callbacks inside the training loop.
+      saving_listeners: list of `tf.train.CheckpointSaverListener` objects. Used
+        for callbacks that run immediately before or after checkpoint savings.
 
     Returns:
       Loss from training
@@ -1184,6 +1231,10 @@
 
     worker_hooks = []
     with ops.Graph().as_default() as g:
+      # We want to create the iterations variable outside the distribution scope
+      # as that is just stored on the host and mainly used to drive the loop
+      # and doesn't need to be a Mirrored/Device variable.
+      steps_per_run_variable = training.get_or_create_steps_per_run_variable()
       with self._train_distribution.scope():
         random_seed.set_random_seed(self._config.tf_random_seed)
 
@@ -1206,28 +1257,29 @@
               self._train_distribution.read_var(global_step_tensor))
 
           # Create a step_fn from the train_op of grouped_estimator_spec
-          def step_fn(ctx, inputs):
+          def step_fn(ctx, features, labels):
             """A single step that is passed to run_on_dataset."""
-            features, labels = inputs
             estimator_spec = self._train_distribution.call_for_each_tower(
                 self._call_model_fn,
                 features,
                 labels,
                 model_fn_lib.ModeKeys.TRAIN,
                 self.config)
-            ctx.last_step_outputs = estimator_spec.loss
-            ctx.non_tensor_outputs = {'estimator_spec': estimator_spec}
-            with ops.control_dependencies([estimator_spec.train_op]):
-              return array_ops.identity(estimator_spec.loss)
+            ctx.set_last_step_output(
+                name='loss',
+                output=estimator_spec.loss,
+                aggregation=distribute_lib.get_loss_reduction())
+            ctx.set_non_tensor_output(
+                name='estimator_spec', output=estimator_spec)
+            return estimator_spec.train_op
 
           # Create new train_op post graph rewrites
-          # TODO(sourabhbajaj): Make sure train_steps and tpu_iterations
-          # work correctly. Currently hardcoded at 2
           initial_training_loss = constant_op.constant(1e7)
-          distributed_train_op, tpu_result, ctx = \
-              self._train_distribution._run_steps_on_dataset(  # pylint: disable=protected-access
-                  step_fn, iterator, iterations=2,
-                  initial_loop_values=initial_training_loss)
+          ctx = self._train_distribution.run_steps_on_dataset(
+              step_fn, iterator, iterations=steps_per_run_variable,
+              initial_loop_values={'loss': initial_training_loss})
+          distributed_train_op = ctx.run_op
+          tpu_result = ctx.last_step_outputs
           grouped_estimator_spec = ctx.non_tensor_outputs['estimator_spec']
         else:
           features, labels, input_hooks = (
@@ -1263,22 +1315,22 @@
 
         # TODO(sourabhbajaj): Merge the two code paths and clean up the code
         if is_tpu_strategy:
-          distributed_loss = tpu_result
+          loss = tpu_result['loss']
           worker_hooks.append(
               estimator_util.StrategyInitFinalizeHook(
-                  self._train_distribution.get_initialization_ops,
-                  self._train_distribution.get_finalize_ops))
+                  self._train_distribution.initialize,
+                  self._train_distribution.finalize))
         else:
-          distributed_loss = grouped_estimator_spec.loss
+          loss = self._train_distribution.unwrap(
+              self._train_distribution.reduce(
+                  distribute_lib.get_loss_reduction(),
+                  grouped_estimator_spec.loss,
+                  destinations='/device:CPU:0'))[0]
           distributed_train_op = grouped_estimator_spec.train_op
 
         estimator_spec = model_fn_lib.EstimatorSpec(
             mode=grouped_estimator_spec.mode,
-            loss=self._train_distribution.unwrap(
-                self._train_distribution.reduce(
-                    distribute_lib.get_loss_reduction(),
-                    distributed_loss,
-                    destinations='/device:CPU:0'))[0],
+            loss=loss,
             train_op=self._train_distribution.group(distributed_train_op),
             training_hooks=training_hooks,
             training_chief_hooks=training_chief_hooks,
@@ -1512,9 +1564,9 @@
           "`model_dir` are set both in constructor and `RunConfig`, but with "
           "different values. In constructor: '{}', in `RunConfig`: "
           "'{}' ".format(model_dir, config.model_dir))
-    if model_dir:
-      config = run_config.RunConfig.replace(config, model_dir=model_dir)
-  if getattr(config, 'model_dir', None) is None:
+  if model_dir:
+    config = run_config.RunConfig.replace(config, model_dir=model_dir)
+  elif getattr(config, 'model_dir', None) is None:
     model_dir = tempfile.mkdtemp()
     logging.warning('Using temporary folder as model directory: %s', model_dir)
     config = run_config.RunConfig.replace(config, model_dir=model_dir)
@@ -1523,7 +1575,7 @@
 
 
 def create_per_tower_ready_op(scaffold):
-  """Create a Scaffold.ready_op inside a tower."""
+  """Create a `tf.train.Scaffold.ready_op` inside a tower."""
   if scaffold.ready_op:
     return scaffold.ready_op
 
@@ -1538,7 +1590,7 @@
 
 
 def create_per_tower_ready_for_local_init_op(scaffold):
-  """Create a Scaffold.ready_for_local_init_op inside a tower."""
+  """Create a `tf.train.Scaffold.ready_for_local_init_op` inside a tower."""
   if scaffold.ready_for_local_init_op:
     return scaffold.ready_for_local_init_op
 
@@ -1636,7 +1688,7 @@
 
 
 def _check_hooks_type(hooks):
-  """Returns hooks if all are SessionRunHook, raises TypeError otherwise."""
+  """Returns hooks if all are `SessionRunHook`, raises TypeError otherwise."""
   hooks = list(hooks or [])
   for h in hooks:
     if not isinstance(h, training.SessionRunHook):
@@ -1656,17 +1708,18 @@
 
 
 def _get_replica_device_setter(config):
-  """Creates a replica device setter if required as a default device_fn.
+  """Creates a replica device setter if required as a default `device_fn`.
 
-  `Estimator` uses ReplicaDeviceSetter as a default device placer. It sets the
-  distributed related arguments such as number of ps_replicas based on given
-  config.
+  `Estimator` uses `tf.train.ReplicaDeviceSetter` as a default device placer. It
+  sets the
+  distributed related arguments such as number of `ps_replicas` based on given
+  `config`.
 
   Args:
-    config: A `RunConfig` instance.
+    config: A `tf.estimator.RunConfig` instance.
 
   Returns:
-    A replica device setter, or None.
+    A replica device setter, or `None`.
   """
   if config.task_type:
     worker_device = '/job:%s/task:%d' % (config.task_type, config.task_id)
@@ -1685,7 +1738,7 @@
 
 
 def _verify_model_fn_args(model_fn, params):
-  """Verifies model fn arguments."""
+  """Verifies `model_fn` arguments."""
   args = set(function_utils.fn_args(model_fn))
   if 'features' not in args:
     raise ValueError('model_fn (%s) must include features argument.' % model_fn)
@@ -1783,10 +1836,24 @@
         logging.warn('Skipping summary for %s, cannot parse string to Summary.',
                      key)
         continue
+    elif isinstance(dictionary[key], np.ndarray):
+      value = summary_proto.value.add()
+      value.tag = key
+      value.node_name = key
+      tensor_proto = tensor_util.make_tensor_proto(dictionary[key])
+      value.tensor.CopyFrom(tensor_proto)
+      # pylint: disable=line-too-long
+      logging.info(
+          'Summary for np.ndarray is not visible in Tensorboard by default. '
+          'Consider using a Tensorboard plugin for visualization (see '
+          'https://github.com/tensorflow/tensorboard-plugin-example/blob/master/README.md'
+          ' for more information).')
+      # pylint: enable=line-too-long
     else:
       logging.warn(
           'Skipping summary for %s, must be a float, np.float32, np.int64, '
-          'np.int32 or int or a serialized string of Summary.', key)
+          'np.int32 or int or np.ndarray or a serialized string of Summary.',
+          key)
   summary_writer.add_summary(summary_proto, current_global_step)
   summary_writer.flush()
 
@@ -1816,7 +1883,7 @@
 
 
 def _has_dataset_or_queue_runner(maybe_tensor):
-  """Returns True if TF dataset or QueueRunner has been used."""
+  """Returns `True` if `Dataset` or `QueueRunner` has been used."""
   # Check TF dataset first. Here, we use a simple algorithm to check the top
   # level Tensors only, which should be sufficient for most users.
   tensors = [x for x in nest.flatten(maybe_tensor) if isinstance(x, ops.Tensor)]
@@ -1839,9 +1906,9 @@
         'var_name_to_vocab_info',
         'var_name_to_prev_var_name',
     ])):
-  """Settings for warm-starting in Estimators.
+  """Settings for warm-starting in `tf.estimator.Estimators`.
 
-  Example Use with canned `DNNEstimator`:
+  Example Use with canned `tf.estimator.DNNEstimator`:
 
   ```
   emb_vocab_file = tf.feature_column.embedding_column(
@@ -1958,23 +2025,19 @@
     ckpt_to_initialize_from: [Required] A string specifying the directory with
       checkpoint file(s) or path to checkpoint from which to warm-start the
       model parameters.
-    vars_to_warm_start: [Optional] One of the following:
-
-      - A regular expression (string) that captures which variables to
-        warm-start (see tf.get_collection).  This expression will only consider
-        variables in the TRAINABLE_VARIABLES collection.
-      - A list of Variables to warm-start.
-      - A list of strings, each representing a full variable name to warm-start.
-      - `None`, in which case only variables specified in
-        `var_name_to_vocab_info` will be warm-started.
-
-      Defaults to `'.*'`, which warm-starts all variables in the
-      TRAINABLE_VARIABLES collection.  Note that this excludes variables such as
-      accumulators and moving statistics from batch norm.
+    vars_to_warm_start: [Optional] One of the following:  - A regular expression
+      (string) that captures which variables to warm-start (see
+      `tf.get_collection`).  This expression will only consider variables in the
+      `TRAINABLE_VARIABLES` collection. - A list of Variables to warm-start. - A
+      list of strings, each representing a full variable name to warm-start. -
+      `None`, in which case only variables specified in `var_name_to_vocab_info`
+      will be warm-started.  Defaults to `'.*'`, which warm-starts all variables
+      in the `TRAINABLE_VARIABLES` collection.  Note that this excludes
+      variables such as accumulators and moving statistics from batch norm.
     var_name_to_vocab_info: [Optional] Dict of variable names (strings) to
-      VocabInfo. The variable names should be "full" variables, not the names
-      of the partitions.  If not explicitly provided, the variable is assumed to
-      have no vocabulary.
+      `tf.estimator.VocabInfo`. The variable names should be "full" variables,
+      not the names of the partitions.  If not explicitly provided, the variable
+      is assumed to have no vocabulary.
     var_name_to_prev_var_name: [Optional] Dict of variable names (strings) to
       name of the previously-trained variable in `ckpt_to_initialize_from`. If
       not explicitly provided, the name of the variable is assumed to be same
@@ -1999,43 +2062,45 @@
 
 
 def _get_saved_model_ckpt(saved_model_dir):
-  """Return path to variables checkpoint in a SavedModel directory."""
+  """Return path to variables checkpoint in a `SavedModel` directory."""
   if not gfile.Exists(
-      os.path.join(compat.as_bytes(saved_model_dir),
-                   compat.as_bytes('variables/variables.index'))):
+      os.path.join(saved_model_utils.get_variables_dir(saved_model_dir),
+                   compat.as_text('variables.index'))):
     raise ValueError('Directory provided has an invalid SavedModel format: %s'
                      % saved_model_dir)
-  return os.path.join(
-      compat.as_bytes(saved_model_dir),
-      compat.as_bytes('{}/{}'.format(constants.VARIABLES_DIRECTORY,
-                                     constants.VARIABLES_FILENAME)))
+  return saved_model_utils.get_variables_path(saved_model_dir)
 
 
 def _get_default_warm_start_settings(warm_start_from):
-  """Returns default WarmStartSettings.
+  """Returns default `tf.estimator.WarmStartSettings`.
 
   Args:
     warm_start_from: Either a string representing the filepath of a checkpoint
-      or SavedModel to initialize from, or an instance of WarmStartSettings.
+      or `SavedModel` to initialize from, or an instance of
+      `tf.estimator.WarmStartSettings`.
 
   Returns:
-    Either None or an instance of WarmStartSettings.
+    Either `None` or an instance of `WarmStartSettings`.
 
   Raises:
-    ValueError: If warm_start_from is not None but is neither a string nor an
-      instance of WarmStartSettings.
+    ValueError: If `warm_start_from` is not `None` but is neither a string nor
+    an
+      instance of `WarmStartSettings`.
   """
   if warm_start_from is None:
     return None
   if isinstance(warm_start_from, (six.string_types, six.binary_type)):
     # Infer that this is a SavedModel if export_path +
     # 'variables/variables.index' exists, and if so, construct the
-    # WarmStartSettings pointing to export_path + 'variables/variables'.
-    if gfile.Exists(os.path.join(compat.as_bytes(warm_start_from),
-                                 compat.as_bytes('variables/variables.index'))):
+    # WarmStartSettings pointing to the variables path
+    # (export_path + 'variables/variables').
+    if gfile.Exists(os.path.join(
+        saved_model_utils.get_variables_dir(warm_start_from),
+        compat.as_text('variables.index'))):
       logging.info('Warm-starting from a SavedModel')
       return WarmStartSettings(
-          ckpt_to_initialize_from=_get_saved_model_ckpt(warm_start_from))
+          ckpt_to_initialize_from=saved_model_utils.get_variables_path(
+              warm_start_from))
     return WarmStartSettings(ckpt_to_initialize_from=warm_start_from)
   elif isinstance(warm_start_from, WarmStartSettings):
     return warm_start_from
diff --git a/tensorflow/python/estimator/estimator_test.py b/tensorflow/python/estimator/estimator_test.py
index e855209..d316742 100644
--- a/tensorflow/python/estimator/estimator_test.py
+++ b/tensorflow/python/estimator/estimator_test.py
@@ -58,6 +58,7 @@
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.ops.losses import losses
+from tensorflow.python.ops.random_ops import random_uniform
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging as logging
@@ -158,16 +159,7 @@
       def __init__(self):
         super(_Estimator, self).__init__(model_fn=dummy_model_fn)
 
-      def _call_input_fn(self, input_fn, mode):
-        return input_fn()
-
-      def _create_global_step(self, graph):
-        pass
-
-      def _convert_train_steps_to_hooks(self, steps, max_steps):
-        pass
-
-      def _convert_eval_steps_to_hooks(self, steps):
+      def _tf_api_names(self):
         pass
 
     _Estimator()
@@ -473,6 +465,29 @@
     est.train(InputFn(), steps=1)
     self.assertEqual(1, input_fn_call_count[0])
 
+  def test_nested_input_fn(self):
+    expected_params = {'batch_size': 10}
+
+    def _input_fn():
+      dataset_features = dataset_ops.Dataset.from_tensor_slices(
+          (random_uniform([4]),
+           random_uniform([4, 100], maxval=100, dtype=dtypes.int32)))
+      dataset_labels = dataset_ops.Dataset.from_tensor_slices(
+          random_uniform([4, 10]))
+      dataset = dataset_ops.Dataset.zip((dataset_features, dataset_labels))
+      dataset = dataset.repeat(-1)
+      iterator = dataset.make_initializable_iterator()
+      return iterator.get_next()
+
+    def _model_fn(features, labels, mode, params, config):
+      del params, config
+      return model_fn_global_step_incrementer(features, labels, mode)
+
+    expected_config = run_config.RunConfig().replace(tf_random_seed=4321)
+    est = estimator.Estimator(
+        model_fn=_model_fn, params=expected_params, config=expected_config)
+    est.train(_input_fn, steps=4)
+
   def test_input_fn_args(self):
     expected_mode = model_fn_lib.ModeKeys.TRAIN
     expected_params = {'batch_size': 10}
@@ -940,6 +955,19 @@
     est = estimator.Estimator(model_fn=_model_fn)
     est.train(dummy_input_fn, steps=1)
 
+  def test_config_should_not_be_evaluator_or_ps(self):
+
+    class FakeEvaluatorConfig(run_config.RunConfig):
+
+      @property
+      def task_type(self):
+        return run_config.TaskType.EVALUATOR
+
+    est = estimator.Estimator(
+        model_fn=dummy_model_fn, config=FakeEvaluatorConfig())
+    with self.assertRaisesRegexp(ValueError, 'train_and_evaluate'):
+      est.train(dummy_input_fn, steps=1)
+
 
 def _model_fn_with_eval_metric_ops(features, labels, mode, params):
   _, _ = features, labels
@@ -1458,6 +1486,48 @@
     self.assertProtoEquals(expected_tensor_proto,
                            next(summaries).value[0].tensor)
 
+  def test_summary_writing_with_tensor(self):
+
+    def model_fn_with_prediction_mean_tensor_eval_metric_ops(
+        features, labels, mode, params):
+      _, _ = features, labels
+      global_step = training.get_global_step()
+
+      metric_name = params.get('metric_name') or 'metric'
+      predictions = constant_op.constant([1., .5, 0.])
+      eval_metric_ops = {metric_name: metrics_lib.mean_tensor(predictions)}
+      return model_fn_lib.EstimatorSpec(
+          mode,
+          loss=constant_op.constant(1.),
+          predictions={'predictions': predictions},
+          train_op=state_ops.assign_add(global_step, 1),
+          eval_metric_ops=eval_metric_ops)
+
+    metric_key = 'PMT'
+    params = {
+        'metric_name': metric_key,
+    }
+    est = estimator.Estimator(
+        model_fn=model_fn_with_prediction_mean_tensor_eval_metric_ops,
+        params=params,
+        config=run_config.RunConfig(save_summary_steps=1))
+    est.train(input_fn=dummy_input_fn, steps=10)
+    est.evaluate(
+        input_fn=dummy_input_fn,
+        steps=10,
+    )
+
+    writer_cache.FileWriterCache.clear()
+
+    self.assertTrue(
+        check_eventfile_for_keyword(metric_key, est.eval_dir()),
+        '{} should be part of reported summaries.'.format(metric_key))
+
+    summaries = summaries_with_matching_keyword(metric_key, est.eval_dir())
+    for value in next(summaries).value:
+      if value.tag == metric_key:
+        self.assertTrue(value.HasField('tensor'))
+
 
 class EstimatorPredictTest(test.TestCase):
 
@@ -2641,6 +2711,7 @@
       _, _ = features, labels
       my_int = variables.Variable(1, name='my_int',
                                   collections=[ops.GraphKeys.LOCAL_VARIABLES])
+      _ = training.get_or_create_steps_per_run_variable()
       scores = constant_op.constant([3.])
       with ops.control_dependencies([
           variables.local_variables_initializer(),
diff --git a/tensorflow/python/estimator/export/export.py b/tensorflow/python/estimator/export/export.py
index 529e7a8..3d171f7 100644
--- a/tensorflow/python/estimator/export/export.py
+++ b/tensorflow/python/estimator/export/export.py
@@ -288,9 +288,8 @@
 
 
 def _placeholder_from_tensor(t, default_batch_size=None):
-  shape_list = t.get_shape().as_list()
-  shape_list[0] = default_batch_size
-  shape = tensor_shape.TensorShape(shape_list)
+  batch_shape = tensor_shape.TensorShape([default_batch_size])
+  shape = batch_shape.concatenate(t.get_shape()[1:])
 
   # Reuse the feature tensor's op name (t.op.name) for the placeholder,
   # excluding the index from the tensor's name (t.name):
diff --git a/tensorflow/python/estimator/export/export_test.py b/tensorflow/python/estimator/export/export_test.py
index d2ac7f0..1d475ad 100644
--- a/tensorflow/python/estimator/export/export_test.py
+++ b/tensorflow/python/estimator/export/export_test.py
@@ -31,6 +31,7 @@
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import parsing_ops
@@ -378,6 +379,20 @@
     v = serving_input_receiver_fn()
     self.assertTrue(isinstance(v, export.ServingInputReceiver))
 
+  def test_build_raw_serving_input_receiver_fn_without_shape(self):
+    """Test case for issue #21178."""
+    f = {"feature_1": array_ops.placeholder(dtypes.float32),
+         "feature_2": array_ops.placeholder(dtypes.int32)}
+    serving_input_receiver_fn = export.build_raw_serving_input_receiver_fn(f)
+    v = serving_input_receiver_fn()
+    self.assertTrue(isinstance(v, export.ServingInputReceiver))
+    self.assertEqual(
+        tensor_shape.unknown_shape(),
+        v.receiver_tensors["feature_1"].shape)
+    self.assertEqual(
+        tensor_shape.unknown_shape(),
+        v.receiver_tensors["feature_2"].shape)
+
   def test_build_raw_serving_input_receiver_fn(self):
     features = {"feature_1": constant_op.constant(["hello"]),
                 "feature_2": constant_op.constant([42])}
diff --git a/tensorflow/python/estimator/exporter_test.py b/tensorflow/python/estimator/exporter_test.py
index c4b0069..fcccfbd 100644
--- a/tensorflow/python/estimator/exporter_test.py
+++ b/tensorflow/python/estimator/exporter_test.py
@@ -323,6 +323,43 @@
     self.assertTrue(gfile.Exists(export_dir_3))
     self.assertTrue(gfile.Exists(export_dir_4))
 
+  def test_garbage_collect_exports_with_trailing_delimiter(self):
+    export_dir_base = tempfile.mkdtemp() + "export/"
+    gfile.MkDir(export_dir_base)
+    export_dir_1 = _create_test_export_dir(export_dir_base)
+    export_dir_2 = _create_test_export_dir(export_dir_base)
+    export_dir_3 = _create_test_export_dir(export_dir_base)
+    export_dir_4 = _create_test_export_dir(export_dir_base)
+
+    self.assertTrue(gfile.Exists(export_dir_1))
+    self.assertTrue(gfile.Exists(export_dir_2))
+    self.assertTrue(gfile.Exists(export_dir_3))
+    self.assertTrue(gfile.Exists(export_dir_4))
+
+    def _serving_input_receiver_fn():
+      return array_ops.constant([1]), None
+
+    exporter = exporter_lib.LatestExporter(
+        name="latest_exporter",
+        serving_input_receiver_fn=_serving_input_receiver_fn,
+        exports_to_keep=1)
+    estimator = test.mock.Mock(spec=estimator_lib.Estimator)
+    # Garbage collect all but the most recent export,
+    # where recency is determined based on the timestamp directory names.
+    with test.mock.patch.object(gfile, "ListDirectory") as mock_list_directory:
+      mock_list_directory.return_value = [
+          os.path.basename(export_dir_1) + b"/",
+          os.path.basename(export_dir_2) + b"/",
+          os.path.basename(export_dir_3) + b"/",
+          os.path.basename(export_dir_4) + b"/",
+          ]
+      exporter.export(estimator, export_dir_base, None, None, False)
+
+    self.assertFalse(gfile.Exists(export_dir_1))
+    self.assertFalse(gfile.Exists(export_dir_2))
+    self.assertFalse(gfile.Exists(export_dir_3))
+    self.assertTrue(gfile.Exists(export_dir_4))
+
 
 def _create_test_export_dir(export_dir_base):
   export_dir = _get_timestamped_export_dir(export_dir_base)
diff --git a/tensorflow/python/estimator/gc.py b/tensorflow/python/estimator/gc.py
index 9f8a463..03ad33d 100644
--- a/tensorflow/python/estimator/gc.py
+++ b/tensorflow/python/estimator/gc.py
@@ -201,9 +201,11 @@
   raw_paths = gfile.ListDirectory(base_dir)
   paths = []
   for r in raw_paths:
-    p = parser(Path(os.path.join(compat.as_str_any(base_dir),
-                                 compat.as_str_any(r)),
-                    None))
+    # ListDirectory() returns paths with a trailing "/" when base_dir is a GCS URL.
+    r = compat.as_str_any(r)
+    if r[-1] == '/':
+      r = r[0:len(r)-1]
+    p = parser(Path(os.path.join(compat.as_str_any(base_dir), r), None))
     if p:
       paths.append(p)
   return sorted(paths)
diff --git a/tensorflow/python/estimator/gc_test.py b/tensorflow/python/estimator/gc_test.py
index 2cbdd51..53c3d4c 100644
--- a/tensorflow/python/estimator/gc_test.py
+++ b/tensorflow/python/estimator/gc_test.py
@@ -140,6 +140,17 @@
       gfile.MakeDirs(os.path.join(compat.as_str_any(base_dir), "42"))
       gc._get_paths(base_dir, _create_parser(base_dir))
 
+  def testGcsDirWithSeparator(self):
+    base_dir = "gs://bucket/foo"
+    with test.mock.patch.object(gfile, "ListDirectory") as mock_list_directory:
+      # gfile.ListDirectory returns directory names with a trailing separator '/'.
+      mock_list_directory.return_value = ["0/", "1/"]
+      self.assertEqual(
+          gc._get_paths(base_dir, _create_parser(base_dir)),
+          [
+              gc.Path(os.path.join(base_dir, "0"), 0),
+              gc.Path(os.path.join(base_dir, "1"), 1)
+          ])
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/estimator/inputs/numpy_io_test.py b/tensorflow/python/estimator/inputs/numpy_io_test.py
index 81b201c..4e7b00b 100644
--- a/tensorflow/python/estimator/inputs/numpy_io_test.py
+++ b/tensorflow/python/estimator/inputs/numpy_io_test.py
@@ -19,9 +19,15 @@
 from __future__ import print_function
 
 import numpy as np
-
+from tensorflow.python.client import session as session_lib
 from tensorflow.python.estimator.inputs import numpy_io
+from tensorflow.python.feature_column import feature_column_lib as fc
+from tensorflow.python.feature_column.feature_column import _LinearModel
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import lookup_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import test
 from tensorflow.python.training import coordinator
 from tensorflow.python.training import monitored_session
@@ -456,5 +462,159 @@
       self.assertAllEqual(res_arr[1], res_dict[1])
 
 
+class FeatureColumnIntegrationTest(test.TestCase):
+
+  def _initialized_session(self, config=None):
+    sess = session_lib.Session(config=config)
+    sess.run(variables_lib.global_variables_initializer())
+    sess.run(lookup_ops.tables_initializer())
+    return sess
+
+  def _get_linear_model_bias(self, name='linear_model'):
+    with variable_scope.variable_scope(name, reuse=True):
+      return variable_scope.get_variable('bias_weights')
+
+  def _get_linear_model_column_var(self, column, name='linear_model'):
+    return ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES,
+                              name + '/' + column.name)[0]
+
+  def _get_keras_linear_model_predictions(
+      self,
+      features,
+      feature_columns,
+      units=1,
+      sparse_combiner='sum',
+      weight_collections=None,
+      trainable=True,
+      cols_to_vars=None):
+    keras_linear_model = _LinearModel(
+        feature_columns,
+        units,
+        sparse_combiner,
+        weight_collections,
+        trainable,
+        name='linear_model')
+    retval = keras_linear_model(features)  # pylint: disable=not-callable
+    if cols_to_vars is not None:
+      cols_to_vars.update(keras_linear_model.cols_to_vars())
+    return retval
+
+  def test_linear_model_numpy_input_fn(self):
+    price = fc.numeric_column('price')
+    price_buckets = fc.bucketized_column(price, boundaries=[0., 10., 100.,])
+    body_style = fc.categorical_column_with_vocabulary_list(
+        'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
+
+    input_fn = numpy_io.numpy_input_fn(
+        x={
+            'price': np.array([-1., 2., 13., 104.]),
+            'body-style': np.array(['sedan', 'hardtop', 'wagon', 'sedan']),
+        },
+        batch_size=2,
+        shuffle=False)
+    features = input_fn()
+    net = fc.linear_model(features, [price_buckets, body_style])
+    # self.assertEqual(1 + 3 + 5, net.shape[1])
+    with self._initialized_session() as sess:
+      coord = coordinator.Coordinator()
+      threads = queue_runner_impl.start_queue_runners(sess, coord=coord)
+
+      bias = self._get_linear_model_bias()
+      price_buckets_var = self._get_linear_model_column_var(price_buckets)
+      body_style_var = self._get_linear_model_column_var(body_style)
+
+      sess.run(price_buckets_var.assign([[10.], [100.], [1000.], [10000.]]))
+      sess.run(body_style_var.assign([[-10.], [-100.], [-1000.]]))
+      sess.run(bias.assign([5.]))
+
+      self.assertAllClose([[10 - 1000 + 5.], [100 - 10 + 5.]], sess.run(net))
+
+      coord.request_stop()
+      coord.join(threads)
+
+  def test_linear_model_impl_numpy_input_fn(self):
+    price = fc.numeric_column('price')
+    price_buckets = fc.bucketized_column(
+        price, boundaries=[
+            0.,
+            10.,
+            100.,
+        ])
+    body_style = fc.categorical_column_with_vocabulary_list(
+        'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
+
+    input_fn = numpy_io.numpy_input_fn(
+        x={
+            'price': np.array([-1., 2., 13., 104.]),
+            'body-style': np.array(['sedan', 'hardtop', 'wagon', 'sedan']),
+        },
+        batch_size=2,
+        shuffle=False)
+    features = input_fn()
+    net = self._get_keras_linear_model_predictions(
+        features, [price_buckets, body_style])
+    # self.assertEqual(1 + 3 + 5, net.shape[1])
+    with self._initialized_session() as sess:
+      coord = coordinator.Coordinator()
+      threads = queue_runner_impl.start_queue_runners(sess, coord=coord)
+
+      bias = self._get_linear_model_bias()
+      price_buckets_var = self._get_linear_model_column_var(price_buckets)
+      body_style_var = self._get_linear_model_column_var(body_style)
+
+      sess.run(price_buckets_var.assign([[10.], [100.], [1000.], [10000.]]))
+      sess.run(body_style_var.assign([[-10.], [-100.], [-1000.]]))
+      sess.run(bias.assign([5.]))
+
+      self.assertAllClose([[10 - 1000 + 5.], [100 - 10 + 5.]], sess.run(net))
+
+      coord.request_stop()
+      coord.join(threads)
+
+  def test_functional_input_layer_with_numpy_input_fn(self):
+    embedding_values = (
+        (1., 2., 3., 4., 5.),  # id 0
+        (6., 7., 8., 9., 10.),  # id 1
+        (11., 12., 13., 14., 15.)  # id 2
+    )
+    def _initializer(shape, dtype, partition_info):
+      del shape, dtype, partition_info
+      return embedding_values
+
+    # price has 1 dimension in input_layer
+    price = fc.numeric_column('price')
+    body_style = fc.categorical_column_with_vocabulary_list(
+        'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
+    # one_hot_body_style has 3 dims in input_layer.
+    one_hot_body_style = fc.indicator_column(body_style)
+    # embedded_body_style has 5 dims in input_layer.
+    embedded_body_style = fc.embedding_column(body_style, dimension=5,
+                                              initializer=_initializer)
+
+    input_fn = numpy_io.numpy_input_fn(
+        x={
+            'price': np.array([11., 12., 13., 14.]),
+            'body-style': np.array(['sedan', 'hardtop', 'wagon', 'sedan']),
+        },
+        batch_size=2,
+        shuffle=False)
+    features = input_fn()
+    net = fc.input_layer(features,
+                         [price, one_hot_body_style, embedded_body_style])
+    self.assertEqual(1 + 3 + 5, net.shape[1])
+    with self._initialized_session() as sess:
+      coord = coordinator.Coordinator()
+      threads = queue_runner_impl.start_queue_runners(sess, coord=coord)
+
+      # Each row is formed by concatenating `embedded_body_style`,
+      # `one_hot_body_style`, and `price` in order.
+      self.assertAllEqual(
+          [[11., 12., 13., 14., 15., 0., 0., 1., 11.],
+           [1., 2., 3., 4., 5., 1., 0., 0., 12]],
+          sess.run(net))
+
+      coord.request_stop()
+      coord.join(threads)
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/estimator/keras.py b/tensorflow/python/estimator/keras.py
index c91204a..e2b8bfa 100644
--- a/tensorflow/python/estimator/keras.py
+++ b/tensorflow/python/estimator/keras.py
@@ -43,7 +43,7 @@
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.training import checkpoint_management
-from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.training import optimizer as tf_optimizer_module
 from tensorflow.python.training import saver as saver_lib
 from tensorflow.python.training import training_util
@@ -361,7 +361,7 @@
     """model_fn for keras Estimator."""
     # Raise an error when users use DistributionStrategy with native Keras
     # optimizers. Currently we only support native TensorFlow optimizers.
-    if distribute_lib.has_distribution_strategy() and \
+    if distribution_strategy_context.has_distribution_strategy() and \
         not isinstance(keras_model.optimizer,
                        (tf_optimizer_module.Optimizer, optimizers.TFOptimizer)):
       raise ValueError('Only TensorFlow native optimizers are supported with '
@@ -373,7 +373,7 @@
     # We need to make sure that the output names of the last layer in the model
     # is the same for each of the cloned models. This is required for mirrored
     # strategy when we call regroup.
-    if distribute_lib.has_distribution_strategy():
+    if distribution_strategy_context.has_distribution_strategy():
       for name in model.output_names:
         name = re.compile(r'_\d$').sub('', name)
         model_output_names.append(name)
@@ -396,7 +396,7 @@
       loss = model.total_loss
 
       if model.metrics:
-        # TODO(fchollet): support stateful metrics
+        # TODO(psv/fchollet): support stateful metrics
         eval_metric_ops = {}
         # When each metric maps to an output
         if isinstance(model.metrics, dict):
@@ -487,8 +487,9 @@
                        config=None):
   """Constructs an `Estimator` instance from given keras model.
 
-  For usage example, please see
-  @{$guide/estimators$creating_estimators_from_keras_models}.
+  For usage example, please see:
+  [Creating estimators from Keras
+  Models](https://tensorflow.org/guide/estimators#model_to_estimator).
 
   Args:
     keras_model: A compiled Keras model object. This argument is mutually
diff --git a/tensorflow/python/estimator/training.py b/tensorflow/python/estimator/training.py
index a01b230..5c04387 100644
--- a/tensorflow/python/estimator/training.py
+++ b/tensorflow/python/estimator/training.py
@@ -129,8 +129,8 @@
 
     Args:
       input_fn: A function that provides input data for training as minibatches.
-        See @{$premade_estimators#create_input_functions} for more
-        information. The function should construct and return one of
+        See [Premade Estimators](https://tensorflow.org/guide/premade_estimators#create_input_functions)
+        for more information. The function should construct and return one of
         the following:
           * A 'tf.data.Dataset' object: Outputs of `Dataset` object must be a
             tuple (features, labels) with same constraints as below.
@@ -193,8 +193,8 @@
 
     Args:
       input_fn: A function that constructs the input data for evaluation.
-        See @{$premade_estimators#create_input_functions} for more
-        information. The function should construct and return one of
+        See [Premade Estimators](https://tensorflow.org/api_guides/premade_estimators#create_input_functions)
+        for more information. The function should construct and return one of
         the following:
           * A 'tf.data.Dataset' object: Outputs of `Dataset` object must be a
             tuple (features, labels) with same constraints as below.
@@ -323,6 +323,10 @@
 
   tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
   ```
+  Note that in current implementation `estimator.evaluate` will be called
+  multiple times. This means that evaluation graph (including eval_input_fn)
+  will be re-created for each `evaluate` call. `estimator.train` will be called
+  only once.
 
   Example of distributed training:
 
diff --git a/tensorflow/python/feature_column/BUILD b/tensorflow/python/feature_column/BUILD
index 8070703..1017d4b 100644
--- a/tensorflow/python/feature_column/BUILD
+++ b/tensorflow/python/feature_column/BUILD
@@ -122,7 +122,6 @@
         "//tensorflow/python:variables",
         "//tensorflow/python/eager:backprop",
         "//tensorflow/python/eager:context",
-        "//tensorflow/python/estimator:numpy_io",
     ],
 )
 
diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py
index d091d2f..2246d2f 100644
--- a/tensorflow/python/feature_column/feature_column.py
+++ b/tensorflow/python/feature_column/feature_column.py
@@ -16,7 +16,7 @@
 
 FeatureColumns provide a high level abstraction for ingesting and representing
 features. FeatureColumns are also the primary way of encoding features for
-canned @{tf.estimator.Estimator}s.
+canned `tf.estimator.Estimator`s.
 
 When using FeatureColumns with `Estimators`, the type of feature column you
 should choose depends on (1) the feature type and (2) the model type.
@@ -1936,7 +1936,7 @@
 
     It is used for get_parsing_spec for `tf.parse_example`. Returned spec is a
     dict from keys ('string') to `VarLenFeature`, `FixedLenFeature`, and other
-    supported objects. Please check documentation of @{tf.parse_example} for all
+    supported objects. Please check documentation of `tf.parse_example` for all
     supported spec objects.
 
     Let's say a Feature column depends on raw feature ('raw') and another
@@ -1995,7 +1995,7 @@
       weight_collections: List of graph collections to which Variables (if any
         will be created) are added.
       trainable: If `True` also add variables to the graph collection
-        `GraphKeys.TRAINABLE_VARIABLES` (see @{tf.Variable}).
+        `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
 
     Returns:
       `Tensor` of shape [batch_size] + `_variable_shape`.
@@ -2062,7 +2062,7 @@
   WARNING: Do not subclass this layer unless you know what you are doing:
   the API is subject to future changes.
 
-  A categorical feature typically handled with a @{tf.SparseTensor} of IDs.
+  A categorical feature typically handled with a `tf.SparseTensor` of IDs.
   """
   __metaclass__ = abc.ABCMeta
 
@@ -2097,7 +2097,7 @@
       weight_collections: List of graph collections to which variables (if any
         will be created) are added.
       trainable: If `True` also add variables to the graph collection
-        `GraphKeys.TRAINABLE_VARIABLES` (see @{tf.get_variable}).
+        `GraphKeys.TRAINABLE_VARIABLES` (see `tf.get_variable`).
     """
     pass
 
diff --git a/tensorflow/python/feature_column/feature_column_test.py b/tensorflow/python/feature_column/feature_column_test.py
index 5bb47bf..6be930b 100644
--- a/tensorflow/python/feature_column/feature_column_test.py
+++ b/tensorflow/python/feature_column/feature_column_test.py
@@ -30,7 +30,6 @@
 from tensorflow.python.client import session
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
-from tensorflow.python.estimator.inputs import numpy_io
 from tensorflow.python.feature_column import feature_column_lib as fc
 from tensorflow.python.feature_column.feature_column import _CategoricalColumn
 from tensorflow.python.feature_column.feature_column import _DenseColumn
@@ -52,8 +51,6 @@
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import test
-from tensorflow.python.training import coordinator
-from tensorflow.python.training import queue_runner_impl
 
 
 def _initialized_session(config=None):
@@ -1803,39 +1800,6 @@
                 features['price2']: [[1.], [5.]],
             })
 
-  def test_with_numpy_input_fn(self):
-    price = fc.numeric_column('price')
-    price_buckets = fc.bucketized_column(price, boundaries=[0., 10., 100.,])
-    body_style = fc.categorical_column_with_vocabulary_list(
-        'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
-
-    input_fn = numpy_io.numpy_input_fn(
-        x={
-            'price': np.array([-1., 2., 13., 104.]),
-            'body-style': np.array(['sedan', 'hardtop', 'wagon', 'sedan']),
-        },
-        batch_size=2,
-        shuffle=False)
-    features = input_fn()
-    net = fc.linear_model(features, [price_buckets, body_style])
-    # self.assertEqual(1 + 3 + 5, net.shape[1])
-    with _initialized_session() as sess:
-      coord = coordinator.Coordinator()
-      threads = queue_runner_impl.start_queue_runners(sess, coord=coord)
-
-      bias = get_linear_model_bias()
-      price_buckets_var = get_linear_model_column_var(price_buckets)
-      body_style_var = get_linear_model_column_var(body_style)
-
-      sess.run(price_buckets_var.assign([[10.], [100.], [1000.], [10000.]]))
-      sess.run(body_style_var.assign([[-10.], [-100.], [-1000.]]))
-      sess.run(bias.assign([5.]))
-
-      self.assertAllClose([[10 - 1000 + 5.], [100 - 10 + 5.]], sess.run(net))
-
-      coord.request_stop()
-      coord.join(threads)
-
   def test_with_1d_sparse_tensor(self):
     price = fc.numeric_column('price')
     price_buckets = fc.bucketized_column(price, boundaries=[0., 10., 100.,])
@@ -2458,45 +2422,6 @@
                 features['price2']: [[1.], [5.]],
             })
 
-  def test_with_numpy_input_fn(self):
-    price = fc.numeric_column('price')
-    price_buckets = fc.bucketized_column(
-        price, boundaries=[
-            0.,
-            10.,
-            100.,
-        ])
-    body_style = fc.categorical_column_with_vocabulary_list(
-        'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
-
-    input_fn = numpy_io.numpy_input_fn(
-        x={
-            'price': np.array([-1., 2., 13., 104.]),
-            'body-style': np.array(['sedan', 'hardtop', 'wagon', 'sedan']),
-        },
-        batch_size=2,
-        shuffle=False)
-    features = input_fn()
-    net = get_keras_linear_model_predictions(features,
-                                             [price_buckets, body_style])
-    # self.assertEqual(1 + 3 + 5, net.shape[1])
-    with _initialized_session() as sess:
-      coord = coordinator.Coordinator()
-      threads = queue_runner_impl.start_queue_runners(sess, coord=coord)
-
-      bias = get_linear_model_bias()
-      price_buckets_var = get_linear_model_column_var(price_buckets)
-      body_style_var = get_linear_model_column_var(body_style)
-
-      sess.run(price_buckets_var.assign([[10.], [100.], [1000.], [10000.]]))
-      sess.run(body_style_var.assign([[-10.], [-100.], [-1000.]]))
-      sess.run(bias.assign([5.]))
-
-      self.assertAllClose([[10 - 1000 + 5.], [100 - 10 + 5.]], sess.run(net))
-
-      coord.request_stop()
-      coord.join(threads)
-
   def test_with_1d_sparse_tensor(self):
     price = fc.numeric_column('price')
     price_buckets = fc.bucketized_column(
@@ -3043,51 +2968,6 @@
           ['input_layer/aaa_bbb_shared_embedding/embedding_weights:0'],
           [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
 
-  def test_with_numpy_input_fn(self):
-    embedding_values = (
-        (1., 2., 3., 4., 5.),  # id 0
-        (6., 7., 8., 9., 10.),  # id 1
-        (11., 12., 13., 14., 15.)  # id 2
-    )
-    def _initializer(shape, dtype, partition_info):
-      del shape, dtype, partition_info
-      return embedding_values
-
-    # price has 1 dimension in input_layer
-    price = fc.numeric_column('price')
-    body_style = fc.categorical_column_with_vocabulary_list(
-        'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
-    # one_hot_body_style has 3 dims in input_layer.
-    one_hot_body_style = fc.indicator_column(body_style)
-    # embedded_body_style has 5 dims in input_layer.
-    embedded_body_style = fc.embedding_column(body_style, dimension=5,
-                                              initializer=_initializer)
-
-    input_fn = numpy_io.numpy_input_fn(
-        x={
-            'price': np.array([11., 12., 13., 14.]),
-            'body-style': np.array(['sedan', 'hardtop', 'wagon', 'sedan']),
-        },
-        batch_size=2,
-        shuffle=False)
-    features = input_fn()
-    net = fc.input_layer(features,
-                         [price, one_hot_body_style, embedded_body_style])
-    self.assertEqual(1 + 3 + 5, net.shape[1])
-    with _initialized_session() as sess:
-      coord = coordinator.Coordinator()
-      threads = queue_runner_impl.start_queue_runners(sess, coord=coord)
-
-      # Each row is formed by concatenating `embedded_body_style`,
-      # `one_hot_body_style`, and `price` in order.
-      self.assertAllEqual(
-          [[11., 12., 13., 14., 15., 0., 0., 1., 11.],
-           [1., 2., 3., 4., 5., 1., 0., 0., 12]],
-          sess.run(net))
-
-      coord.request_stop()
-      coord.join(threads)
-
   def test_with_1d_sparse_tensor(self):
     embedding_values = (
         (1., 2., 3., 4., 5.),  # id 0
diff --git a/tensorflow/python/feature_column/feature_column_v2.py b/tensorflow/python/feature_column/feature_column_v2.py
index b4dd23f..b6bf516 100644
--- a/tensorflow/python/feature_column/feature_column_v2.py
+++ b/tensorflow/python/feature_column/feature_column_v2.py
@@ -16,7 +16,7 @@
 
 FeatureColumns provide a high level abstraction for ingesting and representing
 features. FeatureColumns are also the primary way of encoding features for
-canned @{tf.estimator.Estimator}s.
+canned `tf.estimator.Estimator`s.
 
 When using FeatureColumns with `Estimators`, the type of feature column you
 should choose depends on (1) the feature type and (2) the model type.
@@ -1904,7 +1904,7 @@
 
     It is used for get_parsing_spec for `tf.parse_example`. Returned spec is a
     dict from keys ('string') to `VarLenFeature`, `FixedLenFeature`, and other
-    supported objects. Please check documentation of @{tf.parse_example} for all
+    supported objects. Please check documentation of `tf.parse_example` for all
     supported spec objects.
 
     Let's say a Feature column depends on raw feature ('raw') and another
@@ -2025,7 +2025,7 @@
 class CategoricalColumn(FeatureColumn):
   """Represents a categorical feature.
 
-  A categorical feature typically handled with a @{tf.SparseTensor} of IDs.
+  A categorical feature typically handled with a `tf.SparseTensor` of IDs.
   """
   __metaclass__ = abc.ABCMeta
 
diff --git a/tensorflow/python/framework/constant_op.py b/tensorflow/python/framework/constant_op.py
index b3eb57d..eca34ac 100644
--- a/tensorflow/python/framework/constant_op.py
+++ b/tensorflow/python/framework/constant_op.py
@@ -14,7 +14,7 @@
 # ==============================================================================
 """Operations that generate constants.
 
-See the @{$python/constant_op$constants guide}.
+See the [constants guide](https://tensorflow.org/api_guides/python/constant_op).
 """
 
 # Must be separate from array_ops to avoid a cyclic dependency.
@@ -145,6 +145,17 @@
                                                [-1. -1. -1.]]
   ```
 
+  `tf.constant` differs from `tf.fill` in a few ways:
+
+  *   `tf.constant` supports arbitrary constants, not just uniform scalar
+      Tensors like `tf.fill`.
+  *   `tf.constant` creates a `Const` node in the computation graph with the
+      exact value at graph construction time. On the other hand, `tf.fill`
+      creates an Op in the graph that is expanded at runtime.
+  *   Because `tf.constant` only embeds constant values in the graph, it does
+      not support dynamic shapes based on other runtime Tensors, whereas
+      `tf.fill` does.
+
   Args:
     value:          A constant value (or list) of output type `dtype`.
 
diff --git a/tensorflow/python/framework/errors_impl.py b/tensorflow/python/framework/errors_impl.py
index 84106c3..9f973de 100644
--- a/tensorflow/python/framework/errors_impl.py
+++ b/tensorflow/python/framework/errors_impl.py
@@ -63,9 +63,9 @@
 
     *N.B.* If the failed op was synthesized at runtime, e.g. a `Send`
     or `Recv` op, there will be no corresponding
-    @{tf.Operation}
+    `tf.Operation`
     object.  In that case, this will return `None`, and you should
-    instead use the @{tf.OpError.node_def} to
+    instead use the `tf.OpError.node_def` to
     discover information about the op.
 
     Returns:
@@ -181,10 +181,10 @@
   """Raised when an operation or step is cancelled.
 
   For example, a long-running operation (e.g.
-  @{tf.QueueBase.enqueue} may be
+  `tf.QueueBase.enqueue`) may be
   cancelled by running another operation (e.g.
-  @{tf.QueueBase.close},
-  or by @{tf.Session.close}.
+  `tf.QueueBase.close`),
+  or by `tf.Session.close`.
   A step that is running such a long-running operation will fail by raising
   `CancelledError`.
 
@@ -221,9 +221,9 @@
 
   This may occur, for example, if an operation is receives an input
   tensor that has an invalid value or shape. For example, the
-  @{tf.matmul} op will raise this
+  `tf.matmul` op will raise this
   error if it receives an input that is not a matrix, and the
-  @{tf.reshape} op will raise
+  `tf.reshape` op will raise
   this error if the new shape does not match the number of elements in the input
   tensor.
 
@@ -256,7 +256,7 @@
   """Raised when a requested entity (e.g., a file or directory) was not found.
 
   For example, running the
-  @{tf.WholeFileReader.read}
+  `tf.WholeFileReader.read`
   operation could raise `NotFoundError` if it receives the name of a file that
   does not exist.
 
@@ -273,7 +273,7 @@
   """Raised when an entity that we attempted to create already exists.
 
   For example, running an operation that saves a file
-  (e.g. @{tf.train.Saver.save})
+  (e.g. `tf.train.Saver.save`)
   could potentially raise this exception if an explicit filename for an
   existing file was passed.
 
@@ -291,7 +291,7 @@
   """Raised when the caller does not have permission to run an operation.
 
   For example, running the
-  @{tf.WholeFileReader.read}
+  `tf.WholeFileReader.read`
   operation could raise `PermissionDeniedError` if it receives the name of a
   file for which the user does not have the read file permission.
 
@@ -340,7 +340,7 @@
   """Operation was rejected because the system is not in a state to execute it.
 
   This exception is most commonly raised when running an operation
-  that reads a @{tf.Variable}
+  that reads a `tf.Variable`
   before it has been initialized.
 
   @@__init__
@@ -357,9 +357,9 @@
   """The operation was aborted, typically due to a concurrent action.
 
   For example, running a
-  @{tf.QueueBase.enqueue}
+  `tf.QueueBase.enqueue`
   operation may raise `AbortedError` if a
-  @{tf.QueueBase.close} operation
+  `tf.QueueBase.close` operation
   previously ran.
 
   @@__init__
@@ -375,9 +375,9 @@
   """Raised when an operation iterates past the valid input range.
 
   This exception is raised in "end-of-file" conditions, such as when a
-  @{tf.QueueBase.dequeue}
+  `tf.QueueBase.dequeue`
   operation is blocked on an empty queue, and a
-  @{tf.QueueBase.close}
+  `tf.QueueBase.close`
   operation executes.
 
   @@__init__
@@ -395,7 +395,7 @@
 
   Some operations may raise this error when passed otherwise-valid
   arguments that it does not currently support. For example, running
-  the @{tf.nn.max_pool} operation
+  the `tf.nn.max_pool` operation
   would raise this error if pooling was requested on the batch dimension,
   because this is not yet supported.
 
@@ -443,7 +443,7 @@
   """Raised when unrecoverable data loss or corruption is encountered.
 
   For example, this may be raised by running a
-  @{tf.WholeFileReader.read}
+  `tf.WholeFileReader.read`
   operation, if the file is truncated while it is being read.
 
   @@__init__
@@ -475,8 +475,8 @@
 
 c_api.PyExceptionRegistry_Init(_CODE_TO_EXCEPTION_CLASS)
 
-_EXCEPTION_CLASS_TO_CODE = dict((
-    (class_, code) for (code, class_) in _CODE_TO_EXCEPTION_CLASS.items()))
+_EXCEPTION_CLASS_TO_CODE = {
+    class_: code for code, class_ in _CODE_TO_EXCEPTION_CLASS.items()}
 
 
 @tf_export("errors.exception_type_from_error_code")
diff --git a/tensorflow/python/framework/function.py b/tensorflow/python/framework/function.py
index 12bf03c..f47c0d8 100644
--- a/tensorflow/python/framework/function.py
+++ b/tensorflow/python/framework/function.py
@@ -665,7 +665,7 @@
   def container(self, container_name):
     """Returns a context manager that specifies the resource container to use.
 
-    Overridden from @{tf.Graph} to update both the init_scope container
+    Overridden from `tf.Graph` to update both the init_scope container
     and the present inner container. This is necessary to make sure setting
     containers applies correctly both to created variables and to stateful
     ops.
diff --git a/tensorflow/python/framework/importer.py b/tensorflow/python/framework/importer.py
index 687bfeb..e48e67c 100644
--- a/tensorflow/python/framework/importer.py
+++ b/tensorflow/python/framework/importer.py
@@ -344,9 +344,9 @@
   This function provides a way to import a serialized TensorFlow
   [`GraphDef`](https://www.tensorflow.org/code/tensorflow/core/framework/graph.proto)
   protocol buffer, and extract individual objects in the `GraphDef` as
-  @{tf.Tensor} and @{tf.Operation} objects. Once extracted,
+  `tf.Tensor` and `tf.Operation` objects. Once extracted,
   these objects are placed into the current default `Graph`. See
-  @{tf.Graph.as_graph_def} for a way to create a `GraphDef`
+  `tf.Graph.as_graph_def` for a way to create a `GraphDef`
   proto.
 
   Args:
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index ed0bf1a..21eb306 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -229,7 +229,7 @@
   A `Tensor` is a symbolic handle to one of the outputs of an
   `Operation`. It does not hold the values of that operation's output,
   but instead provides a means of computing those values in a
-  TensorFlow @{tf.Session}.
+  TensorFlow `tf.Session`.
 
   This class has two primary purposes:
 
@@ -240,7 +240,7 @@
 
   2. After the graph has been launched in a session, the value of the
      `Tensor` can be computed by passing it to
-     @{tf.Session.run}.
+     `tf.Session.run`.
      `t.eval()` is a shortcut for calling
      `tf.get_default_session().run(t)`.
 
@@ -365,7 +365,7 @@
 
     The shape is computed using shape inference functions that are
     registered in the Op for each `Operation`.  See
-    @{tf.TensorShape}
+    `tf.TensorShape`
     for more details of what a shape represents.
 
     The inferred shape of a tensor is used to provide shape
@@ -695,7 +695,7 @@
 
     Args:
       feed_dict: A dictionary that maps `Tensor` objects to feed values.
-        See @{tf.Session.run} for a
+        See `tf.Session.run` for a
         description of the valid feed values.
       session: (Optional.) The `Session` to be used to evaluate this tensor. If
         none, the default session will be used.
@@ -1455,10 +1455,10 @@
 
   The `IndexedSlices` class is used principally in the definition of
   gradients for operations that have sparse gradients
-  (e.g. @{tf.gather}).
+  (e.g. `tf.gather`).
 
   Contrast this representation with
-  @{tf.SparseTensor},
+  `tf.SparseTensor`,
   which uses multi-dimensional indices and scalar values.
   """
 
@@ -1619,8 +1619,8 @@
   more `Tensor` objects as input, and produces zero or more `Tensor`
   objects as output. Objects of type `Operation` are created by
   calling a Python op constructor (such as
-  @{tf.matmul})
-  or @{tf.Graph.create_op}.
+  `tf.matmul`)
+  or `tf.Graph.create_op`.
 
   For example `c = tf.matmul(a, b)` creates an `Operation` of type
   "MatMul" that takes tensors `a` and `b` as input, and produces `c`
@@ -1628,7 +1628,7 @@
 
   After the graph has been launched in a session, an `Operation` can
   be executed by passing it to
-  @{tf.Session.run}.
+  `tf.Session.run`.
   `op.run()` is a shortcut for calling `tf.get_default_session().run(op)`.
   """
 
@@ -2338,7 +2338,7 @@
 
     Args:
       feed_dict: A dictionary that maps `Tensor` objects to feed values.
-        See @{tf.Session.run}
+        See `tf.Session.run`
         for a description of the valid feed values.
       session: (Optional.) The `Session` to be used to run to this operation. If
         none, the default session will be used.
@@ -2727,13 +2727,13 @@
   """A TensorFlow computation, represented as a dataflow graph.
 
   A `Graph` contains a set of
-  @{tf.Operation} objects,
+  `tf.Operation` objects,
   which represent units of computation; and
-  @{tf.Tensor} objects, which represent
+  `tf.Tensor` objects, which represent
   the units of data that flow between operations.
 
   A default `Graph` is always registered, and accessible by calling
-  @{tf.get_default_graph}.
+  `tf.get_default_graph`.
   To add an operation to the default graph, simply call one of the functions
   that defines a new `Operation`:
 
@@ -2743,7 +2743,7 @@
   ```
 
   Another typical usage involves the
-  @{tf.Graph.as_default}
+  `tf.Graph.as_default`
   context manager, which overrides the current default graph for the
   lifetime of the context:
 
@@ -2764,7 +2764,7 @@
   that are identified by name. For convenience when building a large
   graph, collections can store groups of related objects: for
   example, the `tf.Variable` uses a collection (named
-  @{tf.GraphKeys.GLOBAL_VARIABLES}) for
+  `tf.GraphKeys.GLOBAL_VARIABLES`) for
   all variables that are created during the construction of a graph. The caller
   may define additional collections by specifying a new name.
   """
@@ -2941,7 +2941,7 @@
     """Returns a version number that increases as ops are added to the graph.
 
     Note that this is unrelated to the
-    @{tf.Graph.graph_def_versions}.
+    `tf.Graph.graph_def_versions`.
 
     Returns:
        An integer version that increases as ops are added to the graph.
@@ -2991,7 +2991,7 @@
     After calling `g.finalize()`, no new operations can be added to
     `g`.  This method is used to ensure that no operations are added
     to a graph when it is shared between multiple threads, for example
-    when using a @{tf.train.QueueRunner}.
+    when using a `tf.train.QueueRunner`.
     """
     self._finalized = True
 
@@ -3040,7 +3040,7 @@
     """Returns a serialized `GraphDef` representation of this graph.
 
     The serialized `GraphDef` can be imported into another `Graph`
-    (using @{tf.import_graph_def}) or used with the
+    (using `tf.import_graph_def`) or used with the
     [C++ Session API](../../../../api_docs/cc/index.md).
 
     This method is thread-safe.
@@ -3086,7 +3086,7 @@
     """Returns a serialized `GraphDef` representation of this graph.
 
     The serialized `GraphDef` can be imported into another `Graph`
-    (using @{tf.import_graph_def}) or used with the
+    (using `tf.import_graph_def`) or used with the
     [C++ Session API](../../api_docs/cc/index.md).
 
     This method is thread-safe.
@@ -4860,6 +4860,18 @@
     else:
       self._graph_control_dependencies_stack = control_dependencies
 
+  @property
+  def _distribution_strategy_stack(self):
+    """A stack to maintain distribution strategy context for each thread."""
+    if not hasattr(self._thread_local, "_distribution_strategy_stack"):
+      self._thread_local._distribution_strategy_stack = []  # pylint: disable=protected-access
+    return self._thread_local._distribution_strategy_stack  # pylint: disable=protected-access
+
+  @_distribution_strategy_stack.setter
+  def _distribution_strategy_stack(self, _distribution_strategy_stack):
+    self._thread_local._distribution_strategy_stack = (  # pylint: disable=protected-access
+        _distribution_strategy_stack)
+
   def _mutation_lock(self):
     """Returns a lock to guard code that creates & mutates ops.
 
@@ -4884,7 +4896,7 @@
   """Wrapper for `Graph.device()` using the default graph.
 
   See
-  @{tf.Graph.device}
+  `tf.Graph.device`
   for more details.
 
   Args:
@@ -4950,7 +4962,7 @@
 def control_dependencies(control_inputs):
   """Wrapper for `Graph.control_dependencies()` using the default graph.
 
-  See @{tf.Graph.control_dependencies}
+  See `tf.Graph.control_dependencies`
   for more details.
 
   When eager execution is enabled, any callable object in the `control_inputs`
@@ -5316,7 +5328,7 @@
 
   Eager execution provides an imperative interface to TensorFlow. With eager
   execution enabled, TensorFlow functions execute operations immediately (as
-  opposed to adding to a graph to be executed later in a @{tf.Session}) and
+  opposed to adding to a graph to be executed later in a `tf.Session`) and
   return concrete values (as opposed to symbolic references to a node in a
   computational graph).
 
@@ -5336,9 +5348,9 @@
   both with and without eager execution).
 
   Args:
-    config: (Optional.) A @{tf.ConfigProto} to use to configure the environment
-      in which operations are executed. Note that @{tf.ConfigProto} is also
-      used to configure graph execution (via @{tf.Session}) and many options
+    config: (Optional.) A `tf.ConfigProto` to use to configure the environment
+      in which operations are executed. Note that `tf.ConfigProto` is also
+      used to configure graph execution (via `tf.Session`) and many options
       within `tf.ConfigProto` are not implemented (or are irrelevant) when
       eager execution is enabled.
     device_policy: (Optional.) Policy controlling how operations requiring
@@ -5638,7 +5650,7 @@
 
   * `GLOBAL_VARIABLES`: the default collection of `Variable` objects, shared
     across distributed environment (model variables are subset of these). See
-    @{tf.global_variables}
+    `tf.global_variables`
     for more details.
     Commonly, all `TRAINABLE_VARIABLES` variables will be in `MODEL_VARIABLES`,
     and all `MODEL_VARIABLES` variables will be in `GLOBAL_VARIABLES`.
@@ -5650,19 +5662,19 @@
     `tf.contrib.framework.model_variable` to add to this collection.
   * `TRAINABLE_VARIABLES`: the subset of `Variable` objects that will
     be trained by an optimizer. See
-    @{tf.trainable_variables}
+    `tf.trainable_variables`
     for more details.
   * `SUMMARIES`: the summary `Tensor` objects that have been created in the
     graph. See
-    @{tf.summary.merge_all}
+    `tf.summary.merge_all`
     for more details.
   * `QUEUE_RUNNERS`: the `QueueRunner` objects that are used to
     produce input for a computation. See
-    @{tf.train.start_queue_runners}
+    `tf.train.start_queue_runners`
     for more details.
   * `MOVING_AVERAGE_VARIABLES`: the subset of `Variable` objects that will also
     keep moving averages.  See
-    @{tf.moving_average_variables}
+    `tf.moving_average_variables`
     for more details.
   * `REGULARIZATION_LOSSES`: regularization losses collected during graph
     construction.
@@ -5772,11 +5784,43 @@
     return cls.GLOBAL_VARIABLES
 
 
+def dismantle_graph(graph):
+  """Cleans up reference cycles from a `Graph`.
+
+  Helpful for making sure the garbage collector doesn't need to run after a
+  temporary `Graph` is no longer needed.
+
+  Args:
+    graph: A `Graph` object to destroy. Neither it nor any of its ops are usable
+      after this function runs.
+  """
+  # pylint: disable=protected-access
+  # OrderedDict, constructed on Graph creation, makes a simple reference loop
+  # and hides it in an __attribute in some Python versions. We don't need to
+  # throw an error if we can't find it, but if we do find it we can break the
+  # loop to avoid creating work for the garbage collector.
+  graph_operations = graph.get_operations()
+  problematic_cycle = graph._functions.__dict__.get("_OrderedDict__root", None)
+  # pylint: enable=protected-access
+  if problematic_cycle:
+    try:
+      del problematic_cycle[0][:]
+    except TypeError:
+      # This is probably not one of the problematic Python versions. Continue
+      # with the rest of our cleanup.
+      pass
+  # Now clean up Operation<->Graph reference cycles by clearing all of the
+  # attributes for the Graph and its ops.
+  for op in graph_operations:
+    op.__dict__ = {}
+  graph.__dict__ = {}
+
+
 @tf_export("add_to_collection")
 def add_to_collection(name, value):
   """Wrapper for `Graph.add_to_collection()` using the default graph.
 
-  See @{tf.Graph.add_to_collection}
+  See `tf.Graph.add_to_collection`
   for more details.
 
   Args:
@@ -5795,7 +5839,7 @@
 def add_to_collections(names, value):
   """Wrapper for `Graph.add_to_collections()` using the default graph.
 
-  See @{tf.Graph.add_to_collections}
+  See `tf.Graph.add_to_collections`
   for more details.
 
   Args:
@@ -5815,7 +5859,7 @@
 def get_collection_ref(key):
   """Wrapper for `Graph.get_collection_ref()` using the default graph.
 
-  See @{tf.Graph.get_collection_ref}
+  See `tf.Graph.get_collection_ref`
   for more details.
 
   Args:
@@ -5839,7 +5883,7 @@
 def get_collection(key, scope=None):
   """Wrapper for `Graph.get_collection()` using the default graph.
 
-  See @{tf.Graph.get_collection}
+  See `tf.Graph.get_collection`
   for more details.
 
   Args:
@@ -5882,7 +5926,7 @@
   This context manager validates that the given `values` are from the
   same graph, makes that graph the default graph, and pushes a
   name scope in that graph (see
-  @{tf.Graph.name_scope}
+  `tf.Graph.name_scope`
   for more details on that).
 
   For example, to define a new Python op called `my_op`:
diff --git a/tensorflow/python/framework/random_seed.py b/tensorflow/python/framework/random_seed.py
index b724432..2f95048 100644
--- a/tensorflow/python/framework/random_seed.py
+++ b/tensorflow/python/framework/random_seed.py
@@ -43,7 +43,7 @@
   graph, or for only specific operations.
 
   For details on how the graph-level seed interacts with op seeds, see
-  @{tf.set_random_seed}.
+  `tf.set_random_seed`.
 
   Args:
     op_seed: integer.
diff --git a/tensorflow/python/framework/sparse_tensor.py b/tensorflow/python/framework/sparse_tensor.py
index 6a5c646..a455811 100644
--- a/tensorflow/python/framework/sparse_tensor.py
+++ b/tensorflow/python/framework/sparse_tensor.py
@@ -205,7 +205,7 @@
 
     Args:
       feed_dict: A dictionary that maps `Tensor` objects to feed values.
-        See @{tf.Session.run} for a
+        See `tf.Session.run` for a
         description of the valid feed values.
       session: (Optional.) The `Session` to be used to evaluate this sparse
         tensor. If none, the default session will be used.
diff --git a/tensorflow/python/framework/tensor_shape.py b/tensorflow/python/framework/tensor_shape.py
index c9be3d5..11b681d 100644
--- a/tensorflow/python/framework/tensor_shape.py
+++ b/tensorflow/python/framework/tensor_shape.py
@@ -498,9 +498,10 @@
 
   If a tensor is produced by an operation of type `"Foo"`, its shape
   may be inferred if there is a registered shape function for
-  `"Foo"`. See @{$adding_an_op#shape-functions-in-c$`Shape functions in C++`}
+  `"Foo"`. See [Shape
+  functions](https://tensorflow.org/extend/adding_an_op#shape_functions_in_c)
   for details of shape functions and how to register them. Alternatively,
-  the shape may be set explicitly using @{tf.Tensor.set_shape}.
+  the shape may be set explicitly using `tf.Tensor.set_shape`.
   """
 
   def __init__(self, dims):
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index 764e8bf..d2d1822 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -659,10 +659,10 @@
   """Execute the decorated test with and without enabling eager execution.
 
   This function returns a decorator intended to be applied to test methods in
-  a @{tf.test.TestCase} class. Doing so will cause the contents of the test
+  a `tf.test.TestCase` class. Doing so will cause the contents of the test
   method to be executed twice - once normally, and once with eager execution
   enabled. This allows unittests to confirm the equivalence between eager
-  and graph execution (see @{tf.enable_eager_execution}).
+  and graph execution (see `tf.enable_eager_execution`).
 
   For example, consider the following unittest:
 
@@ -736,15 +736,19 @@
         run_eagerly = assert_no_new_tensors(
             assert_no_garbage_created(run_eagerly))
 
-      with context.eager_mode():
+      if reset_test:
+        # This decorator runs the wrapped test twice.
+        # Reset the test environment between runs.
+        self.tearDown()
+        self._tempdir = None
+      # Create a new graph for the eagerly executed version of this test for
+      # better isolation.
+      graph_for_eager_test = ops.Graph()
+      with graph_for_eager_test.as_default(), context.eager_mode():
         if reset_test:
-          # This decorator runs the wrapped test twice.
-          # Reset the test environment between runs.
-          self.tearDown()
-          self._tempdir = None
           self.setUp()
-
         run_eagerly(self, **kwargs)
+      ops.dismantle_graph(graph_for_eager_test)
 
     return decorated
 
diff --git a/tensorflow/python/framework/test_util_test.py b/tensorflow/python/framework/test_util_test.py
index f983cbe..3a34dd9 100644
--- a/tensorflow/python/framework/test_util_test.py
+++ b/tensorflow/python/framework/test_util_test.py
@@ -40,6 +40,7 @@
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
 
@@ -666,6 +667,22 @@
     self.assertEqual(modes[2:], ["setup_eager", "run_eager"])
 
 
+# Its own test case to reproduce variable sharing issues which only pop up when
+# setUp() is overridden and super() is not called.
+class GraphAndEagerNoVariableSharing(test_util.TensorFlowTestCase):
+
+  def setUp(self):
+    pass  # Intentionally does not call TensorFlowTestCase's super()
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_no_variable_sharing(self):
+    variable_scope.get_variable(
+        name="step_size",
+        initializer=np.array(1e-5, np.float32),
+        use_resource=True,
+        trainable=False)
+
+
 class GarbageCollectionTest(test_util.TensorFlowTestCase):
 
   def test_no_reference_cycle_decorator(self):
diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD
index 1706158..fa1ec51 100755
--- a/tensorflow/python/keras/BUILD
+++ b/tensorflow/python/keras/BUILD
@@ -25,6 +25,7 @@
         "applications/inception_resnet_v2.py",
         "applications/inception_v3.py",
         "applications/mobilenet.py",
+        "applications/mobilenet_v2.py",
         "applications/nasnet.py",
         "applications/resnet50.py",
         "applications/vgg16.py",
@@ -295,109 +296,15 @@
 )
 
 py_test(
-    name = "densenet_test",
-    size = "large",
-    srcs = ["applications/densenet_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["nomsan"],  # times out, http://b/78650237
-    deps = [
-        ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
-    ],
-)
-
-py_test(
-    name = "inception_resnet_v2_test",
-    size = "medium",
-    srcs = ["applications/inception_resnet_v2_test.py"],
+    name = "applications_test",
+    size = "enormous",
+    srcs = ["applications/applications_test.py"],
+    shard_count = 2,
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
-    ],
-)
-
-py_test(
-    name = "inception_v3_test",
-    size = "medium",
-    srcs = ["applications/inception_v3_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
-    ],
-)
-
-py_test(
-    name = "mobilenet_test",
-    size = "medium",
-    srcs = ["applications/mobilenet_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
-    ],
-)
-
-py_test(
-    name = "nasnet_test",
-    size = "large",
-    srcs = ["applications/nasnet_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["nomsan"],  # times out, http://b/78573625
-    deps = [
-        ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
-    ],
-)
-
-py_test(
-    name = "resnet50_test",
-    size = "medium",
-    srcs = ["applications/resnet50_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":keras",
-        "//tensorflow/python:client_testlib",
-    ],
-)
-
-py_test(
-    name = "vgg16_test",
-    size = "small",
-    srcs = ["applications/vgg16_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":keras",
-        "//tensorflow/python:client_testlib",
-    ],
-)
-
-py_test(
-    name = "vgg19_test",
-    size = "small",
-    srcs = ["applications/vgg19_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":keras",
-        "//tensorflow/python:client_testlib",
-    ],
-)
-
-py_test(
-    name = "xception_test",
-    size = "medium",
-    srcs = ["applications/xception_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -492,7 +399,7 @@
 
 py_test(
     name = "local_test",
-    size = "medium",
+    size = "large",
     srcs = ["layers/local_test.py"],
     srcs_version = "PY2AND3",
     deps = [
@@ -718,14 +625,15 @@
 )
 
 py_test(
-    name = "imagenet_utils_test",
+    name = "conv_utils_test",
     size = "small",
-    srcs = ["applications/imagenet_utils_test.py"],
+    srcs = ["utils/conv_utils_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -860,13 +768,14 @@
 
 py_test(
     name = "sequential_test",
-    size = "small",
+    size = "medium",
     srcs = ["engine/sequential_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
diff --git a/tensorflow/python/keras/applications/__init__.py b/tensorflow/python/keras/applications/__init__.py
index 0621352..cd9462d 100644
--- a/tensorflow/python/keras/applications/__init__.py
+++ b/tensorflow/python/keras/applications/__init__.py
@@ -13,17 +13,33 @@
 # limitations under the License.
 # ==============================================================================
 """Keras Applications are canned architectures with pre-trained weights."""
-
+# pylint: disable=g-import-not-at-top
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import keras_applications
+
+from tensorflow.python.keras import backend
+from tensorflow.python.keras import engine
+from tensorflow.python.keras import layers
+from tensorflow.python.keras import models
+from tensorflow.python.keras import utils
+
+keras_applications.set_keras_submodules(
+    backend=backend,
+    engine=engine,
+    layers=layers,
+    models=models,
+    utils=utils)
+
 from tensorflow.python.keras.applications.densenet import DenseNet121
 from tensorflow.python.keras.applications.densenet import DenseNet169
 from tensorflow.python.keras.applications.densenet import DenseNet201
 from tensorflow.python.keras.applications.inception_resnet_v2 import InceptionResNetV2
 from tensorflow.python.keras.applications.inception_v3 import InceptionV3
 from tensorflow.python.keras.applications.mobilenet import MobileNet
+# TODO(fchollet): enable MobileNetV2 in next version.
 from tensorflow.python.keras.applications.nasnet import NASNetLarge
 from tensorflow.python.keras.applications.nasnet import NASNetMobile
 from tensorflow.python.keras.applications.resnet50 import ResNet50
diff --git a/tensorflow/python/keras/applications/applications_test.py b/tensorflow/python/keras/applications/applications_test.py
new file mode 100644
index 0000000..ef3198a
--- /dev/null
+++ b/tensorflow/python/keras/applications/applications_test.py
@@ -0,0 +1,58 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Integration tests for Keras applications."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.python.keras import applications
+from tensorflow.python.platform import test
+
+
+MODEL_LIST = [
+    (applications.ResNet50, 2048),
+    (applications.VGG16, 512),
+    (applications.VGG19, 512),
+    (applications.Xception, 2048),
+    (applications.InceptionV3, 2048),
+    (applications.InceptionResNetV2, 1536),
+    (applications.MobileNet, 1024),
+    # TODO(fchollet): enable MobileNetV2 in next version.
+    (applications.DenseNet121, 1024),
+    (applications.DenseNet169, 1664),
+    (applications.DenseNet201, 1920),
+    (applications.NASNetMobile, 1056),
+    (applications.NASNetLarge, 4032),
+]
+
+
+class ApplicationsTest(test.TestCase, parameterized.TestCase):
+
+  @parameterized.parameters(*MODEL_LIST)
+  def test_classification_model(self, model_fn, _):
+    model = model_fn(classes=1000, weights=None)
+    self.assertEqual(model.output_shape[-1], 1000)
+
+  @parameterized.parameters(*MODEL_LIST)
+  def test_feature_extraction_model(self, model_fn, output_dim):
+    model = model_fn(include_top=False, weights=None)
+    self.assertEqual(model.output_shape, (None, None, None, output_dim))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/applications/densenet.py b/tensorflow/python/keras/applications/densenet.py
index 8df6d08..fbdcc66 100644
--- a/tensorflow/python/keras/applications/densenet.py
+++ b/tensorflow/python/keras/applications/densenet.py
@@ -13,342 +13,25 @@
 # limitations under the License.
 # ==============================================================================
 # pylint: disable=invalid-name
-# pylint: disable=unused-import
 """DenseNet models for Keras.
-
-# Reference paper
-
-- [Densely Connected Convolutional Networks]
-  (https://arxiv.org/abs/1608.06993) (CVPR 2017 Best Paper Award)
 """
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
-
-from tensorflow.python.keras import backend as K
-from tensorflow.python.keras.applications import imagenet_utils
-from tensorflow.python.keras.applications.imagenet_utils import _obtain_input_shape
-from tensorflow.python.keras.applications.imagenet_utils import decode_predictions
-from tensorflow.python.keras.layers import Activation
-from tensorflow.python.keras.layers import AveragePooling2D
-from tensorflow.python.keras.layers import BatchNormalization
-from tensorflow.python.keras.layers import Concatenate
-from tensorflow.python.keras.layers import Conv2D
-from tensorflow.python.keras.layers import Dense
-from tensorflow.python.keras.layers import GlobalAveragePooling2D
-from tensorflow.python.keras.layers import GlobalMaxPooling2D
-from tensorflow.python.keras.layers import Input
-from tensorflow.python.keras.layers import MaxPooling2D
-from tensorflow.python.keras.layers import ZeroPadding2D
-from tensorflow.python.keras.models import Model
-from tensorflow.python.keras.utils import layer_utils
-from tensorflow.python.keras.utils.data_utils import get_file
+from keras_applications import densenet
 from tensorflow.python.util.tf_export import tf_export
 
+DenseNet121 = densenet.DenseNet121
+DenseNet169 = densenet.DenseNet169
+DenseNet201 = densenet.DenseNet201
+decode_predictions = densenet.decode_predictions
+preprocess_input = densenet.preprocess_input
 
-DENSENET121_WEIGHT_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.8/densenet121_weights_tf_dim_ordering_tf_kernels.h5'
-DENSENET121_WEIGHT_PATH_NO_TOP = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.8/densenet121_weights_tf_dim_ordering_tf_kernels_notop.h5'
-DENSENET169_WEIGHT_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.8/densenet169_weights_tf_dim_ordering_tf_kernels.h5'
-DENSENET169_WEIGHT_PATH_NO_TOP = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.8/densenet169_weights_tf_dim_ordering_tf_kernels_notop.h5'
-DENSENET201_WEIGHT_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.8/densenet201_weights_tf_dim_ordering_tf_kernels.h5'
-DENSENET201_WEIGHT_PATH_NO_TOP = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.8/densenet201_weights_tf_dim_ordering_tf_kernels_notop.h5'
-
-
-def dense_block(x, blocks, name):
-  """A dense block.
-
-  Arguments:
-      x: input tensor.
-      blocks: integer, the number of building blocks.
-      name: string, block label.
-
-  Returns:
-      output tensor for the block.
-  """
-  for i in range(blocks):
-    x = conv_block(x, 32, name=name + '_block' + str(i + 1))
-  return x
-
-
-def transition_block(x, reduction, name):
-  """A transition block.
-
-  Arguments:
-      x: input tensor.
-      reduction: float, compression rate at transition layers.
-      name: string, block label.
-
-  Returns:
-      output tensor for the block.
-  """
-  bn_axis = 3 if K.image_data_format() == 'channels_last' else 1
-  x = BatchNormalization(axis=bn_axis, epsilon=1.001e-5, name=name + '_bn')(x)
-  x = Activation('relu', name=name + '_relu')(x)
-  x = Conv2D(
-      int(K.int_shape(x)[bn_axis] * reduction),
-      1,
-      use_bias=False,
-      name=name + '_conv')(
-          x)
-  x = AveragePooling2D(2, strides=2, name=name + '_pool')(x)
-  return x
-
-
-def conv_block(x, growth_rate, name):
-  """A building block for a dense block.
-
-  Arguments:
-      x: input tensor.
-      growth_rate: float, growth rate at dense layers.
-      name: string, block label.
-
-  Returns:
-      output tensor for the block.
-  """
-  bn_axis = 3 if K.image_data_format() == 'channels_last' else 1
-  x1 = BatchNormalization(
-      axis=bn_axis, epsilon=1.001e-5, name=name + '_0_bn')(
-          x)
-  x1 = Activation('relu', name=name + '_0_relu')(x1)
-  x1 = Conv2D(4 * growth_rate, 1, use_bias=False, name=name + '_1_conv')(x1)
-  x1 = BatchNormalization(
-      axis=bn_axis, epsilon=1.001e-5, name=name + '_1_bn')(
-          x1)
-  x1 = Activation('relu', name=name + '_1_relu')(x1)
-  x1 = Conv2D(
-      growth_rate, 3, padding='same', use_bias=False, name=name + '_2_conv')(
-          x1)
-  x = Concatenate(axis=bn_axis, name=name + '_concat')([x, x1])
-  return x
-
-
-def DenseNet(blocks,
-             include_top=True,
-             weights='imagenet',
-             input_tensor=None,
-             input_shape=None,
-             pooling=None,
-             classes=1000):
-  """Instantiates the DenseNet architecture.
-
-  Optionally loads weights pre-trained
-  on ImageNet. Note that when using TensorFlow,
-  for best performance you should set
-  `image_data_format='channels_last'` in your Keras config
-  at ~/.keras/keras.json.
-
-  The model and the weights are compatible with
-  TensorFlow, Theano, and CNTK. The data format
-  convention used by the model is the one
-  specified in your Keras config file.
-
-  Arguments:
-      blocks: numbers of building blocks for the four dense layers.
-      include_top: whether to include the fully-connected
-          layer at the top of the network.
-      weights: one of `None` (random initialization),
-            'imagenet' (pre-training on ImageNet),
-            or the path to the weights file to be loaded.
-      input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
-          to use as image input for the model.
-      input_shape: optional shape tuple, only to be specified
-          if `include_top` is False (otherwise the input shape
-          has to be `(224, 224, 3)` (with `channels_last` data format)
-          or `(3, 224, 224)` (with `channels_first` data format).
-          It should have exactly 3 inputs channels.
-      pooling: optional pooling mode for feature extraction
-          when `include_top` is `False`.
-          - `None` means that the output of the model will be
-              the 4D tensor output of the
-              last convolutional layer.
-          - `avg` means that global average pooling
-              will be applied to the output of the
-              last convolutional layer, and thus
-              the output of the model will be a 2D tensor.
-          - `max` means that global max pooling will
-              be applied.
-      classes: optional number of classes to classify images
-          into, only to be specified if `include_top` is True, and
-          if no `weights` argument is specified.
-
-  Returns:
-      A Keras model instance.
-
-  Raises:
-      ValueError: in case of invalid argument for `weights`,
-          or invalid input shape.
-  """
-  if not (weights in {'imagenet', None} or os.path.exists(weights)):
-    raise ValueError('The `weights` argument should be either '
-                     '`None` (random initialization), `imagenet` '
-                     '(pre-training on ImageNet), '
-                     'or the path to the weights file to be loaded.')
-
-  if weights == 'imagenet' and include_top and classes != 1000:
-    raise ValueError('If using `weights` as imagenet with `include_top`'
-                     ' as true, `classes` should be 1000')
-
-  # Determine proper input shape
-  input_shape = _obtain_input_shape(
-      input_shape,
-      default_size=224,
-      min_size=221,
-      data_format=K.image_data_format(),
-      require_flatten=include_top,
-      weights=weights)
-
-  if input_tensor is None:
-    img_input = Input(shape=input_shape)
-  else:
-    if not K.is_keras_tensor(input_tensor):
-      img_input = Input(tensor=input_tensor, shape=input_shape)
-    else:
-      img_input = input_tensor
-
-  bn_axis = 3 if K.image_data_format() == 'channels_last' else 1
-
-  x = ZeroPadding2D(padding=((3, 3), (3, 3)))(img_input)
-  x = Conv2D(64, 7, strides=2, use_bias=False, name='conv1/conv')(x)
-  x = BatchNormalization(axis=bn_axis, epsilon=1.001e-5, name='conv1/bn')(x)
-  x = Activation('relu', name='conv1/relu')(x)
-  x = ZeroPadding2D(padding=((1, 1), (1, 1)))(x)
-  x = MaxPooling2D(3, strides=2, name='pool1')(x)
-
-  x = dense_block(x, blocks[0], name='conv2')
-  x = transition_block(x, 0.5, name='pool2')
-  x = dense_block(x, blocks[1], name='conv3')
-  x = transition_block(x, 0.5, name='pool3')
-  x = dense_block(x, blocks[2], name='conv4')
-  x = transition_block(x, 0.5, name='pool4')
-  x = dense_block(x, blocks[3], name='conv5')
-
-  x = BatchNormalization(axis=bn_axis, epsilon=1.001e-5, name='bn')(x)
-
-  if include_top:
-    x = GlobalAveragePooling2D(name='avg_pool')(x)
-    x = Dense(classes, activation='softmax', name='fc1000')(x)
-  else:
-    if pooling == 'avg':
-      x = GlobalAveragePooling2D(name='avg_pool')(x)
-    elif pooling == 'max':
-      x = GlobalMaxPooling2D(name='max_pool')(x)
-
-  # Ensure that the model takes into account
-  # any potential predecessors of `input_tensor`.
-  if input_tensor is not None:
-    inputs = layer_utils.get_source_inputs(input_tensor)
-  else:
-    inputs = img_input
-
-  # Create model.
-  if blocks == [6, 12, 24, 16]:
-    model = Model(inputs, x, name='densenet121')
-  elif blocks == [6, 12, 32, 32]:
-    model = Model(inputs, x, name='densenet169')
-  elif blocks == [6, 12, 48, 32]:
-    model = Model(inputs, x, name='densenet201')
-  else:
-    model = Model(inputs, x, name='densenet')
-
-  # Load weights.
-  if weights == 'imagenet':
-    if include_top:
-      if blocks == [6, 12, 24, 16]:
-        weights_path = get_file(
-            'densenet121_weights_tf_dim_ordering_tf_kernels.h5',
-            DENSENET121_WEIGHT_PATH,
-            cache_subdir='models',
-            file_hash='0962ca643bae20f9b6771cb844dca3b0')
-      elif blocks == [6, 12, 32, 32]:
-        weights_path = get_file(
-            'densenet169_weights_tf_dim_ordering_tf_kernels.h5',
-            DENSENET169_WEIGHT_PATH,
-            cache_subdir='models',
-            file_hash='bcf9965cf5064a5f9eb6d7dc69386f43')
-      elif blocks == [6, 12, 48, 32]:
-        weights_path = get_file(
-            'densenet201_weights_tf_dim_ordering_tf_kernels.h5',
-            DENSENET201_WEIGHT_PATH,
-            cache_subdir='models',
-            file_hash='7bb75edd58cb43163be7e0005fbe95ef')
-    else:
-      if blocks == [6, 12, 24, 16]:
-        weights_path = get_file(
-            'densenet121_weights_tf_dim_ordering_tf_kernels_notop.h5',
-            DENSENET121_WEIGHT_PATH_NO_TOP,
-            cache_subdir='models',
-            file_hash='4912a53fbd2a69346e7f2c0b5ec8c6d3')
-      elif blocks == [6, 12, 32, 32]:
-        weights_path = get_file(
-            'densenet169_weights_tf_dim_ordering_tf_kernels_notop.h5',
-            DENSENET169_WEIGHT_PATH_NO_TOP,
-            cache_subdir='models',
-            file_hash='50662582284e4cf834ce40ab4dfa58c6')
-      elif blocks == [6, 12, 48, 32]:
-        weights_path = get_file(
-            'densenet201_weights_tf_dim_ordering_tf_kernels_notop.h5',
-            DENSENET201_WEIGHT_PATH_NO_TOP,
-            cache_subdir='models',
-            file_hash='1c2de60ee40562448dbac34a0737e798')
-    model.load_weights(weights_path)
-  elif weights is not None:
-    model.load_weights(weights)
-
-  return model
-
-
-@tf_export('keras.applications.DenseNet121',
-           'keras.applications.densenet.DenseNet121')
-def DenseNet121(include_top=True,
-                weights='imagenet',
-                input_tensor=None,
-                input_shape=None,
-                pooling=None,
-                classes=1000):
-  return DenseNet([6, 12, 24, 16], include_top, weights, input_tensor,
-                  input_shape, pooling, classes)
-
-
-@tf_export('keras.applications.DenseNet169',
-           'keras.applications.densenet.DenseNet169')
-def DenseNet169(include_top=True,
-                weights='imagenet',
-                input_tensor=None,
-                input_shape=None,
-                pooling=None,
-                classes=1000):
-  return DenseNet([6, 12, 32, 32], include_top, weights, input_tensor,
-                  input_shape, pooling, classes)
-
-
-@tf_export('keras.applications.DenseNet201',
-           'keras.applications.densenet.DenseNet201')
-def DenseNet201(include_top=True,
-                weights='imagenet',
-                input_tensor=None,
-                input_shape=None,
-                pooling=None,
-                classes=1000):
-  return DenseNet([6, 12, 48, 32], include_top, weights, input_tensor,
-                  input_shape, pooling, classes)
-
-
-@tf_export('keras.applications.densenet.preprocess_input')
-def preprocess_input(x, data_format=None):
-  """Preprocesses a numpy array encoding a batch of images.
-
-  Arguments:
-      x: a 3D or 4D numpy array consists of RGB values within [0, 255].
-      data_format: data format of the image tensor.
-
-  Returns:
-      Preprocessed array.
-  """
-  return imagenet_utils.preprocess_input(x, data_format, mode='torch')
-
-
-setattr(DenseNet121, '__doc__', DenseNet.__doc__)
-setattr(DenseNet169, '__doc__', DenseNet.__doc__)
-setattr(DenseNet201, '__doc__', DenseNet.__doc__)
+tf_export('keras.applications.densenet.DenseNet121',
+          'keras.applications.DenseNet121')(DenseNet121)
+tf_export('keras.applications.densenet.DenseNet169',
+          'keras.applications.DenseNet169')(DenseNet169)
+tf_export('keras.applications.densenet.DenseNet201',
+          'keras.applications.DenseNet201')(DenseNet201)
+tf_export('keras.applications.densenet.preprocess_input')(preprocess_input)
diff --git a/tensorflow/python/keras/applications/densenet_test.py b/tensorflow/python/keras/applications/densenet_test.py
deleted file mode 100644
index 8b6aa28..0000000
--- a/tensorflow/python/keras/applications/densenet_test.py
+++ /dev/null
@@ -1,101 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for DenseNet application."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python import keras
-from tensorflow.python.platform import test
-
-
-class DenseNet121Test(test.TestCase):
-
-  def test_with_top(self):
-    model = keras.applications.DenseNet121(weights=None)
-    self.assertEqual(model.output_shape, (None, 1000))
-
-  def test_no_top(self):
-    model = keras.applications.DenseNet121(weights=None, include_top=False)
-    self.assertEqual(model.output_shape, (None, None, None, 1024))
-
-  def test_with_pooling(self):
-    model = keras.applications.DenseNet121(weights=None,
-                                           include_top=False,
-                                           pooling='avg')
-    self.assertEqual(model.output_shape, (None, 1024))
-
-  def test_weight_loading(self):
-    with self.assertRaises(ValueError):
-      keras.applications.DenseNet121(weights='unknown',
-                                     include_top=False)
-    with self.assertRaises(ValueError):
-      keras.applications.DenseNet121(weights='imagenet',
-                                     classes=2000)
-
-
-class DenseNet169Test(test.TestCase):
-
-  def test_with_top(self):
-    model = keras.applications.DenseNet169(weights=None)
-    self.assertEqual(model.output_shape, (None, 1000))
-
-  def test_no_top(self):
-    model = keras.applications.DenseNet169(weights=None, include_top=False)
-    self.assertEqual(model.output_shape, (None, None, None, 1664))
-
-  def test_with_pooling(self):
-    model = keras.applications.DenseNet169(weights=None,
-                                           include_top=False,
-                                           pooling='max')
-    self.assertEqual(model.output_shape, (None, 1664))
-
-  def test_weight_loading(self):
-    with self.assertRaises(ValueError):
-      keras.applications.DenseNet169(weights='unknown',
-                                     include_top=False)
-    with self.assertRaises(ValueError):
-      keras.applications.DenseNet169(weights='imagenet',
-                                     classes=2000)
-
-
-class DenseNet201(test.TestCase):
-
-  def test_with_top(self):
-    model = keras.applications.DenseNet201(weights=None)
-    self.assertEqual(model.output_shape, (None, 1000))
-
-  def test_no_top(self):
-    model = keras.applications.DenseNet201(weights=None, include_top=False)
-    self.assertEqual(model.output_shape, (None, None, None, 1920))
-
-  def test_with_pooling(self):
-    model = keras.applications.DenseNet201(weights=None,
-                                           include_top=False,
-                                           pooling='avg')
-    self.assertEqual(model.output_shape, (None, 1920))
-
-  def test_weight_loading(self):
-    with self.assertRaises(ValueError):
-      keras.applications.DenseNet201(weights='unknown',
-                                     include_top=False)
-    with self.assertRaises(ValueError):
-      keras.applications.DenseNet201(weights='imagenet',
-                                     classes=2000)
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/python/keras/applications/imagenet_utils.py b/tensorflow/python/keras/applications/imagenet_utils.py
index 0d8ccca..70f8f6f 100644
--- a/tensorflow/python/keras/applications/imagenet_utils.py
+++ b/tensorflow/python/keras/applications/imagenet_utils.py
@@ -18,322 +18,28 @@
 from __future__ import division
 from __future__ import print_function
 
-import json
-
-import numpy as np
-
-from tensorflow.python.framework import constant_op
-from tensorflow.python.keras import backend as K
-from tensorflow.python.keras.utils.data_utils import get_file
-from tensorflow.python.ops import math_ops
-from tensorflow.python.platform import tf_logging as logging
+from keras_applications import imagenet_utils
 from tensorflow.python.util.tf_export import tf_export
 
+decode_predictions = imagenet_utils.decode_predictions
+preprocess_input = imagenet_utils.preprocess_input
 
-CLASS_INDEX = None
-CLASS_INDEX_PATH = 'https://s3.amazonaws.com/deep-learning-models/image-models/imagenet_class_index.json'
-
-# Global tensor of imagenet mean for preprocessing symbolic inputs
-_IMAGENET_MEAN = None
-
-
-def _preprocess_numpy_input(x, data_format, mode):
-  """Preprocesses a Numpy array encoding a batch of images.
-
-  Arguments:
-      x: Input array, 3D or 4D.
-      data_format: Data format of the image array.
-      mode: One of "caffe", "tf" or "torch".
-          - caffe: will convert the images from RGB to BGR,
-              then will zero-center each color channel with
-              respect to the ImageNet dataset,
-              without scaling.
-          - tf: will scale pixels between -1 and 1,
-              sample-wise.
-          - torch: will scale pixels between 0 and 1 and then
-              will normalize each channel with respect to the
-              ImageNet dataset.
-
-  Returns:
-      Preprocessed Numpy array.
-  """
-  if mode == 'tf':
-    x /= 127.5
-    x -= 1.
-    return x
-
-  if mode == 'torch':
-    x /= 255.
-    mean = [0.485, 0.456, 0.406]
-    std = [0.229, 0.224, 0.225]
-  else:
-    if data_format == 'channels_first':
-      # 'RGB'->'BGR'
-      if x.ndim == 3:
-        x = x[::-1, ...]
-      else:
-        x = x[:, ::-1, ...]
-    else:
-      # 'RGB'->'BGR'
-      x = x[..., ::-1]
-    mean = [103.939, 116.779, 123.68]
-    std = None
-
-  # Zero-center by mean pixel
-  if data_format == 'channels_first':
-    if x.ndim == 3:
-      x[0, :, :] -= mean[0]
-      x[1, :, :] -= mean[1]
-      x[2, :, :] -= mean[2]
-      if std is not None:
-        x[0, :, :] /= std[0]
-        x[1, :, :] /= std[1]
-        x[2, :, :] /= std[2]
-    else:
-      x[:, 0, :, :] -= mean[0]
-      x[:, 1, :, :] -= mean[1]
-      x[:, 2, :, :] -= mean[2]
-      if std is not None:
-        x[:, 0, :, :] /= std[0]
-        x[:, 1, :, :] /= std[1]
-        x[:, 2, :, :] /= std[2]
-  else:
-    x[..., 0] -= mean[0]
-    x[..., 1] -= mean[1]
-    x[..., 2] -= mean[2]
-    if std is not None:
-      x[..., 0] /= std[0]
-      x[..., 1] /= std[1]
-      x[..., 2] /= std[2]
-  return x
-
-
-def _preprocess_symbolic_input(x, data_format, mode):
-  """Preprocesses a tensor encoding a batch of images.
-
-  Arguments:
-      x: Input tensor, 3D or 4D.
-      data_format: Data format of the image tensor.
-      mode: One of "caffe", "tf" or "torch".
-          - caffe: will convert the images from RGB to BGR,
-              then will zero-center each color channel with
-              respect to the ImageNet dataset,
-              without scaling.
-          - tf: will scale pixels between -1 and 1,
-              sample-wise.
-          - torch: will scale pixels between 0 and 1 and then
-              will normalize each channel with respect to the
-              ImageNet dataset.
-
-  Returns:
-      Preprocessed tensor.
-  """
-  global _IMAGENET_MEAN
-
-  if mode == 'tf':
-    x /= 127.5
-    x -= 1.
-    return x
-
-  if mode == 'torch':
-    x /= 255.
-    mean = [0.485, 0.456, 0.406]
-    std = [0.229, 0.224, 0.225]
-  else:
-    if data_format == 'channels_first':
-      # 'RGB'->'BGR'
-      if K.ndim(x) == 3:
-        x = x[::-1, ...]
-      else:
-        x = x[:, ::-1, ...]
-    else:
-      # 'RGB'->'BGR'
-      x = x[..., ::-1]
-    mean = [103.939, 116.779, 123.68]
-    std = None
-
-  if _IMAGENET_MEAN is None:
-    _IMAGENET_MEAN = constant_op.constant(-np.array(mean), dtype=K.floatx())
-
-  # Zero-center by mean pixel
-  if K.dtype(x) != K.dtype(_IMAGENET_MEAN):
-    x = K.bias_add(x, math_ops.cast(_IMAGENET_MEAN, K.dtype(x)), data_format)
-  else:
-    x = K.bias_add(x, _IMAGENET_MEAN, data_format)
-  if std is not None:
-    x /= std
-  return x
-
-
-@tf_export('keras.applications.resnet50.preprocess_input',
-           'keras.applications.vgg19.preprocess_input',
-           'keras.applications.vgg16.preprocess_input')
-def preprocess_input(x, data_format=None, mode='caffe'):
-  """Preprocesses a tensor or Numpy array encoding a batch of images.
-
-  Arguments:
-      x: Input Numpy or symbolic tensor, 3D or 4D.
-      data_format: Data format of the image tensor/array.
-      mode: One of "caffe", "tf".
-          - caffe: will convert the images from RGB to BGR,
-              then will zero-center each color channel with
-              respect to the ImageNet dataset,
-              without scaling.
-          - tf: will scale pixels between -1 and 1,
-              sample-wise.
-
-  Returns:
-      Preprocessed tensor or Numpy array.
-
-  Raises:
-      ValueError: In case of unknown `data_format` argument.
-  """
-  if data_format is None:
-    data_format = K.image_data_format()
-  if data_format not in {'channels_first', 'channels_last'}:
-    raise ValueError('Unknown data_format ' + str(data_format))
-
-  if isinstance(x, np.ndarray):
-    return _preprocess_numpy_input(x, data_format=data_format, mode=mode)
-  else:
-    return _preprocess_symbolic_input(x, data_format=data_format, mode=mode)
-
-
-@tf_export('keras.applications.nasnet.decode_predictions',
-           'keras.applications.resnet50.decode_predictions',
-           'keras.applications.vgg19.decode_predictions',
-           'keras.applications.vgg16.decode_predictions',
-           'keras.applications.inception_resnet_v2.decode_predictions',
-           'keras.applications.inception_v3.decode_predictions',
-           'keras.applications.densenet.decode_predictions',
-           'keras.applications.mobilenet.decode_predictions',
-           'keras.applications.xception.decode_predictions')
-def decode_predictions(preds, top=5):
-  """Decodes the prediction of an ImageNet model.
-
-  Arguments:
-      preds: Numpy tensor encoding a batch of predictions.
-      top: Integer, how many top-guesses to return.
-
-  Returns:
-      A list of lists of top class prediction tuples
-      `(class_name, class_description, score)`.
-      One list of tuples per sample in batch input.
-
-  Raises:
-      ValueError: In case of invalid shape of the `pred` array
-          (must be 2D).
-  """
-  global CLASS_INDEX
-  if len(preds.shape) != 2 or preds.shape[1] != 1000:
-    raise ValueError('`decode_predictions` expects '
-                     'a batch of predictions '
-                     '(i.e. a 2D array of shape (samples, 1000)). '
-                     'Found array with shape: ' + str(preds.shape))
-  if CLASS_INDEX is None:
-    fpath = get_file(
-        'imagenet_class_index.json',
-        CLASS_INDEX_PATH,
-        cache_subdir='models',
-        file_hash='c2c37ea517e94d9795004a39431a14cb')
-    with open(fpath) as f:
-      CLASS_INDEX = json.load(f)
-  results = []
-  for pred in preds:
-    top_indices = pred.argsort()[-top:][::-1]
-    result = [tuple(CLASS_INDEX[str(i)]) + (pred[i],) for i in top_indices]
-    result.sort(key=lambda x: x[2], reverse=True)
-    results.append(result)
-  return results
-
-
-def _obtain_input_shape(input_shape,
-                        default_size,
-                        min_size,
-                        data_format,
-                        require_flatten,
-                        weights=None):
-  """Internal utility to compute/validate a model's input shape.
-
-  Arguments:
-      input_shape: Either None (will return the default network input shape),
-          or a user-provided shape to be validated.
-      default_size: Default input width/height for the model.
-      min_size: Minimum input width/height accepted by the model.
-      data_format: Image data format to use.
-      require_flatten: Whether the model is expected to
-          be linked to a classifier via a Flatten layer.
-      weights: One of `None` (random initialization)
-          or 'imagenet' (pre-training on ImageNet).
-          If weights='imagenet' input channels must be equal to 3.
-
-  Returns:
-      An integer shape tuple (may include None entries).
-
-  Raises:
-      ValueError: In case of invalid argument values.
-  """
-  if weights != 'imagenet' and input_shape and len(input_shape) == 3:
-    if data_format == 'channels_first':
-      if input_shape[0] not in {1, 3}:
-        logging.warning('This model usually expects 1 or 3 input channels. '
-                        'However, it was passed an input_shape with ' +
-                        str(input_shape[0]) + ' input channels.')
-      default_shape = (input_shape[0], default_size, default_size)
-    else:
-      if input_shape[-1] not in {1, 3}:
-        logging.warning('This model usually expects 1 or 3 input channels. '
-                        'However, it was passed an input_shape with ' +
-                        str(input_shape[-1]) + ' input channels.')
-      default_shape = (default_size, default_size, input_shape[-1])
-  else:
-    if data_format == 'channels_first':
-      default_shape = (3, default_size, default_size)
-    else:
-      default_shape = (default_size, default_size, 3)
-  if weights == 'imagenet' and require_flatten:
-    if input_shape is not None:
-      if input_shape != default_shape:
-        raise ValueError('When setting`include_top=True` '
-                         'and loading `imagenet` weights, '
-                         '`input_shape` should be ' + str(default_shape) + '.')
-    return default_shape
-  if input_shape:
-    if data_format == 'channels_first':
-      if input_shape is not None:
-        if len(input_shape) != 3:
-          raise ValueError('`input_shape` must be a tuple of three integers.')
-        if input_shape[0] != 3 and weights == 'imagenet':
-          raise ValueError('The input must have 3 channels; got '
-                           '`input_shape=' + str(input_shape) + '`')
-        if ((input_shape[1] is not None and input_shape[1] < min_size) or
-            (input_shape[2] is not None and input_shape[2] < min_size)):
-          raise ValueError('Input size must be at least ' + str(min_size) +
-                           'x' + str(min_size) + '; got '
-                           '`input_shape=' + str(input_shape) + '`')
-    else:
-      if input_shape is not None:
-        if len(input_shape) != 3:
-          raise ValueError('`input_shape` must be a tuple of three integers.')
-        if input_shape[-1] != 3 and weights == 'imagenet':
-          raise ValueError('The input must have 3 channels; got '
-                           '`input_shape=' + str(input_shape) + '`')
-        if ((input_shape[0] is not None and input_shape[0] < min_size) or
-            (input_shape[1] is not None and input_shape[1] < min_size)):
-          raise ValueError('Input size must be at least ' + str(min_size) +
-                           'x' + str(min_size) + '; got '
-                           '`input_shape=' + str(input_shape) + '`')
-  else:
-    if require_flatten:
-      input_shape = default_shape
-    else:
-      if data_format == 'channels_first':
-        input_shape = (3, None, None)
-      else:
-        input_shape = (None, None, 3)
-  if require_flatten:
-    if None in input_shape:
-      raise ValueError('If `include_top` is True, '
-                       'you should specify a static `input_shape`. '
-                       'Got `input_shape=' + str(input_shape) + '`')
-  return input_shape
+tf_export(
+    'keras.applications.imagenet_utils.decode_predictions',
+    'keras.applications.densenet.decode_predictions',
+    'keras.applications.inception_resnet_v2.decode_predictions',
+    'keras.applications.inception_v3.decode_predictions',
+    'keras.applications.mobilenet.decode_predictions',
+    'keras.applications.mobilenet_v2.decode_predictions',
+    'keras.applications.nasnet.decode_predictions',
+    'keras.applications.resnet50.decode_predictions',
+    'keras.applications.vgg16.decode_predictions',
+    'keras.applications.vgg19.decode_predictions',
+    'keras.applications.xception.decode_predictions',
+)(decode_predictions)
+tf_export(
+    'keras.applications.imagenet_utils.preprocess_input',
+    'keras.applications.resnet50.preprocess_input',
+    'keras.applications.vgg16.preprocess_input',
+    'keras.applications.vgg19.preprocess_input',
+)(preprocess_input)
diff --git a/tensorflow/python/keras/applications/imagenet_utils_test.py b/tensorflow/python/keras/applications/imagenet_utils_test.py
deleted file mode 100644
index 3493393..0000000
--- a/tensorflow/python/keras/applications/imagenet_utils_test.py
+++ /dev/null
@@ -1,199 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for Inception V3 application."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.python import keras
-from tensorflow.python.keras.applications.imagenet_utils import preprocess_input
-from tensorflow.python.platform import test
-
-
-class ImageNetUtilsTest(test.TestCase):
-
-  def test_preprocess_input(self):
-    # Test batch of images
-    x = np.random.uniform(0, 255, (2, 10, 10, 3))
-    self.assertEqual(preprocess_input(x).shape, x.shape)
-    out1 = preprocess_input(x, 'channels_last')
-    out2 = preprocess_input(np.transpose(x, (0, 3, 1, 2)), 'channels_first')
-    self.assertAllClose(out1, out2.transpose(0, 2, 3, 1))
-
-    # Test single image
-    x = np.random.uniform(0, 255, (10, 10, 3))
-    self.assertEqual(preprocess_input(x).shape, x.shape)
-    out1 = preprocess_input(x, 'channels_last')
-    out2 = preprocess_input(np.transpose(x, (2, 0, 1)), 'channels_first')
-    self.assertAllClose(out1, out2.transpose(1, 2, 0))
-
-  def test_preprocess_input_symbolic(self):
-    # Test image batch
-    x = np.random.uniform(0, 255, (2, 10, 10, 3))
-    inputs = keras.layers.Input(shape=x.shape[1:])
-    outputs = keras.layers.Lambda(
-        preprocess_input, output_shape=x.shape[1:])(inputs)
-    model = keras.models.Model(inputs, outputs)
-    assert model.predict(x).shape == x.shape
-    # pylint: disable=g-long-lambda
-    outputs1 = keras.layers.Lambda(lambda x:
-                                   preprocess_input(x, 'channels_last'),
-                                   output_shape=x.shape[1:])(inputs)
-    model1 = keras.models.Model(inputs, outputs1)
-    out1 = model1.predict(x)
-    x2 = np.transpose(x, (0, 3, 1, 2))
-    inputs2 = keras.layers.Input(shape=x2.shape[1:])
-    # pylint: disable=g-long-lambda
-    outputs2 = keras.layers.Lambda(lambda x:
-                                   preprocess_input(x, 'channels_first'),
-                                   output_shape=x2.shape[1:])(inputs2)
-    model2 = keras.models.Model(inputs2, outputs2)
-    out2 = model2.predict(x2)
-    self.assertAllClose(out1, out2.transpose(0, 2, 3, 1))
-
-    # Test single image
-    x = np.random.uniform(0, 255, (10, 10, 3))
-    inputs = keras.layers.Input(shape=x.shape)
-    outputs = keras.layers.Lambda(preprocess_input,
-                                  output_shape=x.shape)(inputs)
-    model = keras.models.Model(inputs, outputs)
-    assert model.predict(x[np.newaxis])[0].shape == x.shape
-    # pylint: disable=g-long-lambda
-    outputs1 = keras.layers.Lambda(lambda x:
-                                   preprocess_input(x, 'channels_last'),
-                                   output_shape=x.shape)(inputs)
-    model1 = keras.models.Model(inputs, outputs1)
-    out1 = model1.predict(x[np.newaxis])[0]
-    x2 = np.transpose(x, (2, 0, 1))
-    inputs2 = keras.layers.Input(shape=x2.shape)
-    outputs2 = keras.layers.Lambda(lambda x:
-                                   preprocess_input(x, 'channels_first'),
-                                   output_shape=x2.shape)(inputs2)  # pylint: disable=g-long-lambda
-    model2 = keras.models.Model(inputs2, outputs2)
-    out2 = model2.predict(x2[np.newaxis])[0]
-    self.assertAllClose(out1, out2.transpose(1, 2, 0))
-
-  def test_obtain_input_shape(self):
-    # input_shape and default_size are not identical.
-    with self.assertRaises(ValueError):
-      keras.applications.imagenet_utils._obtain_input_shape(
-          input_shape=(224, 224, 3),
-          default_size=299,
-          min_size=139,
-          data_format='channels_last',
-          require_flatten=True,
-          weights='imagenet')
-
-    # Test invalid use cases
-    for data_format in ['channels_last', 'channels_first']:
-      # input_shape is smaller than min_size.
-      shape = (100, 100)
-      if data_format == 'channels_last':
-        input_shape = shape + (3,)
-      else:
-        input_shape = (3,) + shape
-      with self.assertRaises(ValueError):
-        keras.applications.imagenet_utils._obtain_input_shape(
-            input_shape=input_shape,
-            default_size=None,
-            min_size=139,
-            data_format=data_format,
-            require_flatten=False)
-
-      # shape is 1D.
-      shape = (100,)
-      if data_format == 'channels_last':
-        input_shape = shape + (3,)
-      else:
-        input_shape = (3,) + shape
-      with self.assertRaises(ValueError):
-        keras.applications.imagenet_utils._obtain_input_shape(
-            input_shape=input_shape,
-            default_size=None,
-            min_size=139,
-            data_format=data_format,
-            require_flatten=False)
-
-      # the number of channels is 5 not 3.
-      shape = (100, 100)
-      if data_format == 'channels_last':
-        input_shape = shape + (5,)
-      else:
-        input_shape = (5,) + shape
-      with self.assertRaises(ValueError):
-        keras.applications.imagenet_utils._obtain_input_shape(
-            input_shape=input_shape,
-            default_size=None,
-            min_size=139,
-            data_format=data_format,
-            require_flatten=False)
-
-      # require_flatten=True with dynamic input shape.
-      with self.assertRaises(ValueError):
-        keras.applications.imagenet_utils._obtain_input_shape(
-            input_shape=None,
-            default_size=None,
-            min_size=139,
-            data_format='channels_first',
-            require_flatten=True)
-
-    assert keras.applications.imagenet_utils._obtain_input_shape(
-        input_shape=(3, 200, 200),
-        default_size=None,
-        min_size=139,
-        data_format='channels_first',
-        require_flatten=True) == (3, 200, 200)
-
-    assert keras.applications.imagenet_utils._obtain_input_shape(
-        input_shape=None,
-        default_size=None,
-        min_size=139,
-        data_format='channels_last',
-        require_flatten=False) == (None, None, 3)
-
-    assert keras.applications.imagenet_utils._obtain_input_shape(
-        input_shape=None,
-        default_size=None,
-        min_size=139,
-        data_format='channels_first',
-        require_flatten=False) == (3, None, None)
-
-    assert keras.applications.imagenet_utils._obtain_input_shape(
-        input_shape=None,
-        default_size=None,
-        min_size=139,
-        data_format='channels_last',
-        require_flatten=False) == (None, None, 3)
-
-    assert keras.applications.imagenet_utils._obtain_input_shape(
-        input_shape=(150, 150, 3),
-        default_size=None,
-        min_size=139,
-        data_format='channels_last',
-        require_flatten=False) == (150, 150, 3)
-
-    assert keras.applications.imagenet_utils._obtain_input_shape(
-        input_shape=(3, None, None),
-        default_size=None,
-        min_size=139,
-        data_format='channels_first',
-        require_flatten=False) == (3, None, None)
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/python/keras/applications/inception_resnet_v2.py b/tensorflow/python/keras/applications/inception_resnet_v2.py
index 14e3b6a..63debb4 100644
--- a/tensorflow/python/keras/applications/inception_resnet_v2.py
+++ b/tensorflow/python/keras/applications/inception_resnet_v2.py
@@ -13,372 +13,20 @@
 # limitations under the License.
 # ==============================================================================
 # pylint: disable=invalid-name
-# pylint: disable=unused-import
 """Inception-ResNet V2 model for Keras.
-
-# Reference
-- [Inception-v4, Inception-ResNet and the Impact of
-   Residual Connections on Learning](https://arxiv.org/abs/1602.07261)
-
 """
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
-
-from tensorflow.python.keras import backend as K
-from tensorflow.python.keras.applications import imagenet_utils
-from tensorflow.python.keras.applications.imagenet_utils import _obtain_input_shape
-from tensorflow.python.keras.applications.imagenet_utils import decode_predictions
-from tensorflow.python.keras.layers import Activation
-from tensorflow.python.keras.layers import AveragePooling2D
-from tensorflow.python.keras.layers import BatchNormalization
-from tensorflow.python.keras.layers import Concatenate
-from tensorflow.python.keras.layers import Conv2D
-from tensorflow.python.keras.layers import Dense
-from tensorflow.python.keras.layers import GlobalAveragePooling2D
-from tensorflow.python.keras.layers import GlobalMaxPooling2D
-from tensorflow.python.keras.layers import Input
-from tensorflow.python.keras.layers import Lambda
-from tensorflow.python.keras.layers import MaxPooling2D
-from tensorflow.python.keras.models import Model
-from tensorflow.python.keras.utils import layer_utils
-from tensorflow.python.keras.utils.data_utils import get_file
-from tensorflow.python.platform import tf_logging as logging
+from keras_applications import inception_resnet_v2
 from tensorflow.python.util.tf_export import tf_export
 
+InceptionResNetV2 = inception_resnet_v2.InceptionResNetV2
+decode_predictions = inception_resnet_v2.decode_predictions
+preprocess_input = inception_resnet_v2.preprocess_input
 
-BASE_WEIGHT_URL = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.7/'
-
-
-@tf_export('keras.applications.inception_resnet_v2.preprocess_input')
-def preprocess_input(x):
-  """Preprocesses a numpy array encoding a batch of images.
-
-  Arguments:
-      x: a 4D numpy array consists of RGB values within [0, 255].
-
-  Returns:
-      Preprocessed array.
-  """
-  return imagenet_utils.preprocess_input(x, mode='tf')
-
-
-def conv2d_bn(x,
-              filters,
-              kernel_size,
-              strides=1,
-              padding='same',
-              activation='relu',
-              use_bias=False,
-              name=None):
-  """Utility function to apply conv + BN.
-
-  Arguments:
-      x: input tensor.
-      filters: filters in `Conv2D`.
-      kernel_size: kernel size as in `Conv2D`.
-      strides: strides in `Conv2D`.
-      padding: padding mode in `Conv2D`.
-      activation: activation in `Conv2D`.
-      use_bias: whether to use a bias in `Conv2D`.
-      name: name of the ops; will become `name + '_ac'` for the activation
-          and `name + '_bn'` for the batch norm layer.
-
-  Returns:
-      Output tensor after applying `Conv2D` and `BatchNormalization`.
-  """
-  x = Conv2D(
-      filters,
-      kernel_size,
-      strides=strides,
-      padding=padding,
-      use_bias=use_bias,
-      name=name)(
-          x)
-  if not use_bias:
-    bn_axis = 1 if K.image_data_format() == 'channels_first' else 3
-    bn_name = None if name is None else name + '_bn'
-    x = BatchNormalization(axis=bn_axis, scale=False, name=bn_name)(x)
-  if activation is not None:
-    ac_name = None if name is None else name + '_ac'
-    x = Activation(activation, name=ac_name)(x)
-  return x
-
-
-def inception_resnet_block(x, scale, block_type, block_idx, activation='relu'):
-  """Adds a Inception-ResNet block.
-
-  This function builds 3 types of Inception-ResNet blocks mentioned
-  in the paper, controlled by the `block_type` argument (which is the
-  block name used in the official TF-slim implementation):
-      - Inception-ResNet-A: `block_type='block35'`
-      - Inception-ResNet-B: `block_type='block17'`
-      - Inception-ResNet-C: `block_type='block8'`
-
-  Arguments:
-      x: input tensor.
-      scale: scaling factor to scale the residuals (i.e., the output of
-          passing `x` through an inception module) before adding them
-          to the shortcut branch. Let `r` be the output from the residual
-            branch,
-          the output of this block will be `x + scale * r`.
-      block_type: `'block35'`, `'block17'` or `'block8'`, determines
-          the network structure in the residual branch.
-      block_idx: an `int` used for generating layer names. The Inception-ResNet
-        blocks
-          are repeated many times in this network. We use `block_idx` to
-            identify
-          each of the repetitions. For example, the first Inception-ResNet-A
-            block
-          will have `block_type='block35', block_idx=0`, ane the layer names
-            will have
-          a common prefix `'block35_0'`.
-      activation: activation function to use at the end of the block.
-          When `activation=None`, no activation is applied
-          (i.e., "linear" activation: `a(x) = x`).
-
-  Returns:
-      Output tensor for the block.
-
-  Raises:
-      ValueError: if `block_type` is not one of `'block35'`,
-          `'block17'` or `'block8'`.
-  """
-  if block_type == 'block35':
-    branch_0 = conv2d_bn(x, 32, 1)
-    branch_1 = conv2d_bn(x, 32, 1)
-    branch_1 = conv2d_bn(branch_1, 32, 3)
-    branch_2 = conv2d_bn(x, 32, 1)
-    branch_2 = conv2d_bn(branch_2, 48, 3)
-    branch_2 = conv2d_bn(branch_2, 64, 3)
-    branches = [branch_0, branch_1, branch_2]
-  elif block_type == 'block17':
-    branch_0 = conv2d_bn(x, 192, 1)
-    branch_1 = conv2d_bn(x, 128, 1)
-    branch_1 = conv2d_bn(branch_1, 160, [1, 7])
-    branch_1 = conv2d_bn(branch_1, 192, [7, 1])
-    branches = [branch_0, branch_1]
-  elif block_type == 'block8':
-    branch_0 = conv2d_bn(x, 192, 1)
-    branch_1 = conv2d_bn(x, 192, 1)
-    branch_1 = conv2d_bn(branch_1, 224, [1, 3])
-    branch_1 = conv2d_bn(branch_1, 256, [3, 1])
-    branches = [branch_0, branch_1]
-  else:
-    raise ValueError('Unknown Inception-ResNet block type. '
-                     'Expects "block35", "block17" or "block8", '
-                     'but got: ' + str(block_type))
-
-  block_name = block_type + '_' + str(block_idx)
-  channel_axis = 1 if K.image_data_format() == 'channels_first' else 3
-  mixed = Concatenate(axis=channel_axis, name=block_name + '_mixed')(branches)
-  up = conv2d_bn(
-      mixed,
-      K.int_shape(x)[channel_axis],
-      1,
-      activation=None,
-      use_bias=True,
-      name=block_name + '_conv')
-
-  x = Lambda(
-      lambda inputs, scale: inputs[0] + inputs[1] * scale,
-      output_shape=K.int_shape(x)[1:],
-      arguments={'scale': scale},
-      name=block_name)([x, up])
-  if activation is not None:
-    x = Activation(activation, name=block_name + '_ac')(x)
-  return x
-
-
-@tf_export('keras.applications.InceptionResNetV2',
-           'keras.applications.inception_resnet_v2.InceptionResNetV2')
-def InceptionResNetV2(include_top=True,
-                      weights='imagenet',
-                      input_tensor=None,
-                      input_shape=None,
-                      pooling=None,
-                      classes=1000):
-  """Instantiates the Inception-ResNet v2 architecture.
-
-  Optionally loads weights pre-trained on ImageNet.
-  Note that when using TensorFlow, for best performance you should
-  set `"image_data_format": "channels_last"` in your Keras config
-  at `~/.keras/keras.json`.
-
-  The model and the weights are compatible with TensorFlow, Theano and
-  CNTK backends. The data format convention used by the model is
-  the one specified in your Keras config file.
-
-  Note that the default input image size for this model is 299x299, instead
-  of 224x224 as in the VGG16 and ResNet models. Also, the input preprocessing
-  function is different (i.e., do not use `imagenet_utils.preprocess_input()`
-  with this model. Use `preprocess_input()` defined in this module instead).
-
-  Arguments:
-      include_top: whether to include the fully-connected
-          layer at the top of the network.
-      weights: one of `None` (random initialization),
-            'imagenet' (pre-training on ImageNet),
-            or the path to the weights file to be loaded.
-      input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
-          to use as image input for the model.
-      input_shape: optional shape tuple, only to be specified
-          if `include_top` is `False` (otherwise the input shape
-          has to be `(299, 299, 3)` (with `'channels_last'` data format)
-          or `(3, 299, 299)` (with `'channels_first'` data format).
-          It should have exactly 3 inputs channels,
-          and width and height should be no smaller than 139.
-          E.g. `(150, 150, 3)` would be one valid value.
-      pooling: Optional pooling mode for feature extraction
-          when `include_top` is `False`.
-          - `None` means that the output of the model will be
-              the 4D tensor output of the last convolutional layer.
-          - `'avg'` means that global average pooling
-              will be applied to the output of the
-              last convolutional layer, and thus
-              the output of the model will be a 2D tensor.
-          - `'max'` means that global max pooling will be applied.
-      classes: optional number of classes to classify images
-          into, only to be specified if `include_top` is `True`, and
-          if no `weights` argument is specified.
-
-  Returns:
-      A Keras `Model` instance.
-
-  Raises:
-      ValueError: in case of invalid argument for `weights`,
-          or invalid input shape.
-  """
-  if not (weights in {'imagenet', None} or os.path.exists(weights)):
-    raise ValueError('The `weights` argument should be either '
-                     '`None` (random initialization), `imagenet` '
-                     '(pre-training on ImageNet), '
-                     'or the path to the weights file to be loaded.')
-
-  if weights == 'imagenet' and include_top and classes != 1000:
-    raise ValueError('If using `weights` as imagenet with `include_top`'
-                     ' as true, `classes` should be 1000')
-
-  # Determine proper input shape
-  input_shape = _obtain_input_shape(
-      input_shape,
-      default_size=299,
-      min_size=139,
-      data_format=K.image_data_format(),
-      require_flatten=False,
-      weights=weights)
-
-  if input_tensor is None:
-    img_input = Input(shape=input_shape)
-  else:
-    if not K.is_keras_tensor(input_tensor):
-      img_input = Input(tensor=input_tensor, shape=input_shape)
-    else:
-      img_input = input_tensor
-
-  # Stem block: 35 x 35 x 192
-  x = conv2d_bn(img_input, 32, 3, strides=2, padding='valid')
-  x = conv2d_bn(x, 32, 3, padding='valid')
-  x = conv2d_bn(x, 64, 3)
-  x = MaxPooling2D(3, strides=2)(x)
-  x = conv2d_bn(x, 80, 1, padding='valid')
-  x = conv2d_bn(x, 192, 3, padding='valid')
-  x = MaxPooling2D(3, strides=2)(x)
-
-  # Mixed 5b (Inception-A block): 35 x 35 x 320
-  branch_0 = conv2d_bn(x, 96, 1)
-  branch_1 = conv2d_bn(x, 48, 1)
-  branch_1 = conv2d_bn(branch_1, 64, 5)
-  branch_2 = conv2d_bn(x, 64, 1)
-  branch_2 = conv2d_bn(branch_2, 96, 3)
-  branch_2 = conv2d_bn(branch_2, 96, 3)
-  branch_pool = AveragePooling2D(3, strides=1, padding='same')(x)
-  branch_pool = conv2d_bn(branch_pool, 64, 1)
-  branches = [branch_0, branch_1, branch_2, branch_pool]
-  channel_axis = 1 if K.image_data_format() == 'channels_first' else 3
-  x = Concatenate(axis=channel_axis, name='mixed_5b')(branches)
-
-  # 10x block35 (Inception-ResNet-A block): 35 x 35 x 320
-  for block_idx in range(1, 11):
-    x = inception_resnet_block(
-        x, scale=0.17, block_type='block35', block_idx=block_idx)
-
-  # Mixed 6a (Reduction-A block): 17 x 17 x 1088
-  branch_0 = conv2d_bn(x, 384, 3, strides=2, padding='valid')
-  branch_1 = conv2d_bn(x, 256, 1)
-  branch_1 = conv2d_bn(branch_1, 256, 3)
-  branch_1 = conv2d_bn(branch_1, 384, 3, strides=2, padding='valid')
-  branch_pool = MaxPooling2D(3, strides=2, padding='valid')(x)
-  branches = [branch_0, branch_1, branch_pool]
-  x = Concatenate(axis=channel_axis, name='mixed_6a')(branches)
-
-  # 20x block17 (Inception-ResNet-B block): 17 x 17 x 1088
-  for block_idx in range(1, 21):
-    x = inception_resnet_block(
-        x, scale=0.1, block_type='block17', block_idx=block_idx)
-
-  # Mixed 7a (Reduction-B block): 8 x 8 x 2080
-  branch_0 = conv2d_bn(x, 256, 1)
-  branch_0 = conv2d_bn(branch_0, 384, 3, strides=2, padding='valid')
-  branch_1 = conv2d_bn(x, 256, 1)
-  branch_1 = conv2d_bn(branch_1, 288, 3, strides=2, padding='valid')
-  branch_2 = conv2d_bn(x, 256, 1)
-  branch_2 = conv2d_bn(branch_2, 288, 3)
-  branch_2 = conv2d_bn(branch_2, 320, 3, strides=2, padding='valid')
-  branch_pool = MaxPooling2D(3, strides=2, padding='valid')(x)
-  branches = [branch_0, branch_1, branch_2, branch_pool]
-  x = Concatenate(axis=channel_axis, name='mixed_7a')(branches)
-
-  # 10x block8 (Inception-ResNet-C block): 8 x 8 x 2080
-  for block_idx in range(1, 10):
-    x = inception_resnet_block(
-        x, scale=0.2, block_type='block8', block_idx=block_idx)
-  x = inception_resnet_block(
-      x, scale=1., activation=None, block_type='block8', block_idx=10)
-
-  # Final convolution block: 8 x 8 x 1536
-  x = conv2d_bn(x, 1536, 1, name='conv_7b')
-
-  if include_top:
-    # Classification block
-    x = GlobalAveragePooling2D(name='avg_pool')(x)
-    x = Dense(classes, activation='softmax', name='predictions')(x)
-  else:
-    if pooling == 'avg':
-      x = GlobalAveragePooling2D()(x)
-    elif pooling == 'max':
-      x = GlobalMaxPooling2D()(x)
-
-  # Ensure that the model takes into account
-  # any potential predecessors of `input_tensor`
-  if input_tensor is not None:
-    inputs = layer_utils.get_source_inputs(input_tensor)
-  else:
-    inputs = img_input
-
-  # Create model
-  model = Model(inputs, x, name='inception_resnet_v2')
-
-  # Load weights
-  if weights == 'imagenet':
-    if include_top:
-      fname = 'inception_resnet_v2_weights_tf_dim_ordering_tf_kernels.h5'
-      weights_path = get_file(
-          fname,
-          BASE_WEIGHT_URL + fname,
-          cache_subdir='models',
-          file_hash='e693bd0210a403b3192acc6073ad2e96')
-    else:
-      fname = 'inception_resnet_v2_weights_tf_dim_ordering_tf_kernels_notop.h5'
-      weights_path = get_file(
-          fname,
-          BASE_WEIGHT_URL + fname,
-          cache_subdir='models',
-          file_hash='d19885ff4a710c122648d3b5c3b684e4')
-    model.load_weights(weights_path)
-  elif weights is not None:
-    model.load_weights(weights)
-
-  return model
+tf_export('keras.applications.inception_resnet_v2.InceptionResNetV2',
+          'keras.applications.InceptionResNetV2')(InceptionResNetV2)
+tf_export(
+    'keras.applications.inception_resnet_v2.preprocess_input')(preprocess_input)
diff --git a/tensorflow/python/keras/applications/inception_resnet_v2_test.py b/tensorflow/python/keras/applications/inception_resnet_v2_test.py
deleted file mode 100644
index 0a12f88..0000000
--- a/tensorflow/python/keras/applications/inception_resnet_v2_test.py
+++ /dev/null
@@ -1,59 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for Inception V3 application."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.python import keras
-from tensorflow.python.platform import test
-
-
-class InceptionResNetV2Test(test.TestCase):
-
-  def test_with_top(self):
-    model = keras.applications.InceptionResNetV2(weights=None)
-    self.assertEqual(model.output_shape, (None, 1000))
-
-  def test_no_top(self):
-    model = keras.applications.InceptionResNetV2(weights=None,
-                                                 include_top=False)
-    self.assertEqual(model.output_shape, (None, None, None, 1536))
-
-  def test_with_pooling(self):
-    model = keras.applications.InceptionResNetV2(weights=None,
-                                                 include_top=False,
-                                                 pooling='avg')
-    self.assertEqual(model.output_shape, (None, 1536))
-
-  def test_weight_loading(self):
-    with self.assertRaises(ValueError):
-      keras.applications.InceptionResNetV2(weights='unknown',
-                                           include_top=False)
-    with self.assertRaises(ValueError):
-      keras.applications.InceptionResNetV2(weights='imagenet',
-                                           classes=2000)
-
-  def test_preprocess_input(self):
-    x = np.random.uniform(0, 255, (2, 300, 200, 3))
-    out1 = keras.applications.inception_resnet_v2.preprocess_input(x)
-    self.assertAllClose(np.mean(out1), 0., atol=0.1)
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/python/keras/applications/inception_v3.py b/tensorflow/python/keras/applications/inception_v3.py
index b5e28c7..8753408 100644
--- a/tensorflow/python/keras/applications/inception_v3.py
+++ b/tensorflow/python/keras/applications/inception_v3.py
@@ -13,404 +13,19 @@
 # limitations under the License.
 # ==============================================================================
 # pylint: disable=invalid-name
-# pylint: disable=unused-import
 """Inception V3 model for Keras.
-
-Note that the input image format for this model is different than for
-the VGG16 and ResNet models (299x299 instead of 224x224),
-and that the input preprocessing function is also different (same as Xception).
-
-# Reference
-
-- [Rethinking the Inception Architecture for Computer
-Vision](http://arxiv.org/abs/1512.00567)
-
 """
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
-
-from tensorflow.python.keras import backend as K
-from tensorflow.python.keras import layers
-from tensorflow.python.keras.applications import imagenet_utils
-from tensorflow.python.keras.applications.imagenet_utils import _obtain_input_shape
-from tensorflow.python.keras.applications.imagenet_utils import decode_predictions
-from tensorflow.python.keras.layers import Activation
-from tensorflow.python.keras.layers import AveragePooling2D
-from tensorflow.python.keras.layers import BatchNormalization
-from tensorflow.python.keras.layers import Conv2D
-from tensorflow.python.keras.layers import Dense
-from tensorflow.python.keras.layers import GlobalAveragePooling2D
-from tensorflow.python.keras.layers import GlobalMaxPooling2D
-from tensorflow.python.keras.layers import Input
-from tensorflow.python.keras.layers import MaxPooling2D
-from tensorflow.python.keras.models import Model
-from tensorflow.python.keras.utils import layer_utils
-from tensorflow.python.keras.utils.data_utils import get_file
-from tensorflow.python.platform import tf_logging as logging
+from keras_applications import inception_v3
 from tensorflow.python.util.tf_export import tf_export
 
+InceptionV3 = inception_v3.InceptionV3
+decode_predictions = inception_v3.decode_predictions
+preprocess_input = inception_v3.preprocess_input
 
-WEIGHTS_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.5/inception_v3_weights_tf_dim_ordering_tf_kernels.h5'
-WEIGHTS_PATH_NO_TOP = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.5/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5'
-
-
-def conv2d_bn(x,
-              filters,
-              num_row,
-              num_col,
-              padding='same',
-              strides=(1, 1),
-              name=None):
-  """Utility function to apply conv + BN.
-
-  Arguments:
-      x: input tensor.
-      filters: filters in `Conv2D`.
-      num_row: height of the convolution kernel.
-      num_col: width of the convolution kernel.
-      padding: padding mode in `Conv2D`.
-      strides: strides in `Conv2D`.
-      name: name of the ops; will become `name + '_conv'`
-          for the convolution and `name + '_bn'` for the
-          batch norm layer.
-
-  Returns:
-      Output tensor after applying `Conv2D` and `BatchNormalization`.
-  """
-  if name is not None:
-    bn_name = name + '_bn'
-    conv_name = name + '_conv'
-  else:
-    bn_name = None
-    conv_name = None
-  if K.image_data_format() == 'channels_first':
-    bn_axis = 1
-  else:
-    bn_axis = 3
-  x = Conv2D(
-      filters, (num_row, num_col),
-      strides=strides,
-      padding=padding,
-      use_bias=False,
-      name=conv_name)(
-          x)
-  x = BatchNormalization(axis=bn_axis, scale=False, name=bn_name)(x)
-  x = Activation('relu', name=name)(x)
-  return x
-
-
-@tf_export('keras.applications.InceptionV3',
-           'keras.applications.inception_v3.InceptionV3')
-def InceptionV3(include_top=True,
-                weights='imagenet',
-                input_tensor=None,
-                input_shape=None,
-                pooling=None,
-                classes=1000):
-  """Instantiates the Inception v3 architecture.
-
-  Optionally loads weights pre-trained
-  on ImageNet. Note that when using TensorFlow,
-  for best performance you should set
-  `image_data_format='channels_last'` in your Keras config
-  at ~/.keras/keras.json.
-  The model and the weights are compatible with both
-  TensorFlow and Theano. The data format
-  convention used by the model is the one
-  specified in your Keras config file.
-  Note that the default input image size for this model is 299x299.
-
-  Arguments:
-      include_top: whether to include the fully-connected
-          layer at the top of the network.
-      weights: one of `None` (random initialization),
-            'imagenet' (pre-training on ImageNet),
-            or the path to the weights file to be loaded.
-      input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
-          to use as image input for the model.
-      input_shape: optional shape tuple, only to be specified
-          if `include_top` is False (otherwise the input shape
-          has to be `(299, 299, 3)` (with `channels_last` data format)
-          or `(3, 299, 299)` (with `channels_first` data format).
-          It should have exactly 3 inputs channels,
-          and width and height should be no smaller than 139.
-          E.g. `(150, 150, 3)` would be one valid value.
-      pooling: Optional pooling mode for feature extraction
-          when `include_top` is `False`.
-          - `None` means that the output of the model will be
-              the 4D tensor output of the
-              last convolutional layer.
-          - `avg` means that global average pooling
-              will be applied to the output of the
-              last convolutional layer, and thus
-              the output of the model will be a 2D tensor.
-          - `max` means that global max pooling will
-              be applied.
-      classes: optional number of classes to classify images
-          into, only to be specified if `include_top` is True, and
-          if no `weights` argument is specified.
-
-  Returns:
-      A Keras model instance.
-
-  Raises:
-      ValueError: in case of invalid argument for `weights`,
-          or invalid input shape.
-  """
-  if not (weights in {'imagenet', None} or os.path.exists(weights)):
-    raise ValueError('The `weights` argument should be either '
-                     '`None` (random initialization), `imagenet` '
-                     '(pre-training on ImageNet), '
-                     'or the path to the weights file to be loaded.')
-
-  if weights == 'imagenet' and include_top and classes != 1000:
-    raise ValueError('If using `weights` as imagenet with `include_top`'
-                     ' as true, `classes` should be 1000')
-
-  # Determine proper input shape
-  input_shape = _obtain_input_shape(
-      input_shape,
-      default_size=299,
-      min_size=139,
-      data_format=K.image_data_format(),
-      require_flatten=False,
-      weights=weights)
-
-  if input_tensor is None:
-    img_input = Input(shape=input_shape)
-  else:
-    if not K.is_keras_tensor(input_tensor):
-      img_input = Input(tensor=input_tensor, shape=input_shape)
-    else:
-      img_input = input_tensor
-
-  if K.image_data_format() == 'channels_first':
-    channel_axis = 1
-  else:
-    channel_axis = 3
-
-  x = conv2d_bn(img_input, 32, 3, 3, strides=(2, 2), padding='valid')
-  x = conv2d_bn(x, 32, 3, 3, padding='valid')
-  x = conv2d_bn(x, 64, 3, 3)
-  x = MaxPooling2D((3, 3), strides=(2, 2))(x)
-
-  x = conv2d_bn(x, 80, 1, 1, padding='valid')
-  x = conv2d_bn(x, 192, 3, 3, padding='valid')
-  x = MaxPooling2D((3, 3), strides=(2, 2))(x)
-
-  # mixed 0, 1, 2: 35 x 35 x 256
-  branch1x1 = conv2d_bn(x, 64, 1, 1)
-
-  branch5x5 = conv2d_bn(x, 48, 1, 1)
-  branch5x5 = conv2d_bn(branch5x5, 64, 5, 5)
-
-  branch3x3dbl = conv2d_bn(x, 64, 1, 1)
-  branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3)
-  branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3)
-
-  branch_pool = AveragePooling2D((3, 3), strides=(1, 1), padding='same')(x)
-  branch_pool = conv2d_bn(branch_pool, 32, 1, 1)
-  x = layers.concatenate(
-      [branch1x1, branch5x5, branch3x3dbl, branch_pool],
-      axis=channel_axis,
-      name='mixed0')
-
-  # mixed 1: 35 x 35 x 256
-  branch1x1 = conv2d_bn(x, 64, 1, 1)
-
-  branch5x5 = conv2d_bn(x, 48, 1, 1)
-  branch5x5 = conv2d_bn(branch5x5, 64, 5, 5)
-
-  branch3x3dbl = conv2d_bn(x, 64, 1, 1)
-  branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3)
-  branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3)
-
-  branch_pool = AveragePooling2D((3, 3), strides=(1, 1), padding='same')(x)
-  branch_pool = conv2d_bn(branch_pool, 64, 1, 1)
-  x = layers.concatenate(
-      [branch1x1, branch5x5, branch3x3dbl, branch_pool],
-      axis=channel_axis,
-      name='mixed1')
-
-  # mixed 2: 35 x 35 x 256
-  branch1x1 = conv2d_bn(x, 64, 1, 1)
-
-  branch5x5 = conv2d_bn(x, 48, 1, 1)
-  branch5x5 = conv2d_bn(branch5x5, 64, 5, 5)
-
-  branch3x3dbl = conv2d_bn(x, 64, 1, 1)
-  branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3)
-  branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3)
-
-  branch_pool = AveragePooling2D((3, 3), strides=(1, 1), padding='same')(x)
-  branch_pool = conv2d_bn(branch_pool, 64, 1, 1)
-  x = layers.concatenate(
-      [branch1x1, branch5x5, branch3x3dbl, branch_pool],
-      axis=channel_axis,
-      name='mixed2')
-
-  # mixed 3: 17 x 17 x 768
-  branch3x3 = conv2d_bn(x, 384, 3, 3, strides=(2, 2), padding='valid')
-
-  branch3x3dbl = conv2d_bn(x, 64, 1, 1)
-  branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3)
-  branch3x3dbl = conv2d_bn(
-      branch3x3dbl, 96, 3, 3, strides=(2, 2), padding='valid')
-
-  branch_pool = MaxPooling2D((3, 3), strides=(2, 2))(x)
-  x = layers.concatenate(
-      [branch3x3, branch3x3dbl, branch_pool], axis=channel_axis, name='mixed3')
-
-  # mixed 4: 17 x 17 x 768
-  branch1x1 = conv2d_bn(x, 192, 1, 1)
-
-  branch7x7 = conv2d_bn(x, 128, 1, 1)
-  branch7x7 = conv2d_bn(branch7x7, 128, 1, 7)
-  branch7x7 = conv2d_bn(branch7x7, 192, 7, 1)
-
-  branch7x7dbl = conv2d_bn(x, 128, 1, 1)
-  branch7x7dbl = conv2d_bn(branch7x7dbl, 128, 7, 1)
-  branch7x7dbl = conv2d_bn(branch7x7dbl, 128, 1, 7)
-  branch7x7dbl = conv2d_bn(branch7x7dbl, 128, 7, 1)
-  branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 1, 7)
-
-  branch_pool = AveragePooling2D((3, 3), strides=(1, 1), padding='same')(x)
-  branch_pool = conv2d_bn(branch_pool, 192, 1, 1)
-  x = layers.concatenate(
-      [branch1x1, branch7x7, branch7x7dbl, branch_pool],
-      axis=channel_axis,
-      name='mixed4')
-
-  # mixed 5, 6: 17 x 17 x 768
-  for i in range(2):
-    branch1x1 = conv2d_bn(x, 192, 1, 1)
-
-    branch7x7 = conv2d_bn(x, 160, 1, 1)
-    branch7x7 = conv2d_bn(branch7x7, 160, 1, 7)
-    branch7x7 = conv2d_bn(branch7x7, 192, 7, 1)
-
-    branch7x7dbl = conv2d_bn(x, 160, 1, 1)
-    branch7x7dbl = conv2d_bn(branch7x7dbl, 160, 7, 1)
-    branch7x7dbl = conv2d_bn(branch7x7dbl, 160, 1, 7)
-    branch7x7dbl = conv2d_bn(branch7x7dbl, 160, 7, 1)
-    branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 1, 7)
-
-    branch_pool = AveragePooling2D((3, 3), strides=(1, 1), padding='same')(x)
-    branch_pool = conv2d_bn(branch_pool, 192, 1, 1)
-    x = layers.concatenate(
-        [branch1x1, branch7x7, branch7x7dbl, branch_pool],
-        axis=channel_axis,
-        name='mixed' + str(5 + i))
-
-  # mixed 7: 17 x 17 x 768
-  branch1x1 = conv2d_bn(x, 192, 1, 1)
-
-  branch7x7 = conv2d_bn(x, 192, 1, 1)
-  branch7x7 = conv2d_bn(branch7x7, 192, 1, 7)
-  branch7x7 = conv2d_bn(branch7x7, 192, 7, 1)
-
-  branch7x7dbl = conv2d_bn(x, 192, 1, 1)
-  branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 7, 1)
-  branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 1, 7)
-  branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 7, 1)
-  branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 1, 7)
-
-  branch_pool = AveragePooling2D((3, 3), strides=(1, 1), padding='same')(x)
-  branch_pool = conv2d_bn(branch_pool, 192, 1, 1)
-  x = layers.concatenate(
-      [branch1x1, branch7x7, branch7x7dbl, branch_pool],
-      axis=channel_axis,
-      name='mixed7')
-
-  # mixed 8: 8 x 8 x 1280
-  branch3x3 = conv2d_bn(x, 192, 1, 1)
-  branch3x3 = conv2d_bn(branch3x3, 320, 3, 3, strides=(2, 2), padding='valid')
-
-  branch7x7x3 = conv2d_bn(x, 192, 1, 1)
-  branch7x7x3 = conv2d_bn(branch7x7x3, 192, 1, 7)
-  branch7x7x3 = conv2d_bn(branch7x7x3, 192, 7, 1)
-  branch7x7x3 = conv2d_bn(
-      branch7x7x3, 192, 3, 3, strides=(2, 2), padding='valid')
-
-  branch_pool = MaxPooling2D((3, 3), strides=(2, 2))(x)
-  x = layers.concatenate(
-      [branch3x3, branch7x7x3, branch_pool], axis=channel_axis, name='mixed8')
-
-  # mixed 9: 8 x 8 x 2048
-  for i in range(2):
-    branch1x1 = conv2d_bn(x, 320, 1, 1)
-
-    branch3x3 = conv2d_bn(x, 384, 1, 1)
-    branch3x3_1 = conv2d_bn(branch3x3, 384, 1, 3)
-    branch3x3_2 = conv2d_bn(branch3x3, 384, 3, 1)
-    branch3x3 = layers.concatenate(
-        [branch3x3_1, branch3x3_2], axis=channel_axis, name='mixed9_' + str(i))
-
-    branch3x3dbl = conv2d_bn(x, 448, 1, 1)
-    branch3x3dbl = conv2d_bn(branch3x3dbl, 384, 3, 3)
-    branch3x3dbl_1 = conv2d_bn(branch3x3dbl, 384, 1, 3)
-    branch3x3dbl_2 = conv2d_bn(branch3x3dbl, 384, 3, 1)
-    branch3x3dbl = layers.concatenate(
-        [branch3x3dbl_1, branch3x3dbl_2], axis=channel_axis)
-
-    branch_pool = AveragePooling2D((3, 3), strides=(1, 1), padding='same')(x)
-    branch_pool = conv2d_bn(branch_pool, 192, 1, 1)
-    x = layers.concatenate(
-        [branch1x1, branch3x3, branch3x3dbl, branch_pool],
-        axis=channel_axis,
-        name='mixed' + str(9 + i))
-  if include_top:
-    # Classification block
-    x = GlobalAveragePooling2D(name='avg_pool')(x)
-    x = Dense(classes, activation='softmax', name='predictions')(x)
-  else:
-    if pooling == 'avg':
-      x = GlobalAveragePooling2D()(x)
-    elif pooling == 'max':
-      x = GlobalMaxPooling2D()(x)
-
-  # Ensure that the model takes into account
-  # any potential predecessors of `input_tensor`.
-  if input_tensor is not None:
-    inputs = layer_utils.get_source_inputs(input_tensor)
-  else:
-    inputs = img_input
-  # Create model.
-  model = Model(inputs, x, name='inception_v3')
-
-  # load weights
-  if weights == 'imagenet':
-    if include_top:
-      weights_path = get_file(
-          'inception_v3_weights_tf_dim_ordering_tf_kernels.h5',
-          WEIGHTS_PATH,
-          cache_subdir='models',
-          file_hash='9a0d58056eeedaa3f26cb7ebd46da564')
-    else:
-      weights_path = get_file(
-          'inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5',
-          WEIGHTS_PATH_NO_TOP,
-          cache_subdir='models',
-          file_hash='bcbd6486424b2319ff4ef7d526e38f63')
-    model.load_weights(weights_path)
-  elif weights is not None:
-    model.load_weights(weights)
-
-  return model
-
-
-@tf_export('keras.applications.nasnet.preprocess_input',
-           'keras.applications.inception_v3.preprocess_input')
-def preprocess_input(x):
-  """Preprocesses a numpy array encoding a batch of images.
-
-  Arguments:
-      x: a 4D numpy array consists of RGB values within [0, 255].
-
-  Returns:
-      Preprocessed array.
-  """
-  return imagenet_utils.preprocess_input(x, mode='tf')
+tf_export('keras.applications.inception_v3.InceptionV3',
+          'keras.applications.InceptionV3')(InceptionV3)
+tf_export('keras.applications.inception_v3.preprocess_input')(preprocess_input)
diff --git a/tensorflow/python/keras/applications/inception_v3_test.py b/tensorflow/python/keras/applications/inception_v3_test.py
deleted file mode 100644
index a3fcdd5..0000000
--- a/tensorflow/python/keras/applications/inception_v3_test.py
+++ /dev/null
@@ -1,58 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for Inception V3 application."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.python import keras
-from tensorflow.python.platform import test
-
-
-class InceptionV3Test(test.TestCase):
-
-  def test_with_top(self):
-    model = keras.applications.InceptionV3(weights=None)
-    self.assertEqual(model.output_shape, (None, 1000))
-
-  def test_no_top(self):
-    model = keras.applications.InceptionV3(weights=None, include_top=False)
-    self.assertEqual(model.output_shape, (None, None, None, 2048))
-
-  def test_with_pooling(self):
-    model = keras.applications.InceptionV3(weights=None,
-                                           include_top=False,
-                                           pooling='avg')
-    self.assertEqual(model.output_shape, (None, 2048))
-
-  def test_weight_loading(self):
-    with self.assertRaises(ValueError):
-      keras.applications.InceptionV3(weights='unknown',
-                                     include_top=False)
-    with self.assertRaises(ValueError):
-      keras.applications.InceptionV3(weights='imagenet',
-                                     classes=2000)
-
-  def test_preprocess_input(self):
-    x = np.random.uniform(0, 255, (2, 300, 200, 3))
-    out1 = keras.applications.inception_v3.preprocess_input(x)
-    self.assertAllClose(np.mean(out1), 0., atol=0.1)
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/python/keras/applications/mobilenet.py b/tensorflow/python/keras/applications/mobilenet.py
index 7285e03..3528f02 100644
--- a/tensorflow/python/keras/applications/mobilenet.py
+++ b/tensorflow/python/keras/applications/mobilenet.py
@@ -13,466 +13,19 @@
 # limitations under the License.
 # ==============================================================================
 # pylint: disable=invalid-name
-# pylint: disable=unused-import
 """MobileNet v1 models for Keras.
-
-MobileNet is a general architecture and can be used for multiple use cases.
-Depending on the use case, it can use different input layer size and
-different width factors. This allows different width models to reduce
-the number of multiply-adds and thereby
-reduce inference cost on mobile devices.
-
-MobileNets support any input size greater than 32 x 32, with larger image sizes
-offering better performance.
-The number of parameters and number of multiply-adds
-can be modified by using the `alpha` parameter,
-which increases/decreases the number of filters in each layer.
-By altering the image size and `alpha` parameter,
-all 16 models from the paper can be built, with ImageNet weights provided.
-
-The paper demonstrates the performance of MobileNets using `alpha` values of
-1.0 (also called 100 % MobileNet), 0.75, 0.5 and 0.25.
-For each of these `alpha` values, weights for 4 different input image sizes
-are provided (224, 192, 160, 128).
-
-The following table describes the size and accuracy of the 100% MobileNet
-on size 224 x 224:
-----------------------------------------------------------------------------
-Width Multiplier (alpha) | ImageNet Acc |  Multiply-Adds (M) |  Params (M)
-----------------------------------------------------------------------------
-|   1.0 MobileNet-224    |    70.6 %     |        529        |     4.2     |
-|   0.75 MobileNet-224   |    68.4 %     |        325        |     2.6     |
-|   0.50 MobileNet-224   |    63.7 %     |        149        |     1.3     |
-|   0.25 MobileNet-224   |    50.6 %     |        41         |     0.5     |
-----------------------------------------------------------------------------
-
-The following table describes the performance of
-the 100 % MobileNet on various input sizes:
-------------------------------------------------------------------------
-      Resolution      | ImageNet Acc | Multiply-Adds (M) | Params (M)
-------------------------------------------------------------------------
-|  1.0 MobileNet-224  |    70.6 %    |        529        |     4.2     |
-|  1.0 MobileNet-192  |    69.1 %    |        529        |     4.2     |
-|  1.0 MobileNet-160  |    67.2 %    |        529        |     4.2     |
-|  1.0 MobileNet-128  |    64.4 %    |        529        |     4.2     |
-------------------------------------------------------------------------
-
-The weights for all 16 models are obtained and translated
-from TensorFlow checkpoints found at
-https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet_v1.md
-
-# Reference
-- [MobileNets: Efficient Convolutional Neural Networks for
-   Mobile Vision Applications](https://arxiv.org/pdf/1704.04861.pdf))
 """
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
-
-from tensorflow.python.keras import backend as K
-from tensorflow.python.keras.applications import imagenet_utils
-from tensorflow.python.keras.applications.imagenet_utils import _obtain_input_shape
-from tensorflow.python.keras.applications.imagenet_utils import decode_predictions
-from tensorflow.python.keras.layers import Activation
-from tensorflow.python.keras.layers import BatchNormalization
-from tensorflow.python.keras.layers import Conv2D
-from tensorflow.python.keras.layers import DepthwiseConv2D
-from tensorflow.python.keras.layers import Dropout
-from tensorflow.python.keras.layers import GlobalAveragePooling2D
-from tensorflow.python.keras.layers import GlobalMaxPooling2D
-from tensorflow.python.keras.layers import Input
-from tensorflow.python.keras.layers import ReLU
-from tensorflow.python.keras.layers import Reshape
-from tensorflow.python.keras.layers import ZeroPadding2D
-from tensorflow.python.keras.models import Model
-from tensorflow.python.keras.utils import layer_utils
-from tensorflow.python.keras.utils.data_utils import get_file
-from tensorflow.python.platform import tf_logging as logging
+from keras_applications import mobilenet
 from tensorflow.python.util.tf_export import tf_export
 
+MobileNet = mobilenet.MobileNet
+decode_predictions = mobilenet.decode_predictions
+preprocess_input = mobilenet.preprocess_input
 
-BASE_WEIGHT_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.6/'
-
-
-@tf_export('keras.applications.mobilenet.preprocess_input')
-def preprocess_input(x):
-  """Preprocesses a numpy array encoding a batch of images.
-
-  Arguments:
-      x: a 4D numpy array consists of RGB values within [0, 255].
-
-  Returns:
-      Preprocessed array.
-  """
-  return imagenet_utils.preprocess_input(x, mode='tf')
-
-
-@tf_export('keras.applications.MobileNet',
-           'keras.applications.mobilenet.MobileNet')
-def MobileNet(input_shape=None,
-              alpha=1.0,
-              depth_multiplier=1,
-              dropout=1e-3,
-              include_top=True,
-              weights='imagenet',
-              input_tensor=None,
-              pooling=None,
-              classes=1000):
-  """Instantiates the MobileNet architecture.
-
-  Arguments:
-      input_shape: optional shape tuple, only to be specified
-          if `include_top` is False (otherwise the input shape
-          has to be `(224, 224, 3)` (with `channels_last` data format)
-          or (3, 224, 224) (with `channels_first` data format).
-          It should have exactly 3 inputs channels,
-          and width and height should be no smaller than 32.
-          E.g. `(200, 200, 3)` would be one valid value.
-      alpha: controls the width of the network.
-          - If `alpha` < 1.0, proportionally decreases the number
-              of filters in each layer.
-          - If `alpha` > 1.0, proportionally increases the number
-              of filters in each layer.
-          - If `alpha` = 1, default number of filters from the paper
-               are used at each layer.
-      depth_multiplier: depth multiplier for depthwise convolution
-          (also called the resolution multiplier)
-      dropout: dropout rate
-      include_top: whether to include the fully-connected
-          layer at the top of the network.
-      weights: one of `None` (random initialization),
-            'imagenet' (pre-training on ImageNet),
-            or the path to the weights file to be loaded.
-      input_tensor: optional Keras tensor (i.e. output of
-          `layers.Input()`)
-          to use as image input for the model.
-      pooling: Optional pooling mode for feature extraction
-          when `include_top` is `False`.
-          - `None` means that the output of the model
-              will be the 4D tensor output of the
-              last convolutional layer.
-          - `avg` means that global average pooling
-              will be applied to the output of the
-              last convolutional layer, and thus
-              the output of the model will be a
-              2D tensor.
-          - `max` means that global max pooling will
-              be applied.
-      classes: optional number of classes to classify images
-          into, only to be specified if `include_top` is True, and
-          if no `weights` argument is specified.
-
-  Returns:
-      A Keras model instance.
-
-  Raises:
-      ValueError: in case of invalid argument for `weights`,
-          or invalid input shape.
-      RuntimeError: If attempting to run this model with a
-          backend that does not support separable convolutions.
-  """
-
-  if not (weights in {'imagenet', None} or os.path.exists(weights)):
-    raise ValueError('The `weights` argument should be either '
-                     '`None` (random initialization), `imagenet` '
-                     '(pre-training on ImageNet), '
-                     'or the path to the weights file to be loaded.')
-
-  if weights == 'imagenet' and include_top and classes != 1000:
-    raise ValueError('If using `weights` as ImageNet with `include_top` '
-                     'as true, `classes` should be 1000')
-
-  # Determine proper input shape and default size.
-  if input_shape is None:
-    default_size = 224
-  else:
-    if K.image_data_format() == 'channels_first':
-      rows = input_shape[1]
-      cols = input_shape[2]
-    else:
-      rows = input_shape[0]
-      cols = input_shape[1]
-
-    if rows == cols and rows in [128, 160, 192, 224]:
-      default_size = rows
-    else:
-      default_size = 224
-
-  input_shape = _obtain_input_shape(
-      input_shape,
-      default_size=default_size,
-      min_size=32,
-      data_format=K.image_data_format(),
-      require_flatten=include_top,
-      weights=weights)
-
-  if K.image_data_format() == 'channels_last':
-    row_axis, col_axis = (0, 1)
-  else:
-    row_axis, col_axis = (1, 2)
-  rows = input_shape[row_axis]
-  cols = input_shape[col_axis]
-
-  if weights == 'imagenet':
-    if depth_multiplier != 1:
-      raise ValueError('If imagenet weights are being loaded, '
-                       'depth multiplier must be 1')
-
-    if alpha not in [0.25, 0.50, 0.75, 1.0]:
-      raise ValueError('If imagenet weights are being loaded, '
-                       'alpha can be one of'
-                       '`0.25`, `0.50`, `0.75` or `1.0` only.')
-
-    if rows != cols or rows not in [128, 160, 192, 224]:
-      if rows is None:
-        rows = 224
-        logging.warning('MobileNet shape is undefined.'
-                        ' Weights for input shape (224, 224) will be loaded.')
-      else:
-        raise ValueError('If imagenet weights are being loaded, '
-                         'input must have a static square shape (one of '
-                         '(128, 128), (160, 160), (192, 192), or (224, 224)).'
-                         ' Input shape provided = %s' % (input_shape,))
-
-  if K.image_data_format() != 'channels_last':
-    logging.warning('The MobileNet family of models is only available '
-                    'for the input data format "channels_last" '
-                    '(width, height, channels). '
-                    'However your settings specify the default '
-                    'data format "channels_first" (channels, width, height).'
-                    ' You should set `image_data_format="channels_last"` '
-                    'in your Keras config located at ~/.keras/keras.json. '
-                    'The model being returned right now will expect inputs '
-                    'to follow the "channels_last" data format.')
-    K.set_image_data_format('channels_last')
-    old_data_format = 'channels_first'
-  else:
-    old_data_format = None
-
-  if input_tensor is None:
-    img_input = Input(shape=input_shape)
-  else:
-    if not K.is_keras_tensor(input_tensor):
-      img_input = Input(tensor=input_tensor, shape=input_shape)
-    else:
-      img_input = input_tensor
-
-  x = _conv_block(img_input, 32, alpha, strides=(2, 2))
-  x = _depthwise_conv_block(x, 64, alpha, depth_multiplier, block_id=1)
-
-  x = _depthwise_conv_block(
-      x, 128, alpha, depth_multiplier, strides=(2, 2), block_id=2)
-  x = _depthwise_conv_block(x, 128, alpha, depth_multiplier, block_id=3)
-
-  x = _depthwise_conv_block(
-      x, 256, alpha, depth_multiplier, strides=(2, 2), block_id=4)
-  x = _depthwise_conv_block(x, 256, alpha, depth_multiplier, block_id=5)
-
-  x = _depthwise_conv_block(
-      x, 512, alpha, depth_multiplier, strides=(2, 2), block_id=6)
-  x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id=7)
-  x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id=8)
-  x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id=9)
-  x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id=10)
-  x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id=11)
-
-  x = _depthwise_conv_block(
-      x, 1024, alpha, depth_multiplier, strides=(2, 2), block_id=12)
-  x = _depthwise_conv_block(x, 1024, alpha, depth_multiplier, block_id=13)
-
-  if include_top:
-    if K.image_data_format() == 'channels_first':
-      shape = (int(1024 * alpha), 1, 1)
-    else:
-      shape = (1, 1, int(1024 * alpha))
-
-    x = GlobalAveragePooling2D()(x)
-    x = Reshape(shape, name='reshape_1')(x)
-    x = Dropout(dropout, name='dropout')(x)
-    x = Conv2D(classes, (1, 1), padding='same', name='conv_preds')(x)
-    x = Activation('softmax', name='act_softmax')(x)
-    x = Reshape((classes,), name='reshape_2')(x)
-  else:
-    if pooling == 'avg':
-      x = GlobalAveragePooling2D()(x)
-    elif pooling == 'max':
-      x = GlobalMaxPooling2D()(x)
-
-  # Ensure that the model takes into account
-  # any potential predecessors of `input_tensor`.
-  if input_tensor is not None:
-    inputs = layer_utils.get_source_inputs(input_tensor)
-  else:
-    inputs = img_input
-
-  # Create model.
-  model = Model(inputs, x, name='mobilenet_%0.2f_%s' % (alpha, rows))
-
-  # load weights
-  if weights == 'imagenet':
-    if K.image_data_format() == 'channels_first':
-      raise ValueError('Weights for "channels_first" format '
-                       'are not available.')
-    if alpha == 1.0:
-      alpha_text = '1_0'
-    elif alpha == 0.75:
-      alpha_text = '7_5'
-    elif alpha == 0.50:
-      alpha_text = '5_0'
-    else:
-      alpha_text = '2_5'
-
-    if include_top:
-      model_name = 'mobilenet_%s_%d_tf.h5' % (alpha_text, rows)
-      weigh_path = BASE_WEIGHT_PATH + model_name
-      weights_path = get_file(model_name, weigh_path, cache_subdir='models')
-    else:
-      model_name = 'mobilenet_%s_%d_tf_no_top.h5' % (alpha_text, rows)
-      weigh_path = BASE_WEIGHT_PATH + model_name
-      weights_path = get_file(model_name, weigh_path, cache_subdir='models')
-    model.load_weights(weights_path)
-  elif weights is not None:
-    model.load_weights(weights)
-
-  if old_data_format:
-    K.set_image_data_format(old_data_format)
-  return model
-
-
-def _conv_block(inputs, filters, alpha, kernel=(3, 3), strides=(1, 1)):
-  """Adds an initial convolution layer (with batch normalization and relu6).
-
-  Arguments:
-      inputs: Input tensor of shape `(rows, cols, 3)`
-          (with `channels_last` data format) or
-          (3, rows, cols) (with `channels_first` data format).
-          It should have exactly 3 inputs channels,
-          and width and height should be no smaller than 32.
-          E.g. `(224, 224, 3)` would be one valid value.
-      filters: Integer, the dimensionality of the output space
-          (i.e. the number of output filters in the convolution).
-      alpha: controls the width of the network.
-          - If `alpha` < 1.0, proportionally decreases the number
-              of filters in each layer.
-          - If `alpha` > 1.0, proportionally increases the number
-              of filters in each layer.
-          - If `alpha` = 1, default number of filters from the paper
-               are used at each layer.
-      kernel: An integer or tuple/list of 2 integers, specifying the
-          width and height of the 2D convolution window.
-          Can be a single integer to specify the same value for
-          all spatial dimensions.
-      strides: An integer or tuple/list of 2 integers,
-          specifying the strides of the convolution along the width and height.
-          Can be a single integer to specify the same value for
-          all spatial dimensions.
-          Specifying any stride value != 1 is incompatible with specifying
-          any `dilation_rate` value != 1.
-
-  Input shape:
-      4D tensor with shape:
-      `(samples, channels, rows, cols)` if data_format='channels_first'
-      or 4D tensor with shape:
-      `(samples, rows, cols, channels)` if data_format='channels_last'.
-
-  Output shape:
-      4D tensor with shape:
-      `(samples, filters, new_rows, new_cols)` if data_format='channels_first'
-      or 4D tensor with shape:
-      `(samples, new_rows, new_cols, filters)` if data_format='channels_last'.
-      `rows` and `cols` values might have changed due to stride.
-
-  Returns:
-      Output tensor of block.
-  """
-  channel_axis = 1 if K.image_data_format() == 'channels_first' else -1
-  filters = int(filters * alpha)
-  x = ZeroPadding2D(padding=(1, 1), name='conv1_pad')(inputs)
-  x = Conv2D(
-      filters,
-      kernel,
-      padding='valid',
-      use_bias=False,
-      strides=strides,
-      name='conv1')(x)
-  x = BatchNormalization(axis=channel_axis, name='conv1_bn')(x)
-  return ReLU(6, name='conv1_relu')(x)
-
-
-def _depthwise_conv_block(inputs,
-                          pointwise_conv_filters,
-                          alpha,
-                          depth_multiplier=1,
-                          strides=(1, 1),
-                          block_id=1):
-  """Adds a depthwise convolution block.
-
-  A depthwise convolution block consists of a depthwise conv,
-  batch normalization, relu6, pointwise convolution,
-  batch normalization and relu6 activation.
-
-  Arguments:
-      inputs: Input tensor of shape `(rows, cols, channels)`
-          (with `channels_last` data format) or
-          (channels, rows, cols) (with `channels_first` data format).
-      pointwise_conv_filters: Integer, the dimensionality of the output space
-          (i.e. the number of output filters in the pointwise convolution).
-      alpha: controls the width of the network.
-          - If `alpha` < 1.0, proportionally decreases the number
-              of filters in each layer.
-          - If `alpha` > 1.0, proportionally increases the number
-              of filters in each layer.
-          - If `alpha` = 1, default number of filters from the paper
-               are used at each layer.
-      depth_multiplier: The number of depthwise convolution output channels
-          for each input channel.
-          The total number of depthwise convolution output
-          channels will be equal to `filters_in * depth_multiplier`.
-      strides: An integer or tuple/list of 2 integers,
-          specifying the strides of the convolution along the width and height.
-          Can be a single integer to specify the same value for
-          all spatial dimensions.
-          Specifying any stride value != 1 is incompatible with specifying
-          any `dilation_rate` value != 1.
-      block_id: Integer, a unique identification designating the block number.
-
-  Input shape:
-      4D tensor with shape:
-      `(batch, channels, rows, cols)` if data_format='channels_first'
-      or 4D tensor with shape:
-      `(batch, rows, cols, channels)` if data_format='channels_last'.
-
-  Output shape:
-      4D tensor with shape:
-      `(batch, filters, new_rows, new_cols)` if data_format='channels_first'
-      or 4D tensor with shape:
-      `(batch, new_rows, new_cols, filters)` if data_format='channels_last'.
-      `rows` and `cols` values might have changed due to stride.
-
-  Returns:
-      Output tensor of block.
-  """
-  channel_axis = 1 if K.image_data_format() == 'channels_first' else -1
-  pointwise_conv_filters = int(pointwise_conv_filters * alpha)
-  x = ZeroPadding2D(padding=(1, 1), name='conv_pad_%d' % block_id)(inputs)
-  x = DepthwiseConv2D(  # pylint: disable=not-callable
-      (3, 3),
-      padding='valid',
-      depth_multiplier=depth_multiplier,
-      strides=strides,
-      use_bias=False,
-      name='conv_dw_%d' % block_id)(x)
-  x = BatchNormalization(axis=channel_axis, name='conv_dw_%d_bn' % block_id)(x)
-  x = ReLU(6, name='conv_dw_%d_relu' % block_id)(x)
-
-  x = Conv2D(
-      pointwise_conv_filters, (1, 1),
-      padding='same',
-      use_bias=False,
-      strides=(1, 1),
-      name='conv_pw_%d' % block_id)(
-          x)
-  x = BatchNormalization(axis=channel_axis, name='conv_pw_%d_bn' % block_id)(x)
-  return ReLU(6, name='conv_pw_%d_relu' % block_id)(x)
+tf_export('keras.applications.mobilenet.MobileNet',
+          'keras.applications.MobileNet')(MobileNet)
+tf_export('keras.applications.mobilenet.preprocess_input')(preprocess_input)
diff --git a/tensorflow/python/keras/applications/mobilenet_test.py b/tensorflow/python/keras/applications/mobilenet_test.py
deleted file mode 100644
index 5661ed7..0000000
--- a/tensorflow/python/keras/applications/mobilenet_test.py
+++ /dev/null
@@ -1,101 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for MobileNet application."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.python import keras
-from tensorflow.python.platform import test
-
-
-class MobileNetTest(test.TestCase):
-
-  def test_with_top(self):
-    model = keras.applications.MobileNet(weights=None)
-    self.assertEqual(model.output_shape, (None, 1000))
-
-  def test_no_top(self):
-    model = keras.applications.MobileNet(weights=None, include_top=False)
-    self.assertEqual(model.output_shape, (None, None, None, 1024))
-
-  def test_with_pooling(self):
-    model = keras.applications.MobileNet(weights=None,
-                                         include_top=False,
-                                         pooling='avg')
-    self.assertEqual(model.output_shape, (None, 1024))
-
-  def test_weight_loading(self):
-    with self.assertRaises(ValueError):
-      keras.applications.MobileNet(weights='unknown',
-                                   include_top=False)
-    with self.assertRaises(ValueError):
-      keras.applications.MobileNet(weights='imagenet',
-                                   classes=2000)
-
-  def test_preprocess_input(self):
-    x = np.random.uniform(0, 255, (2, 300, 200, 3))
-    out1 = keras.applications.mobilenet.preprocess_input(x)
-    self.assertAllClose(np.mean(out1), 0., atol=0.1)
-
-  def test_invalid_use_cases(self):
-    keras.backend.set_image_data_format('channels_first')
-    model = keras.applications.MobileNet(weights=None)
-    self.assertEqual(model.output_shape, (None, 1000))
-    keras.backend.set_image_data_format('channels_last')
-
-  def test_mobilenet_variable_input_channels(self):
-    input_shape = (None, None, 1)
-    model = keras.applications.MobileNet(weights=None,
-                                         include_top=False,
-                                         input_shape=input_shape)
-    self.assertEqual(model.output_shape, (None, None, None, 1024))
-
-    input_shape = (None, None, 4)
-    model = keras.applications.MobileNet(weights=None,
-                                         include_top=False,
-                                         input_shape=input_shape)
-    self.assertEqual(model.output_shape, (None, None, None, 1024))
-
-  def test_mobilenet_image_size(self):
-    with self.test_session():
-      valid_image_sizes = [128, 160, 192, 224]
-      for size in valid_image_sizes:
-        keras.backend.set_image_data_format('channels_last')
-        input_shape = (size, size, 3)
-        model = keras.applications.MobileNet(input_shape=input_shape,
-                                             weights=None,
-                                             include_top=True)
-        self.assertEqual(model.input_shape, (None,) + input_shape)
-
-        keras.backend.set_image_data_format('channels_first')
-        input_shape = (3, size, size)
-        model = keras.applications.MobileNet(input_shape=input_shape,
-                                             weights=None,
-                                             include_top=True)
-        self.assertEqual(model.input_shape, (None,) + input_shape)
-
-      keras.backend.set_image_data_format('channels_last')
-      invalid_image_shape = (112, 112, 3)
-      with self.assertRaises(ValueError):
-        model = keras.applications.MobileNet(input_shape=invalid_image_shape,
-                                             weights='imagenet',
-                                             include_top=True)
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/lite/build_rpi_lib.sh b/tensorflow/python/keras/applications/mobilenet_v2.py
old mode 100755
new mode 100644
similarity index 65%
copy from tensorflow/contrib/lite/build_rpi_lib.sh
copy to tensorflow/python/keras/applications/mobilenet_v2.py
index 3824b16..9194c3e
--- a/tensorflow/contrib/lite/build_rpi_lib.sh
+++ b/tensorflow/python/keras/applications/mobilenet_v2.py
@@ -1,5 +1,4 @@
-#!/bin/bash -x
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,10 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+# pylint: disable=invalid-name
+"""MobileNet v2 models for Keras.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
 
-set -e
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-cd "$SCRIPT_DIR/../../.."
-
-CC_PREFIX=arm-linux-gnueabihf- make -j 3 -f tensorflow/contrib/lite/Makefile TARGET=RPI TARGET_ARCH=armv7
+# TODO(fchollet): export MobileNetV2 as part of the public API in next version.
diff --git a/tensorflow/python/keras/applications/nasnet.py b/tensorflow/python/keras/applications/nasnet.py
index ff79b3a..26ff5db 100644
--- a/tensorflow/python/keras/applications/nasnet.py
+++ b/tensorflow/python/keras/applications/nasnet.py
@@ -12,784 +12,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-# pylint: disable=line-too-long
 # pylint: disable=invalid-name
-# pylint: disable=unused-import
 """NASNet-A models for Keras.
-
-NASNet refers to Neural Architecture Search Network, a family of models
-that were designed automatically by learning the model architectures
-directly on the dataset of interest.
-
-Here we consider NASNet-A, the highest performance model that was found
-for the CIFAR-10 dataset, and then extended to ImageNet 2012 dataset,
-obtaining state of the art performance on CIFAR-10 and ImageNet 2012.
-Only the NASNet-A models, and their respective weights, which are suited
-for ImageNet 2012 are provided.
-
-The below table describes the performance on ImageNet 2012:
---------------------------------------------------------------------------------
-      Architecture       | Top-1 Acc | Top-5 Acc |  Multiply-Adds |  Params (M)
---------------------------------------------------------------------------------
-|   NASNet-A (4 @ 1056)  |   74.0 %  |   91.6 %  |       564 M    |     5.3    |
-|   NASNet-A (6 @ 4032)  |   82.7 %  |   96.2 %  |      23.8 B    |    88.9    |
---------------------------------------------------------------------------------
-
-References:
- - [Learning Transferable Architectures for Scalable Image Recognition]
-    (https://arxiv.org/abs/1707.07012)
 """
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
-
-from tensorflow.python.keras import backend as K
-from tensorflow.python.keras.applications.imagenet_utils import _obtain_input_shape
-from tensorflow.python.keras.applications.imagenet_utils import decode_predictions
-from tensorflow.python.keras.applications.inception_v3 import preprocess_input
-from tensorflow.python.keras.layers import Activation
-from tensorflow.python.keras.layers import add
-from tensorflow.python.keras.layers import AveragePooling2D
-from tensorflow.python.keras.layers import BatchNormalization
-from tensorflow.python.keras.layers import concatenate
-from tensorflow.python.keras.layers import Conv2D
-from tensorflow.python.keras.layers import Cropping2D
-from tensorflow.python.keras.layers import Dense
-from tensorflow.python.keras.layers import GlobalAveragePooling2D
-from tensorflow.python.keras.layers import GlobalMaxPooling2D
-from tensorflow.python.keras.layers import Input
-from tensorflow.python.keras.layers import MaxPooling2D
-from tensorflow.python.keras.layers import SeparableConv2D
-from tensorflow.python.keras.layers import ZeroPadding2D
-from tensorflow.python.keras.models import Model
-from tensorflow.python.keras.utils import layer_utils
-from tensorflow.python.keras.utils.data_utils import get_file
-from tensorflow.python.platform import tf_logging as logging
+from keras_applications import nasnet
 from tensorflow.python.util.tf_export import tf_export
 
+NASNetMobile = nasnet.NASNetMobile
+NASNetLarge = nasnet.NASNetLarge
+decode_predictions = nasnet.decode_predictions
+preprocess_input = nasnet.preprocess_input
 
-NASNET_MOBILE_WEIGHT_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.8/NASNet-mobile.h5'
-NASNET_MOBILE_WEIGHT_PATH_NO_TOP = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.8/NASNet-mobile-no-top.h5'
-NASNET_LARGE_WEIGHT_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.8/NASNet-large.h5'
-NASNET_LARGE_WEIGHT_PATH_NO_TOP = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.8/NASNet-large-no-top.h5'
-
-
-def NASNet(input_shape=None,
-           penultimate_filters=4032,
-           num_blocks=6,
-           stem_block_filters=96,
-           skip_reduction=True,
-           filter_multiplier=2,
-           include_top=True,
-           weights=None,
-           input_tensor=None,
-           pooling=None,
-           classes=1000,
-           default_size=None):
-  """Instantiates a NASNet model.
-
-  Note that only TensorFlow is supported for now,
-  therefore it only works with the data format
-  `image_data_format='channels_last'` in your Keras config
-  at `~/.keras/keras.json`.
-
-  Arguments:
-      input_shape: Optional shape tuple, the input shape
-          is by default `(331, 331, 3)` for NASNetLarge and
-          `(224, 224, 3)` for NASNetMobile.
-          It should have exactly 3 inputs channels,
-          and width and height should be no smaller than 32.
-          E.g. `(224, 224, 3)` would be one valid value.
-      penultimate_filters: Number of filters in the penultimate layer.
-          NASNet models use the notation `NASNet (N @ P)`, where:
-              -   N is the number of blocks
-              -   P is the number of penultimate filters
-      num_blocks: Number of repeated blocks of the NASNet model.
-          NASNet models use the notation `NASNet (N @ P)`, where:
-              -   N is the number of blocks
-              -   P is the number of penultimate filters
-      stem_block_filters: Number of filters in the initial stem block
-      skip_reduction: Whether to skip the reduction step at the tail
-          end of the network. Set to `False` for CIFAR models.
-      filter_multiplier: Controls the width of the network.
-          - If `filter_multiplier` < 1.0, proportionally decreases the number
-              of filters in each layer.
-          - If `filter_multiplier` > 1.0, proportionally increases the number
-              of filters in each layer.
-          - If `filter_multiplier` = 1, default number of filters from the
-               paper are used at each layer.
-      include_top: Whether to include the fully-connected
-          layer at the top of the network.
-      weights: `None` (random initialization) or
-          `imagenet` (ImageNet weights)
-      input_tensor: Optional Keras tensor (i.e. output of
-          `layers.Input()`)
-          to use as image input for the model.
-      pooling: Optional pooling mode for feature extraction
-          when `include_top` is `False`.
-          - `None` means that the output of the model
-              will be the 4D tensor output of the
-              last convolutional layer.
-          - `avg` means that global average pooling
-              will be applied to the output of the
-              last convolutional layer, and thus
-              the output of the model will be a
-              2D tensor.
-          - `max` means that global max pooling will
-              be applied.
-      classes: Optional number of classes to classify images
-          into, only to be specified if `include_top` is True, and
-          if no `weights` argument is specified.
-      default_size: Specifies the default image size of the model
-
-  Returns:
-      A Keras model instance.
-
-  Raises:
-      ValueError: In case of invalid argument for `weights`,
-          invalid input shape or invalid `penultimate_filters` value.
-      RuntimeError: If attempting to run this model with a
-          backend that does not support separable convolutions.
-  """
-  if K.backend() != 'tensorflow':
-    raise RuntimeError('Only Tensorflow backend is currently supported, '
-                       'as other backends do not support '
-                       'separable convolution.')
-
-  if not (weights in {'imagenet', None} or os.path.exists(weights)):
-    raise ValueError('The `weights` argument should be either '
-                     '`None` (random initialization), `imagenet` '
-                     '(pre-training on ImageNet), '
-                     'or the path to the weights file to be loaded.')
-
-  if weights == 'imagenet' and include_top and classes != 1000:
-    raise ValueError('If using `weights` as ImageNet with `include_top` '
-                     'as true, `classes` should be 1000')
-
-  if (isinstance(input_shape, tuple) and None in input_shape and
-      weights == 'imagenet'):
-    raise ValueError('When specifying the input shape of a NASNet'
-                     ' and loading `ImageNet` weights, '
-                     'the input_shape argument must be static '
-                     '(no None entries). Got: `input_shape=' +
-                     str(input_shape) + '`.')
-
-  if default_size is None:
-    default_size = 331
-
-  # Determine proper input shape and default size.
-  input_shape = _obtain_input_shape(
-      input_shape,
-      default_size=default_size,
-      min_size=32,
-      data_format=K.image_data_format(),
-      require_flatten=False,
-      weights=weights)
-
-  if K.image_data_format() != 'channels_last':
-    logging.warning('The NASNet family of models is only available '
-                    'for the input data format "channels_last" '
-                    '(width, height, channels). '
-                    'However your settings specify the default '
-                    'data format "channels_first" (channels, width, height).'
-                    ' You should set `image_data_format="channels_last"` '
-                    'in your Keras config located at ~/.keras/keras.json. '
-                    'The model being returned right now will expect inputs '
-                    'to follow the "channels_last" data format.')
-    K.set_image_data_format('channels_last')
-    old_data_format = 'channels_first'
-  else:
-    old_data_format = None
-
-  if input_tensor is None:
-    img_input = Input(shape=input_shape)
-  else:
-    if not K.is_keras_tensor(input_tensor):
-      img_input = Input(tensor=input_tensor, shape=input_shape)
-    else:
-      img_input = input_tensor
-
-  if penultimate_filters % 24 != 0:
-    raise ValueError(
-        'For NASNet-A models, the value of `penultimate_filters` '
-        'needs to be divisible by 24. Current value: %d' % penultimate_filters)
-
-  channel_dim = 1 if K.image_data_format() == 'channels_first' else -1
-  filters = penultimate_filters // 24
-
-  if not skip_reduction:
-    x = Conv2D(
-        stem_block_filters, (3, 3),
-        strides=(2, 2),
-        padding='valid',
-        use_bias=False,
-        name='stem_conv1',
-        kernel_initializer='he_normal')(
-            img_input)
-  else:
-    x = Conv2D(
-        stem_block_filters, (3, 3),
-        strides=(1, 1),
-        padding='same',
-        use_bias=False,
-        name='stem_conv1',
-        kernel_initializer='he_normal')(
-            img_input)
-
-  x = BatchNormalization(
-      axis=channel_dim, momentum=0.9997, epsilon=1e-3, name='stem_bn1')(
-          x)
-
-  p = None
-  if not skip_reduction:  # imagenet / mobile mode
-    x, p = _reduction_a_cell(
-        x, p, filters // (filter_multiplier**2), block_id='stem_1')
-    x, p = _reduction_a_cell(
-        x, p, filters // filter_multiplier, block_id='stem_2')
-
-  for i in range(num_blocks):
-    x, p = _normal_a_cell(x, p, filters, block_id='%d' % (i))
-
-  x, p0 = _reduction_a_cell(
-      x, p, filters * filter_multiplier, block_id='reduce_%d' % (num_blocks))
-
-  p = p0 if not skip_reduction else p
-
-  for i in range(num_blocks):
-    x, p = _normal_a_cell(
-        x, p, filters * filter_multiplier, block_id='%d' % (num_blocks + i + 1))
-
-  x, p0 = _reduction_a_cell(
-      x,
-      p,
-      filters * filter_multiplier**2,
-      block_id='reduce_%d' % (2 * num_blocks))
-
-  p = p0 if not skip_reduction else p
-
-  for i in range(num_blocks):
-    x, p = _normal_a_cell(
-        x,
-        p,
-        filters * filter_multiplier**2,
-        block_id='%d' % (2 * num_blocks + i + 1))
-
-  x = Activation('relu')(x)
-
-  if include_top:
-    x = GlobalAveragePooling2D()(x)
-    x = Dense(classes, activation='softmax', name='predictions')(x)
-  else:
-    if pooling == 'avg':
-      x = GlobalAveragePooling2D()(x)
-    elif pooling == 'max':
-      x = GlobalMaxPooling2D()(x)
-
-  # Ensure that the model takes into account
-  # any potential predecessors of `input_tensor`.
-  if input_tensor is not None:
-    inputs = layer_utils.get_source_inputs(input_tensor)
-  else:
-    inputs = img_input
-
-  model = Model(inputs, x, name='NASNet')
-
-  # load weights
-  if weights == 'imagenet':
-    if default_size == 224:  # mobile version
-      if include_top:
-        weight_path = NASNET_MOBILE_WEIGHT_PATH
-        model_name = 'nasnet_mobile.h5'
-      else:
-        weight_path = NASNET_MOBILE_WEIGHT_PATH_NO_TOP
-        model_name = 'nasnet_mobile_no_top.h5'
-
-      weights_file = get_file(model_name, weight_path, cache_subdir='models')
-      model.load_weights(weights_file)
-
-    elif default_size == 331:  # large version
-      if include_top:
-        weight_path = NASNET_LARGE_WEIGHT_PATH
-        model_name = 'nasnet_large.h5'
-      else:
-        weight_path = NASNET_LARGE_WEIGHT_PATH_NO_TOP
-        model_name = 'nasnet_large_no_top.h5'
-
-      weights_file = get_file(model_name, weight_path, cache_subdir='models')
-      model.load_weights(weights_file)
-    else:
-      raise ValueError('ImageNet weights can only be loaded with NASNetLarge'
-                       ' or NASNetMobile')
-  elif weights is not None:
-    model.load_weights(weights)
-
-  if old_data_format:
-    K.set_image_data_format(old_data_format)
-
-  return model
-
-
-@tf_export('keras.applications.NASNetLarge',
-           'keras.applications.nasnet.NASNetLarge')
-def NASNetLarge(input_shape=None,
-                include_top=True,
-                weights='imagenet',
-                input_tensor=None,
-                pooling=None,
-                classes=1000):
-  """Instantiates a NASNet model in ImageNet mode.
-
-  Note that only TensorFlow is supported for now,
-  therefore it only works with the data format
-  `image_data_format='channels_last'` in your Keras config
-  at `~/.keras/keras.json`.
-
-  Arguments:
-      input_shape: Optional shape tuple, only to be specified
-          if `include_top` is False (otherwise the input shape
-          has to be `(331, 331, 3)` for NASNetLarge.
-          It should have exactly 3 inputs channels,
-          and width and height should be no smaller than 32.
-          E.g. `(224, 224, 3)` would be one valid value.
-      include_top: Whether to include the fully-connected
-          layer at the top of the network.
-      weights: `None` (random initialization) or
-          `imagenet` (ImageNet weights)
-      input_tensor: Optional Keras tensor (i.e. output of
-          `layers.Input()`)
-          to use as image input for the model.
-      pooling: Optional pooling mode for feature extraction
-          when `include_top` is `False`.
-          - `None` means that the output of the model
-              will be the 4D tensor output of the
-              last convolutional layer.
-          - `avg` means that global average pooling
-              will be applied to the output of the
-              last convolutional layer, and thus
-              the output of the model will be a
-              2D tensor.
-          - `max` means that global max pooling will
-              be applied.
-      classes: Optional number of classes to classify images
-          into, only to be specified if `include_top` is True, and
-          if no `weights` argument is specified.
-
-  Returns:
-      A Keras model instance.
-
-  Raises:
-      ValueError: in case of invalid argument for `weights`,
-          or invalid input shape.
-      RuntimeError: If attempting to run this model with a
-          backend that does not support separable convolutions.
-  """
-  return NASNet(
-      input_shape,
-      penultimate_filters=4032,
-      num_blocks=6,
-      stem_block_filters=96,
-      skip_reduction=False,
-      filter_multiplier=2,
-      include_top=include_top,
-      weights=weights,
-      input_tensor=input_tensor,
-      pooling=pooling,
-      classes=classes,
-      default_size=331)
-
-
-@tf_export('keras.applications.NASNetMobile',
-           'keras.applications.nasnet.NASNetMobile')
-def NASNetMobile(input_shape=None,
-                 include_top=True,
-                 weights='imagenet',
-                 input_tensor=None,
-                 pooling=None,
-                 classes=1000):
-  """Instantiates a Mobile NASNet model in ImageNet mode.
-
-  Note that only TensorFlow is supported for now,
-  therefore it only works with the data format
-  `image_data_format='channels_last'` in your Keras config
-  at `~/.keras/keras.json`.
-
-  Arguments:
-      input_shape: Optional shape tuple, only to be specified
-          if `include_top` is False (otherwise the input shape
-          has to be `(224, 224, 3)` for NASNetMobile
-          It should have exactly 3 inputs channels,
-          and width and height should be no smaller than 32.
-          E.g. `(224, 224, 3)` would be one valid value.
-      include_top: Whether to include the fully-connected
-          layer at the top of the network.
-      weights: `None` (random initialization) or
-          `imagenet` (ImageNet weights)
-      input_tensor: Optional Keras tensor (i.e. output of
-          `layers.Input()`)
-          to use as image input for the model.
-      pooling: Optional pooling mode for feature extraction
-          when `include_top` is `False`.
-          - `None` means that the output of the model
-              will be the 4D tensor output of the
-              last convolutional layer.
-          - `avg` means that global average pooling
-              will be applied to the output of the
-              last convolutional layer, and thus
-              the output of the model will be a
-              2D tensor.
-          - `max` means that global max pooling will
-              be applied.
-      classes: Optional number of classes to classify images
-          into, only to be specified if `include_top` is True, and
-          if no `weights` argument is specified.
-
-  Returns:
-      A Keras model instance.
-
-  Raises:
-      ValueError: In case of invalid argument for `weights`,
-          or invalid input shape.
-      RuntimeError: If attempting to run this model with a
-          backend that does not support separable convolutions.
-  """
-  return NASNet(
-      input_shape,
-      penultimate_filters=1056,
-      num_blocks=4,
-      stem_block_filters=32,
-      skip_reduction=False,
-      filter_multiplier=2,
-      include_top=include_top,
-      weights=weights,
-      input_tensor=input_tensor,
-      pooling=pooling,
-      classes=classes,
-      default_size=224)
-
-
-def _separable_conv_block(ip,
-                          filters,
-                          kernel_size=(3, 3),
-                          strides=(1, 1),
-                          block_id=None):
-  """Adds 2 blocks of [relu-separable conv-batchnorm].
-
-  Arguments:
-      ip: Input tensor
-      filters: Number of output filters per layer
-      kernel_size: Kernel size of separable convolutions
-      strides: Strided convolution for downsampling
-      block_id: String block_id
-
-  Returns:
-      A Keras tensor
-  """
-  channel_dim = 1 if K.image_data_format() == 'channels_first' else -1
-
-  with K.name_scope('separable_conv_block_%s' % block_id):
-    x = Activation('relu')(ip)
-    x = SeparableConv2D(
-        filters,
-        kernel_size,
-        strides=strides,
-        name='separable_conv_1_%s' % block_id,
-        padding='same',
-        use_bias=False,
-        kernel_initializer='he_normal')(
-            x)
-    x = BatchNormalization(
-        axis=channel_dim,
-        momentum=0.9997,
-        epsilon=1e-3,
-        name='separable_conv_1_bn_%s' % (block_id))(
-            x)
-    x = Activation('relu')(x)
-    x = SeparableConv2D(
-        filters,
-        kernel_size,
-        name='separable_conv_2_%s' % block_id,
-        padding='same',
-        use_bias=False,
-        kernel_initializer='he_normal')(
-            x)
-    x = BatchNormalization(
-        axis=channel_dim,
-        momentum=0.9997,
-        epsilon=1e-3,
-        name='separable_conv_2_bn_%s' % (block_id))(
-            x)
-  return x
-
-
-def _adjust_block(p, ip, filters, block_id=None):
-  """Adjusts the input `previous path` to match the shape of the `input`.
-
-  Used in situations where the output number of filters needs to be changed.
-
-  Arguments:
-      p: Input tensor which needs to be modified
-      ip: Input tensor whose shape needs to be matched
-      filters: Number of output filters to be matched
-      block_id: String block_id
-
-  Returns:
-      Adjusted Keras tensor
-  """
-  channel_dim = 1 if K.image_data_format() == 'channels_first' else -1
-  img_dim = 2 if K.image_data_format() == 'channels_first' else -2
-
-  ip_shape = K.int_shape(ip)
-
-  if p is not None:
-    p_shape = K.int_shape(p)
-
-  with K.name_scope('adjust_block'):
-    if p is None:
-      p = ip
-
-    elif p_shape[img_dim] != ip_shape[img_dim]:
-      with K.name_scope('adjust_reduction_block_%s' % block_id):
-        p = Activation('relu', name='adjust_relu_1_%s' % block_id)(p)
-
-        p1 = AveragePooling2D(
-            (1, 1),
-            strides=(2, 2),
-            padding='valid',
-            name='adjust_avg_pool_1_%s' % block_id)(
-                p)
-        p1 = Conv2D(
-            filters // 2, (1, 1),
-            padding='same',
-            use_bias=False,
-            name='adjust_conv_1_%s' % block_id,
-            kernel_initializer='he_normal')(
-                p1)
-
-        p2 = ZeroPadding2D(padding=((0, 1), (0, 1)))(p)
-        p2 = Cropping2D(cropping=((1, 0), (1, 0)))(p2)
-        p2 = AveragePooling2D(
-            (1, 1),
-            strides=(2, 2),
-            padding='valid',
-            name='adjust_avg_pool_2_%s' % block_id)(
-                p2)
-        p2 = Conv2D(
-            filters // 2, (1, 1),
-            padding='same',
-            use_bias=False,
-            name='adjust_conv_2_%s' % block_id,
-            kernel_initializer='he_normal')(
-                p2)
-
-        p = concatenate([p1, p2], axis=channel_dim)
-        p = BatchNormalization(
-            axis=channel_dim,
-            momentum=0.9997,
-            epsilon=1e-3,
-            name='adjust_bn_%s' % block_id)(
-                p)
-
-    elif p_shape[channel_dim] != filters:
-      with K.name_scope('adjust_projection_block_%s' % block_id):
-        p = Activation('relu')(p)
-        p = Conv2D(
-            filters, (1, 1),
-            strides=(1, 1),
-            padding='same',
-            name='adjust_conv_projection_%s' % block_id,
-            use_bias=False,
-            kernel_initializer='he_normal')(
-                p)
-        p = BatchNormalization(
-            axis=channel_dim,
-            momentum=0.9997,
-            epsilon=1e-3,
-            name='adjust_bn_%s' % block_id)(
-                p)
-  return p
-
-
-def _normal_a_cell(ip, p, filters, block_id=None):
-  """Adds a Normal cell for NASNet-A (Fig. 4 in the paper).
-
-  Arguments:
-      ip: Input tensor `x`
-      p: Input tensor `p`
-      filters: Number of output filters
-      block_id: String block_id
-
-  Returns:
-      A Keras tensor
-  """
-  channel_dim = 1 if K.image_data_format() == 'channels_first' else -1
-
-  with K.name_scope('normal_A_block_%s' % block_id):
-    p = _adjust_block(p, ip, filters, block_id)
-
-    h = Activation('relu')(ip)
-    h = Conv2D(
-        filters, (1, 1),
-        strides=(1, 1),
-        padding='same',
-        name='normal_conv_1_%s' % block_id,
-        use_bias=False,
-        kernel_initializer='he_normal')(
-            h)
-    h = BatchNormalization(
-        axis=channel_dim,
-        momentum=0.9997,
-        epsilon=1e-3,
-        name='normal_bn_1_%s' % block_id)(
-            h)
-
-    with K.name_scope('block_1'):
-      x1_1 = _separable_conv_block(
-          h, filters, kernel_size=(5, 5), block_id='normal_left1_%s' % block_id)
-      x1_2 = _separable_conv_block(
-          p, filters, block_id='normal_right1_%s' % block_id)
-      x1 = add([x1_1, x1_2], name='normal_add_1_%s' % block_id)
-
-    with K.name_scope('block_2'):
-      x2_1 = _separable_conv_block(
-          p, filters, (5, 5), block_id='normal_left2_%s' % block_id)
-      x2_2 = _separable_conv_block(
-          p, filters, (3, 3), block_id='normal_right2_%s' % block_id)
-      x2 = add([x2_1, x2_2], name='normal_add_2_%s' % block_id)
-
-    with K.name_scope('block_3'):
-      x3 = AveragePooling2D(
-          (3, 3),
-          strides=(1, 1),
-          padding='same',
-          name='normal_left3_%s' % (block_id))(
-              h)
-      x3 = add([x3, p], name='normal_add_3_%s' % block_id)
-
-    with K.name_scope('block_4'):
-      x4_1 = AveragePooling2D(
-          (3, 3),
-          strides=(1, 1),
-          padding='same',
-          name='normal_left4_%s' % (block_id))(
-              p)
-      x4_2 = AveragePooling2D(
-          (3, 3),
-          strides=(1, 1),
-          padding='same',
-          name='normal_right4_%s' % (block_id))(
-              p)
-      x4 = add([x4_1, x4_2], name='normal_add_4_%s' % block_id)
-
-    with K.name_scope('block_5'):
-      x5 = _separable_conv_block(
-          h, filters, block_id='normal_left5_%s' % block_id)
-      x5 = add([x5, h], name='normal_add_5_%s' % block_id)
-
-    x = concatenate(
-        [p, x1, x2, x3, x4, x5],
-        axis=channel_dim,
-        name='normal_concat_%s' % block_id)
-  return x, ip
-
-
-def _reduction_a_cell(ip, p, filters, block_id=None):
-  """Adds a Reduction cell for NASNet-A (Fig. 4 in the paper).
-
-  Arguments:
-      ip: Input tensor `x`
-      p: Input tensor `p`
-      filters: Number of output filters
-      block_id: String block_id
-
-  Returns:
-      A Keras tensor
-  """
-  channel_dim = 1 if K.image_data_format() == 'channels_first' else -1
-
-  with K.name_scope('reduction_A_block_%s' % block_id):
-    p = _adjust_block(p, ip, filters, block_id)
-
-    h = Activation('relu')(ip)
-    h = Conv2D(
-        filters, (1, 1),
-        strides=(1, 1),
-        padding='same',
-        name='reduction_conv_1_%s' % block_id,
-        use_bias=False,
-        kernel_initializer='he_normal')(
-            h)
-    h = BatchNormalization(
-        axis=channel_dim,
-        momentum=0.9997,
-        epsilon=1e-3,
-        name='reduction_bn_1_%s' % block_id)(
-            h)
-
-    with K.name_scope('block_1'):
-      x1_1 = _separable_conv_block(
-          h,
-          filters, (5, 5),
-          strides=(2, 2),
-          block_id='reduction_left1_%s' % block_id)
-      x1_2 = _separable_conv_block(
-          p,
-          filters, (7, 7),
-          strides=(2, 2),
-          block_id='reduction_1_%s' % block_id)
-      x1 = add([x1_1, x1_2], name='reduction_add_1_%s' % block_id)
-
-    with K.name_scope('block_2'):
-      x2_1 = MaxPooling2D(
-          (3, 3),
-          strides=(2, 2),
-          padding='same',
-          name='reduction_left2_%s' % block_id)(
-              h)
-      x2_2 = _separable_conv_block(
-          p,
-          filters, (7, 7),
-          strides=(2, 2),
-          block_id='reduction_right2_%s' % block_id)
-      x2 = add([x2_1, x2_2], name='reduction_add_2_%s' % block_id)
-
-    with K.name_scope('block_3'):
-      x3_1 = AveragePooling2D(
-          (3, 3),
-          strides=(2, 2),
-          padding='same',
-          name='reduction_left3_%s' % block_id)(
-              h)
-      x3_2 = _separable_conv_block(
-          p,
-          filters, (5, 5),
-          strides=(2, 2),
-          block_id='reduction_right3_%s' % block_id)
-      x3 = add([x3_1, x3_2], name='reduction_add3_%s' % block_id)
-
-    with K.name_scope('block_4'):
-      x4 = AveragePooling2D(
-          (3, 3),
-          strides=(1, 1),
-          padding='same',
-          name='reduction_left4_%s' % block_id)(
-              x1)
-      x4 = add([x2, x4])
-
-    with K.name_scope('block_5'):
-      x5_1 = _separable_conv_block(
-          x1, filters, (3, 3), block_id='reduction_left4_%s' % block_id)
-      x5_2 = MaxPooling2D(
-          (3, 3),
-          strides=(2, 2),
-          padding='same',
-          name='reduction_right5_%s' % block_id)(
-              h)
-      x5 = add([x5_1, x5_2], name='reduction_add4_%s' % block_id)
-
-    x = concatenate(
-        [x2, x3, x4, x5],
-        axis=channel_dim,
-        name='reduction_concat_%s' % block_id)
-    return x, ip
+tf_export('keras.applications.nasnet.NASNetMobile',
+          'keras.applications.NASNetMobile')(NASNetMobile)
+tf_export('keras.applications.nasnet.NASNetLarge',
+          'keras.applications.NASNetLarge')(NASNetLarge)
+tf_export('keras.applications.nasnet.preprocess_input')(preprocess_input)
diff --git a/tensorflow/python/keras/applications/nasnet_test.py b/tensorflow/python/keras/applications/nasnet_test.py
deleted file mode 100644
index f96c3aa..0000000
--- a/tensorflow/python/keras/applications/nasnet_test.py
+++ /dev/null
@@ -1,76 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for Nasnet application."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python import keras
-from tensorflow.python.platform import test
-
-
-class NASNetMobileTest(test.TestCase):
-
-  def test_with_top(self):
-    model = keras.applications.NASNetMobile(weights=None)
-    self.assertEqual(model.output_shape, (None, 1000))
-
-  def test_no_top(self):
-    model = keras.applications.NASNetMobile(weights=None, include_top=False)
-    self.assertEqual(model.output_shape, (None, None, None, 1056))
-
-  def test_with_pooling(self):
-    model = keras.applications.NASNetMobile(weights=None,
-                                            include_top=False,
-                                            pooling='avg')
-    self.assertEqual(model.output_shape, (None, 1056))
-
-  def test_weight_loading(self):
-    with self.assertRaises(ValueError):
-      keras.applications.NASNetMobile(weights='unknown',
-                                      include_top=False)
-    with self.assertRaises(ValueError):
-      keras.applications.NASNetMobile(weights='imagenet',
-                                      classes=2000)
-
-
-class NASNetLargeTest(test.TestCase):
-
-  def test_with_top(self):
-    model = keras.applications.NASNetLarge(weights=None)
-    self.assertEqual(model.output_shape, (None, 1000))
-
-  def test_no_top(self):
-    model = keras.applications.NASNetLarge(weights=None, include_top=False)
-    self.assertEqual(model.output_shape, (None, None, None, 4032))
-
-  def test_with_pooling(self):
-    model = keras.applications.NASNetLarge(weights=None,
-                                           include_top=False,
-                                           pooling='avg')
-    self.assertEqual(model.output_shape, (None, 4032))
-
-  def test_weight_loading(self):
-    with self.assertRaises(ValueError):
-      keras.applications.NASNetLarge(weights='unknown',
-                                     include_top=False)
-    with self.assertRaises(ValueError):
-      keras.applications.NASNetLarge(weights='imagenet',
-                                     classes=2000)
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/python/keras/applications/resnet50.py b/tensorflow/python/keras/applications/resnet50.py
index 6afc086..4d804a3 100644
--- a/tensorflow/python/keras/applications/resnet50.py
+++ b/tensorflow/python/keras/applications/resnet50.py
@@ -13,291 +13,18 @@
 # limitations under the License.
 # ==============================================================================
 # pylint: disable=invalid-name
-# pylint: disable=unused-import
 """ResNet50 model for Keras.
-
-# Reference:
-
-- [Deep Residual Learning for Image
-Recognition](https://arxiv.org/abs/1512.03385)
-
-Adapted from code contributed by BigMoyan.
 """
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
-
-from tensorflow.python.keras import backend as K
-from tensorflow.python.keras import layers
-from tensorflow.python.keras.applications.imagenet_utils import _obtain_input_shape
-from tensorflow.python.keras.applications.imagenet_utils import decode_predictions
-from tensorflow.python.keras.applications.imagenet_utils import preprocess_input
-from tensorflow.python.keras.layers import Activation
-from tensorflow.python.keras.layers import AveragePooling2D
-from tensorflow.python.keras.layers import BatchNormalization
-from tensorflow.python.keras.layers import Conv2D
-from tensorflow.python.keras.layers import Dense
-from tensorflow.python.keras.layers import Flatten
-from tensorflow.python.keras.layers import GlobalAveragePooling2D
-from tensorflow.python.keras.layers import GlobalMaxPooling2D
-from tensorflow.python.keras.layers import Input
-from tensorflow.python.keras.layers import MaxPooling2D
-from tensorflow.python.keras.layers import ZeroPadding2D
-from tensorflow.python.keras.models import Model
-from tensorflow.python.keras.utils import layer_utils
-from tensorflow.python.keras.utils.data_utils import get_file
-from tensorflow.python.platform import tf_logging as logging
+from keras_applications import resnet50
 from tensorflow.python.util.tf_export import tf_export
 
+ResNet50 = resnet50.ResNet50
+decode_predictions = resnet50.decode_predictions
+preprocess_input = resnet50.preprocess_input
 
-WEIGHTS_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.2/resnet50_weights_tf_dim_ordering_tf_kernels.h5'
-WEIGHTS_PATH_NO_TOP = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.2/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5'
-
-
-def identity_block(input_tensor, kernel_size, filters, stage, block):
-  """The identity block is the block that has no conv layer at shortcut.
-
-  Arguments:
-      input_tensor: input tensor
-      kernel_size: default 3, the kernel size of middle conv layer at main path
-      filters: list of integers, the filters of 3 conv layer at main path
-      stage: integer, current stage label, used for generating layer names
-      block: 'a','b'..., current block label, used for generating layer names
-
-  Returns:
-      Output tensor for the block.
-  """
-  filters1, filters2, filters3 = filters
-  if K.image_data_format() == 'channels_last':
-    bn_axis = 3
-  else:
-    bn_axis = 1
-  conv_name_base = 'res' + str(stage) + block + '_branch'
-  bn_name_base = 'bn' + str(stage) + block + '_branch'
-
-  x = Conv2D(filters1, (1, 1), name=conv_name_base + '2a')(input_tensor)
-  x = BatchNormalization(axis=bn_axis, name=bn_name_base + '2a')(x)
-  x = Activation('relu')(x)
-
-  x = Conv2D(
-      filters2, kernel_size, padding='same', name=conv_name_base + '2b')(
-          x)
-  x = BatchNormalization(axis=bn_axis, name=bn_name_base + '2b')(x)
-  x = Activation('relu')(x)
-
-  x = Conv2D(filters3, (1, 1), name=conv_name_base + '2c')(x)
-  x = BatchNormalization(axis=bn_axis, name=bn_name_base + '2c')(x)
-
-  x = layers.add([x, input_tensor])
-  x = Activation('relu')(x)
-  return x
-
-
-def conv_block(input_tensor, kernel_size, filters, stage, block, strides=(2,
-                                                                          2)):
-  """A block that has a conv layer at shortcut.
-
-  Arguments:
-      input_tensor: input tensor
-      kernel_size: default 3, the kernel size of middle conv layer at main path
-      filters: list of integers, the filters of 3 conv layer at main path
-      stage: integer, current stage label, used for generating layer names
-      block: 'a','b'..., current block label, used for generating layer names
-      strides: Strides for the first conv layer in the block.
-
-  Returns:
-      Output tensor for the block.
-
-  Note that from stage 3,
-  the first conv layer at main path is with strides=(2, 2)
-  And the shortcut should have strides=(2, 2) as well
-  """
-  filters1, filters2, filters3 = filters
-  if K.image_data_format() == 'channels_last':
-    bn_axis = 3
-  else:
-    bn_axis = 1
-  conv_name_base = 'res' + str(stage) + block + '_branch'
-  bn_name_base = 'bn' + str(stage) + block + '_branch'
-
-  x = Conv2D(
-      filters1, (1, 1), strides=strides, name=conv_name_base + '2a')(
-          input_tensor)
-  x = BatchNormalization(axis=bn_axis, name=bn_name_base + '2a')(x)
-  x = Activation('relu')(x)
-
-  x = Conv2D(
-      filters2, kernel_size, padding='same', name=conv_name_base + '2b')(
-          x)
-  x = BatchNormalization(axis=bn_axis, name=bn_name_base + '2b')(x)
-  x = Activation('relu')(x)
-
-  x = Conv2D(filters3, (1, 1), name=conv_name_base + '2c')(x)
-  x = BatchNormalization(axis=bn_axis, name=bn_name_base + '2c')(x)
-
-  shortcut = Conv2D(
-      filters3, (1, 1), strides=strides, name=conv_name_base + '1')(
-          input_tensor)
-  shortcut = BatchNormalization(axis=bn_axis, name=bn_name_base + '1')(shortcut)
-
-  x = layers.add([x, shortcut])
-  x = Activation('relu')(x)
-  return x
-
-
-@tf_export('keras.applications.ResNet50',
-           'keras.applications.resnet50.ResNet50')
-def ResNet50(include_top=True,
-             weights='imagenet',
-             input_tensor=None,
-             input_shape=None,
-             pooling=None,
-             classes=1000):
-  """Instantiates the ResNet50 architecture.
-
-  Optionally loads weights pre-trained
-  on ImageNet. Note that when using TensorFlow,
-  for best performance you should set
-  `image_data_format='channels_last'` in your Keras config
-  at ~/.keras/keras.json.
-
-  The model and the weights are compatible with both
-  TensorFlow and Theano. The data format
-  convention used by the model is the one
-  specified in your Keras config file.
-
-  Arguments:
-      include_top: whether to include the fully-connected
-          layer at the top of the network.
-      weights: one of `None` (random initialization),
-            'imagenet' (pre-training on ImageNet),
-            or the path to the weights file to be loaded.
-      input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
-          to use as image input for the model.
-      input_shape: optional shape tuple, only to be specified
-          if `include_top` is False (otherwise the input shape
-          has to be `(224, 224, 3)` (with `channels_last` data format)
-          or `(3, 224, 224)` (with `channels_first` data format).
-          It should have exactly 3 inputs channels,
-          and width and height should be no smaller than 197.
-          E.g. `(200, 200, 3)` would be one valid value.
-      pooling: Optional pooling mode for feature extraction
-          when `include_top` is `False`.
-          - `None` means that the output of the model will be
-              the 4D tensor output of the
-              last convolutional layer.
-          - `avg` means that global average pooling
-              will be applied to the output of the
-              last convolutional layer, and thus
-              the output of the model will be a 2D tensor.
-          - `max` means that global max pooling will
-              be applied.
-      classes: optional number of classes to classify images
-          into, only to be specified if `include_top` is True, and
-          if no `weights` argument is specified.
-
-  Returns:
-      A Keras model instance.
-
-  Raises:
-      ValueError: in case of invalid argument for `weights`,
-          or invalid input shape.
-  """
-  if not (weights in {'imagenet', None} or os.path.exists(weights)):
-    raise ValueError('The `weights` argument should be either '
-                     '`None` (random initialization), `imagenet` '
-                     '(pre-training on ImageNet), '
-                     'or the path to the weights file to be loaded.')
-
-  if weights == 'imagenet' and include_top and classes != 1000:
-    raise ValueError('If using `weights` as imagenet with `include_top`'
-                     ' as true, `classes` should be 1000')
-
-  # Determine proper input shape
-  input_shape = _obtain_input_shape(
-      input_shape,
-      default_size=224,
-      min_size=197,
-      data_format=K.image_data_format(),
-      require_flatten=include_top,
-      weights=weights)
-
-  if input_tensor is None:
-    img_input = Input(shape=input_shape)
-  else:
-    if not K.is_keras_tensor(input_tensor):
-      img_input = Input(tensor=input_tensor, shape=input_shape)
-    else:
-      img_input = input_tensor
-  if K.image_data_format() == 'channels_last':
-    bn_axis = 3
-  else:
-    bn_axis = 1
-
-  x = Conv2D(
-      64, (7, 7), strides=(2, 2), padding='same', name='conv1')(img_input)
-  x = BatchNormalization(axis=bn_axis, name='bn_conv1')(x)
-  x = Activation('relu')(x)
-  x = MaxPooling2D((3, 3), strides=(2, 2))(x)
-
-  x = conv_block(x, 3, [64, 64, 256], stage=2, block='a', strides=(1, 1))
-  x = identity_block(x, 3, [64, 64, 256], stage=2, block='b')
-  x = identity_block(x, 3, [64, 64, 256], stage=2, block='c')
-
-  x = conv_block(x, 3, [128, 128, 512], stage=3, block='a')
-  x = identity_block(x, 3, [128, 128, 512], stage=3, block='b')
-  x = identity_block(x, 3, [128, 128, 512], stage=3, block='c')
-  x = identity_block(x, 3, [128, 128, 512], stage=3, block='d')
-
-  x = conv_block(x, 3, [256, 256, 1024], stage=4, block='a')
-  x = identity_block(x, 3, [256, 256, 1024], stage=4, block='b')
-  x = identity_block(x, 3, [256, 256, 1024], stage=4, block='c')
-  x = identity_block(x, 3, [256, 256, 1024], stage=4, block='d')
-  x = identity_block(x, 3, [256, 256, 1024], stage=4, block='e')
-  x = identity_block(x, 3, [256, 256, 1024], stage=4, block='f')
-
-  x = conv_block(x, 3, [512, 512, 2048], stage=5, block='a')
-  x = identity_block(x, 3, [512, 512, 2048], stage=5, block='b')
-  x = identity_block(x, 3, [512, 512, 2048], stage=5, block='c')
-
-  x = AveragePooling2D((7, 7), name='avg_pool')(x)
-
-  if include_top:
-    x = Flatten()(x)
-    x = Dense(classes, activation='softmax', name='fc1000')(x)
-  else:
-    if pooling == 'avg':
-      x = GlobalAveragePooling2D()(x)
-    elif pooling == 'max':
-      x = GlobalMaxPooling2D()(x)
-
-  # Ensure that the model takes into account
-  # any potential predecessors of `input_tensor`.
-  if input_tensor is not None:
-    inputs = layer_utils.get_source_inputs(input_tensor)
-  else:
-    inputs = img_input
-  # Create model.
-  model = Model(inputs, x, name='resnet50')
-
-  # load weights
-  if weights == 'imagenet':
-    if include_top:
-      weights_path = get_file(
-          'resnet50_weights_tf_dim_ordering_tf_kernels.h5',
-          WEIGHTS_PATH,
-          cache_subdir='models',
-          md5_hash='a7b3fe01876f51b976af0dea6bc144eb')
-    else:
-      weights_path = get_file(
-          'resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5',
-          WEIGHTS_PATH_NO_TOP,
-          cache_subdir='models',
-          md5_hash='a268eb855778b3df3c7506639542a6af')
-    model.load_weights(weights_path)
-  elif weights is not None:
-    model.load_weights(weights)
-
-  return model
+tf_export('keras.applications.resnet50.ResNet50',
+          'keras.applications.ResNet50')(ResNet50)
diff --git a/tensorflow/python/keras/applications/resnet50_test.py b/tensorflow/python/keras/applications/resnet50_test.py
deleted file mode 100644
index 22a3f05..0000000
--- a/tensorflow/python/keras/applications/resnet50_test.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for ResNet50 application."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python import keras
-from tensorflow.python.platform import test
-
-
-class ResNet50Test(test.TestCase):
-
-  def test_with_top(self):
-    model = keras.applications.ResNet50(weights=None)
-    self.assertEqual(model.output_shape, (None, 1000))
-
-  def test_no_top(self):
-    model = keras.applications.ResNet50(weights=None, include_top=False)
-    self.assertEqual(model.output_shape, (None, None, None, 2048))
-
-  def test_with_pooling(self):
-    model = keras.applications.ResNet50(weights=None,
-                                        include_top=False,
-                                        pooling='avg')
-    self.assertEqual(model.output_shape, (None, 2048))
-
-  def test_weight_loading(self):
-    with self.assertRaises(ValueError):
-      keras.applications.ResNet50(weights='unknown',
-                                  include_top=False)
-
-    with self.assertRaises(ValueError):
-      keras.applications.ResNet50(weights='imagenet',
-                                  classes=2000)
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/python/keras/applications/vgg16.py b/tensorflow/python/keras/applications/vgg16.py
index cef0230..c420d9b 100644
--- a/tensorflow/python/keras/applications/vgg16.py
+++ b/tensorflow/python/keras/applications/vgg16.py
@@ -13,217 +13,18 @@
 # limitations under the License.
 # ==============================================================================
 # pylint: disable=invalid-name
-# pylint: disable=unused-import
 """VGG16 model for Keras.
-
-# Reference
-
-- [Very Deep Convolutional Networks for Large-Scale Image
-Recognition](https://arxiv.org/abs/1409.1556)
-
 """
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
-
-from tensorflow.python.keras import backend as K
-from tensorflow.python.keras.applications.imagenet_utils import _obtain_input_shape
-from tensorflow.python.keras.applications.imagenet_utils import decode_predictions
-from tensorflow.python.keras.applications.imagenet_utils import preprocess_input
-from tensorflow.python.keras.layers import Conv2D
-from tensorflow.python.keras.layers import Dense
-from tensorflow.python.keras.layers import Flatten
-from tensorflow.python.keras.layers import GlobalAveragePooling2D
-from tensorflow.python.keras.layers import GlobalMaxPooling2D
-from tensorflow.python.keras.layers import Input
-from tensorflow.python.keras.layers import MaxPooling2D
-from tensorflow.python.keras.models import Model
-from tensorflow.python.keras.utils import layer_utils
-from tensorflow.python.keras.utils.data_utils import get_file
-from tensorflow.python.platform import tf_logging as logging
+from keras_applications import vgg16
 from tensorflow.python.util.tf_export import tf_export
 
+VGG16 = vgg16.VGG16
+decode_predictions = vgg16.decode_predictions
+preprocess_input = vgg16.preprocess_input
 
-WEIGHTS_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.1/vgg16_weights_tf_dim_ordering_tf_kernels.h5'
-WEIGHTS_PATH_NO_TOP = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.1/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5'
-
-
-@tf_export('keras.applications.VGG16', 'keras.applications.vgg16.VGG16')
-def VGG16(include_top=True,
-          weights='imagenet',
-          input_tensor=None,
-          input_shape=None,
-          pooling=None,
-          classes=1000):
-  """Instantiates the VGG16 architecture.
-
-  Optionally loads weights pre-trained
-  on ImageNet. Note that when using TensorFlow,
-  for best performance you should set
-  `image_data_format='channels_last'` in your Keras config
-  at ~/.keras/keras.json.
-
-  The model and the weights are compatible with both
-  TensorFlow and Theano. The data format
-  convention used by the model is the one
-  specified in your Keras config file.
-
-  Arguments:
-      include_top: whether to include the 3 fully-connected
-          layers at the top of the network.
-      weights: one of `None` (random initialization),
-            'imagenet' (pre-training on ImageNet),
-            or the path to the weights file to be loaded.
-      input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
-          to use as image input for the model.
-      input_shape: optional shape tuple, only to be specified
-          if `include_top` is False (otherwise the input shape
-          has to be `(224, 224, 3)` (with `channels_last` data format)
-          or `(3, 224, 224)` (with `channels_first` data format).
-          It should have exactly 3 input channels,
-          and width and height should be no smaller than 48.
-          E.g. `(200, 200, 3)` would be one valid value.
-      pooling: Optional pooling mode for feature extraction
-          when `include_top` is `False`.
-          - `None` means that the output of the model will be
-              the 4D tensor output of the
-              last convolutional layer.
-          - `avg` means that global average pooling
-              will be applied to the output of the
-              last convolutional layer, and thus
-              the output of the model will be a 2D tensor.
-          - `max` means that global max pooling will
-              be applied.
-      classes: optional number of classes to classify images
-          into, only to be specified if `include_top` is True, and
-          if no `weights` argument is specified.
-
-  Returns:
-      A Keras model instance.
-
-  Raises:
-      ValueError: in case of invalid argument for `weights`,
-          or invalid input shape.
-  """
-  if not (weights in {'imagenet', None} or os.path.exists(weights)):
-    raise ValueError('The `weights` argument should be either '
-                     '`None` (random initialization), `imagenet` '
-                     '(pre-training on ImageNet), '
-                     'or the path to the weights file to be loaded.')
-
-  if weights == 'imagenet' and include_top and classes != 1000:
-    raise ValueError('If using `weights` as imagenet with `include_top`'
-                     ' as true, `classes` should be 1000')
-  # Determine proper input shape
-  input_shape = _obtain_input_shape(
-      input_shape,
-      default_size=224,
-      min_size=48,
-      data_format=K.image_data_format(),
-      require_flatten=include_top,
-      weights=weights)
-
-  if input_tensor is None:
-    img_input = Input(shape=input_shape)
-  else:
-    if not K.is_keras_tensor(input_tensor):
-      img_input = Input(tensor=input_tensor, shape=input_shape)
-    else:
-      img_input = input_tensor
-  # Block 1
-  x = Conv2D(
-      64, (3, 3), activation='relu', padding='same', name='block1_conv1')(
-          img_input)
-  x = Conv2D(
-      64, (3, 3), activation='relu', padding='same', name='block1_conv2')(
-          x)
-  x = MaxPooling2D((2, 2), strides=(2, 2), name='block1_pool')(x)
-
-  # Block 2
-  x = Conv2D(
-      128, (3, 3), activation='relu', padding='same', name='block2_conv1')(
-          x)
-  x = Conv2D(
-      128, (3, 3), activation='relu', padding='same', name='block2_conv2')(
-          x)
-  x = MaxPooling2D((2, 2), strides=(2, 2), name='block2_pool')(x)
-
-  # Block 3
-  x = Conv2D(
-      256, (3, 3), activation='relu', padding='same', name='block3_conv1')(
-          x)
-  x = Conv2D(
-      256, (3, 3), activation='relu', padding='same', name='block3_conv2')(
-          x)
-  x = Conv2D(
-      256, (3, 3), activation='relu', padding='same', name='block3_conv3')(
-          x)
-  x = MaxPooling2D((2, 2), strides=(2, 2), name='block3_pool')(x)
-
-  # Block 4
-  x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block4_conv1')(
-          x)
-  x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block4_conv2')(
-          x)
-  x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block4_conv3')(
-          x)
-  x = MaxPooling2D((2, 2), strides=(2, 2), name='block4_pool')(x)
-
-  # Block 5
-  x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block5_conv1')(
-          x)
-  x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block5_conv2')(
-          x)
-  x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block5_conv3')(
-          x)
-  x = MaxPooling2D((2, 2), strides=(2, 2), name='block5_pool')(x)
-
-  if include_top:
-    # Classification block
-    x = Flatten(name='flatten')(x)
-    x = Dense(4096, activation='relu', name='fc1')(x)
-    x = Dense(4096, activation='relu', name='fc2')(x)
-    x = Dense(classes, activation='softmax', name='predictions')(x)
-  else:
-    if pooling == 'avg':
-      x = GlobalAveragePooling2D()(x)
-    elif pooling == 'max':
-      x = GlobalMaxPooling2D()(x)
-
-  # Ensure that the model takes into account
-  # any potential predecessors of `input_tensor`.
-  if input_tensor is not None:
-    inputs = layer_utils.get_source_inputs(input_tensor)
-  else:
-    inputs = img_input
-  # Create model.
-  model = Model(inputs, x, name='vgg16')
-
-  # load weights
-  if weights == 'imagenet':
-    if include_top:
-      weights_path = get_file(
-          'vgg16_weights_tf_dim_ordering_tf_kernels.h5',
-          WEIGHTS_PATH,
-          cache_subdir='models',
-          file_hash='64373286793e3c8b2b4e3219cbf3544b')
-    else:
-      weights_path = get_file(
-          'vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5',
-          WEIGHTS_PATH_NO_TOP,
-          cache_subdir='models',
-          file_hash='6d6bbae143d832006294945121d1f1fc')
-    model.load_weights(weights_path)
-
-  elif weights is not None:
-    model.load_weights(weights)
-
-  return model
+tf_export('keras.applications.vgg16.VGG16',
+          'keras.applications.VGG16')(VGG16)
diff --git a/tensorflow/python/keras/applications/vgg16_test.py b/tensorflow/python/keras/applications/vgg16_test.py
deleted file mode 100644
index cad6576..0000000
--- a/tensorflow/python/keras/applications/vgg16_test.py
+++ /dev/null
@@ -1,50 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for VGG16 application."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python import keras
-from tensorflow.python.platform import test
-
-
-class VGG16Test(test.TestCase):
-
-  def test_with_top(self):
-    model = keras.applications.VGG16(weights=None)
-    self.assertEqual(model.output_shape, (None, 1000))
-
-  def test_no_top(self):
-    model = keras.applications.VGG16(weights=None, include_top=False)
-    self.assertEqual(model.output_shape, (None, None, None, 512))
-
-  def test_with_pooling(self):
-    model = keras.applications.VGG16(weights=None,
-                                     include_top=False,
-                                     pooling='avg')
-    self.assertEqual(model.output_shape, (None, 512))
-
-  def test_weight_loading(self):
-    with self.assertRaises(ValueError):
-      keras.applications.VGG16(weights='unknown',
-                               include_top=False)
-    with self.assertRaises(ValueError):
-      keras.applications.VGG16(weights='imagenet',
-                               classes=2000)
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/python/keras/applications/vgg19.py b/tensorflow/python/keras/applications/vgg19.py
index c4031f5..73d3d1d 100644
--- a/tensorflow/python/keras/applications/vgg19.py
+++ b/tensorflow/python/keras/applications/vgg19.py
@@ -13,226 +13,18 @@
 # limitations under the License.
 # ==============================================================================
 # pylint: disable=invalid-name
-# pylint: disable=unused-import
 """VGG19 model for Keras.
-
-# Reference
-
-- [Very Deep Convolutional Networks for Large-Scale Image
-Recognition](https://arxiv.org/abs/1409.1556)
-
 """
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
-
-from tensorflow.python.keras import backend as K
-from tensorflow.python.keras.applications.imagenet_utils import _obtain_input_shape
-from tensorflow.python.keras.applications.imagenet_utils import decode_predictions
-from tensorflow.python.keras.applications.imagenet_utils import preprocess_input
-from tensorflow.python.keras.layers import Conv2D
-from tensorflow.python.keras.layers import Dense
-from tensorflow.python.keras.layers import Flatten
-from tensorflow.python.keras.layers import GlobalAveragePooling2D
-from tensorflow.python.keras.layers import GlobalMaxPooling2D
-from tensorflow.python.keras.layers import Input
-from tensorflow.python.keras.layers import MaxPooling2D
-from tensorflow.python.keras.models import Model
-from tensorflow.python.keras.utils import layer_utils
-from tensorflow.python.keras.utils.data_utils import get_file
-from tensorflow.python.platform import tf_logging as logging
+from keras_applications import vgg19
 from tensorflow.python.util.tf_export import tf_export
 
+VGG19 = vgg19.VGG19
+decode_predictions = vgg19.decode_predictions
+preprocess_input = vgg19.preprocess_input
 
-WEIGHTS_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.1/vgg19_weights_tf_dim_ordering_tf_kernels.h5'
-WEIGHTS_PATH_NO_TOP = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.1/vgg19_weights_tf_dim_ordering_tf_kernels_notop.h5'
-
-
-@tf_export('keras.applications.VGG19', 'keras.applications.vgg19.VGG19')
-def VGG19(include_top=True,
-          weights='imagenet',
-          input_tensor=None,
-          input_shape=None,
-          pooling=None,
-          classes=1000):
-  """Instantiates the VGG19 architecture.
-
-  Optionally loads weights pre-trained
-  on ImageNet. Note that when using TensorFlow,
-  for best performance you should set
-  `image_data_format='channels_last'` in your Keras config
-  at ~/.keras/keras.json.
-
-  The model and the weights are compatible with both
-  TensorFlow and Theano. The data format
-  convention used by the model is the one
-  specified in your Keras config file.
-
-  Arguments:
-      include_top: whether to include the 3 fully-connected
-          layers at the top of the network.
-      weights: one of `None` (random initialization),
-            'imagenet' (pre-training on ImageNet),
-            or the path to the weights file to be loaded.
-      input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
-          to use as image input for the model.
-      input_shape: optional shape tuple, only to be specified
-          if `include_top` is False (otherwise the input shape
-          has to be `(224, 224, 3)` (with `channels_last` data format)
-          or `(3, 224, 224)` (with `channels_first` data format).
-          It should have exactly 3 inputs channels,
-          and width and height should be no smaller than 48.
-          E.g. `(200, 200, 3)` would be one valid value.
-      pooling: Optional pooling mode for feature extraction
-          when `include_top` is `False`.
-          - `None` means that the output of the model will be
-              the 4D tensor output of the
-              last convolutional layer.
-          - `avg` means that global average pooling
-              will be applied to the output of the
-              last convolutional layer, and thus
-              the output of the model will be a 2D tensor.
-          - `max` means that global max pooling will
-              be applied.
-      classes: optional number of classes to classify images
-          into, only to be specified if `include_top` is True, and
-          if no `weights` argument is specified.
-
-  Returns:
-      A Keras model instance.
-
-  Raises:
-      ValueError: in case of invalid argument for `weights`,
-          or invalid input shape.
-  """
-  if not (weights in {'imagenet', None} or os.path.exists(weights)):
-    raise ValueError('The `weights` argument should be either '
-                     '`None` (random initialization), `imagenet` '
-                     '(pre-training on ImageNet), '
-                     'or the path to the weights file to be loaded.')
-
-  if weights == 'imagenet' and include_top and classes != 1000:
-    raise ValueError('If using `weights` as imagenet with `include_top`'
-                     ' as true, `classes` should be 1000')
-  # Determine proper input shape
-  input_shape = _obtain_input_shape(
-      input_shape,
-      default_size=224,
-      min_size=48,
-      data_format=K.image_data_format(),
-      require_flatten=include_top,
-      weights=weights)
-
-  if input_tensor is None:
-    img_input = Input(shape=input_shape)
-  else:
-    if not K.is_keras_tensor(input_tensor):
-      img_input = Input(tensor=input_tensor, shape=input_shape)
-    else:
-      img_input = input_tensor
-  # Block 1
-  x = Conv2D(
-      64, (3, 3), activation='relu', padding='same', name='block1_conv1')(
-          img_input)
-  x = Conv2D(
-      64, (3, 3), activation='relu', padding='same', name='block1_conv2')(
-          x)
-  x = MaxPooling2D((2, 2), strides=(2, 2), name='block1_pool')(x)
-
-  # Block 2
-  x = Conv2D(
-      128, (3, 3), activation='relu', padding='same', name='block2_conv1')(
-          x)
-  x = Conv2D(
-      128, (3, 3), activation='relu', padding='same', name='block2_conv2')(
-          x)
-  x = MaxPooling2D((2, 2), strides=(2, 2), name='block2_pool')(x)
-
-  # Block 3
-  x = Conv2D(
-      256, (3, 3), activation='relu', padding='same', name='block3_conv1')(
-          x)
-  x = Conv2D(
-      256, (3, 3), activation='relu', padding='same', name='block3_conv2')(
-          x)
-  x = Conv2D(
-      256, (3, 3), activation='relu', padding='same', name='block3_conv3')(
-          x)
-  x = Conv2D(
-      256, (3, 3), activation='relu', padding='same', name='block3_conv4')(
-          x)
-  x = MaxPooling2D((2, 2), strides=(2, 2), name='block3_pool')(x)
-
-  # Block 4
-  x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block4_conv1')(
-          x)
-  x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block4_conv2')(
-          x)
-  x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block4_conv3')(
-          x)
-  x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block4_conv4')(
-          x)
-  x = MaxPooling2D((2, 2), strides=(2, 2), name='block4_pool')(x)
-
-  # Block 5
-  x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block5_conv1')(
-          x)
-  x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block5_conv2')(
-          x)
-  x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block5_conv3')(
-          x)
-  x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block5_conv4')(
-          x)
-  x = MaxPooling2D((2, 2), strides=(2, 2), name='block5_pool')(x)
-
-  if include_top:
-    # Classification block
-    x = Flatten(name='flatten')(x)
-    x = Dense(4096, activation='relu', name='fc1')(x)
-    x = Dense(4096, activation='relu', name='fc2')(x)
-    x = Dense(classes, activation='softmax', name='predictions')(x)
-  else:
-    if pooling == 'avg':
-      x = GlobalAveragePooling2D()(x)
-    elif pooling == 'max':
-      x = GlobalMaxPooling2D()(x)
-
-  # Ensure that the model takes into account
-  # any potential predecessors of `input_tensor`.
-  if input_tensor is not None:
-    inputs = layer_utils.get_source_inputs(input_tensor)
-  else:
-    inputs = img_input
-  # Create model.
-  model = Model(inputs, x, name='vgg19')
-
-  # load weights
-  if weights == 'imagenet':
-    if include_top:
-      weights_path = get_file(
-          'vgg19_weights_tf_dim_ordering_tf_kernels.h5',
-          WEIGHTS_PATH,
-          cache_subdir='models',
-          file_hash='cbe5617147190e668d6c5d5026f83318')
-    else:
-      weights_path = get_file(
-          'vgg19_weights_tf_dim_ordering_tf_kernels_notop.h5',
-          WEIGHTS_PATH_NO_TOP,
-          cache_subdir='models',
-          file_hash='253f8cb515780f3b799900260a226db6')
-    model.load_weights(weights_path)
-
-  elif weights is not None:
-    model.load_weights(weights)
-
-  return model
+tf_export('keras.applications.vgg19.VGG19',
+          'keras.applications.VGG19')(VGG19)
diff --git a/tensorflow/python/keras/applications/vgg19_test.py b/tensorflow/python/keras/applications/vgg19_test.py
deleted file mode 100644
index 61dccc0..0000000
--- a/tensorflow/python/keras/applications/vgg19_test.py
+++ /dev/null
@@ -1,50 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for VGG19 application."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python import keras
-from tensorflow.python.platform import test
-
-
-class VGG19Test(test.TestCase):
-
-  def test_with_top(self):
-    model = keras.applications.VGG19(weights=None)
-    self.assertEqual(model.output_shape, (None, 1000))
-
-  def test_no_top(self):
-    model = keras.applications.VGG19(weights=None, include_top=False)
-    self.assertEqual(model.output_shape, (None, None, None, 512))
-
-  def test_with_pooling(self):
-    model = keras.applications.VGG19(weights=None,
-                                     include_top=False,
-                                     pooling='avg')
-    self.assertEqual(model.output_shape, (None, 512))
-
-  def test_weight_loading(self):
-    with self.assertRaises(ValueError):
-      keras.applications.VGG19(weights='unknown',
-                               include_top=False)
-    with self.assertRaises(ValueError):
-      keras.applications.VGG19(weights='imagenet',
-                               classes=2000)
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/python/keras/applications/xception.py b/tensorflow/python/keras/applications/xception.py
index 01397cf..5b221ac 100644
--- a/tensorflow/python/keras/applications/xception.py
+++ b/tensorflow/python/keras/applications/xception.py
@@ -13,332 +13,19 @@
 # limitations under the License.
 # ==============================================================================
 # pylint: disable=invalid-name
-# pylint: disable=unused-import
 """Xception V1 model for Keras.
-
-On ImageNet, this model gets to a top-1 validation accuracy of 0.790
-and a top-5 validation accuracy of 0.945.
-
-Do note that the input image format for this model is different than for
-the VGG16 and ResNet models (299x299 instead of 224x224),
-and that the input preprocessing function
-is also different (same as Inception V3).
-
-Also do note that this model is only available for the TensorFlow backend,
-due to its reliance on `SeparableConvolution` layers.
-
-# Reference
-
-- [Xception: Deep Learning with Depthwise Separable
-Convolutions](https://arxiv.org/abs/1610.02357)
-
 """
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
-
-from tensorflow.python.keras import backend as K
-from tensorflow.python.keras import layers
-from tensorflow.python.keras.applications import imagenet_utils
-from tensorflow.python.keras.applications.imagenet_utils import _obtain_input_shape
-from tensorflow.python.keras.applications.imagenet_utils import decode_predictions
-from tensorflow.python.keras.layers import Activation
-from tensorflow.python.keras.layers import BatchNormalization
-from tensorflow.python.keras.layers import Conv2D
-from tensorflow.python.keras.layers import Dense
-from tensorflow.python.keras.layers import GlobalAveragePooling2D
-from tensorflow.python.keras.layers import GlobalMaxPooling2D
-from tensorflow.python.keras.layers import Input
-from tensorflow.python.keras.layers import MaxPooling2D
-from tensorflow.python.keras.layers import SeparableConv2D
-from tensorflow.python.keras.models import Model
-from tensorflow.python.keras.utils import layer_utils
-from tensorflow.python.keras.utils.data_utils import get_file
-from tensorflow.python.platform import tf_logging as logging
+from keras_applications import xception
 from tensorflow.python.util.tf_export import tf_export
 
+Xception = xception.Xception
+decode_predictions = xception.decode_predictions
+preprocess_input = xception.preprocess_input
 
-TF_WEIGHTS_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.4/xception_weights_tf_dim_ordering_tf_kernels.h5'
-TF_WEIGHTS_PATH_NO_TOP = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.4/xception_weights_tf_dim_ordering_tf_kernels_notop.h5'
-
-
-@tf_export('keras.applications.Xception',
-           'keras.applications.xception.Xception')
-def Xception(include_top=True,
-             weights='imagenet',
-             input_tensor=None,
-             input_shape=None,
-             pooling=None,
-             classes=1000):
-  """Instantiates the Xception architecture.
-
-  Optionally loads weights pre-trained
-  on ImageNet. This model is available for TensorFlow only,
-  and can only be used with inputs following the TensorFlow
-  data format `(width, height, channels)`.
-  You should set `image_data_format='channels_last'` in your Keras config
-  located at ~/.keras/keras.json.
-
-  Note that the default input image size for this model is 299x299.
-
-  Arguments:
-      include_top: whether to include the fully-connected
-          layer at the top of the network.
-      weights: one of `None` (random initialization),
-            'imagenet' (pre-training on ImageNet),
-            or the path to the weights file to be loaded.
-      input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
-          to use as image input for the model.
-      input_shape: optional shape tuple, only to be specified
-          if `include_top` is False (otherwise the input shape
-          has to be `(299, 299, 3)`.
-          It should have exactly 3 inputs channels,
-          and width and height should be no smaller than 71.
-          E.g. `(150, 150, 3)` would be one valid value.
-      pooling: Optional pooling mode for feature extraction
-          when `include_top` is `False`.
-          - `None` means that the output of the model will be
-              the 4D tensor output of the
-              last convolutional layer.
-          - `avg` means that global average pooling
-              will be applied to the output of the
-              last convolutional layer, and thus
-              the output of the model will be a 2D tensor.
-          - `max` means that global max pooling will
-              be applied.
-      classes: optional number of classes to classify images
-          into, only to be specified if `include_top` is True, and
-          if no `weights` argument is specified.
-
-  Returns:
-      A Keras model instance.
-
-  Raises:
-      ValueError: in case of invalid argument for `weights`,
-          or invalid input shape.
-      RuntimeError: If attempting to run this model with a
-          backend that does not support separable convolutions.
-  """
-  if not (weights in {'imagenet', None} or os.path.exists(weights)):
-    raise ValueError('The `weights` argument should be either '
-                     '`None` (random initialization), `imagenet` '
-                     '(pre-training on ImageNet), '
-                     'or the path to the weights file to be loaded.')
-
-  if weights == 'imagenet' and include_top and classes != 1000:
-    raise ValueError('If using `weights` as imagenet with `include_top`'
-                     ' as true, `classes` should be 1000')
-
-  if K.image_data_format() != 'channels_last':
-    logging.warning(
-        'The Xception model is only available for the '
-        'input data format "channels_last" '
-        '(width, height, channels). '
-        'However your settings specify the default '
-        'data format "channels_first" (channels, width, height). '
-        'You should set `image_data_format="channels_last"` in your Keras '
-        'config located at ~/.keras/keras.json. '
-        'The model being returned right now will expect inputs '
-        'to follow the "channels_last" data format.')
-    K.set_image_data_format('channels_last')
-    old_data_format = 'channels_first'
-  else:
-    old_data_format = None
-
-  # Determine proper input shape
-  input_shape = _obtain_input_shape(
-      input_shape,
-      default_size=299,
-      min_size=71,
-      data_format=K.image_data_format(),
-      require_flatten=False,
-      weights=weights)
-
-  if input_tensor is None:
-    img_input = Input(shape=input_shape)
-  else:
-    if not K.is_keras_tensor(input_tensor):
-      img_input = Input(tensor=input_tensor, shape=input_shape)
-    else:
-      img_input = input_tensor
-
-  x = Conv2D(
-      32, (3, 3), strides=(2, 2), use_bias=False, name='block1_conv1')(
-          img_input)
-  x = BatchNormalization(name='block1_conv1_bn')(x)
-  x = Activation('relu', name='block1_conv1_act')(x)
-  x = Conv2D(64, (3, 3), use_bias=False, name='block1_conv2')(x)
-  x = BatchNormalization(name='block1_conv2_bn')(x)
-  x = Activation('relu', name='block1_conv2_act')(x)
-
-  residual = Conv2D(
-      128, (1, 1), strides=(2, 2), padding='same', use_bias=False)(
-          x)
-  residual = BatchNormalization()(residual)
-
-  x = SeparableConv2D(
-      128, (3, 3), padding='same', use_bias=False, name='block2_sepconv1')(
-          x)
-  x = BatchNormalization(name='block2_sepconv1_bn')(x)
-  x = Activation('relu', name='block2_sepconv2_act')(x)
-  x = SeparableConv2D(
-      128, (3, 3), padding='same', use_bias=False, name='block2_sepconv2')(
-          x)
-  x = BatchNormalization(name='block2_sepconv2_bn')(x)
-
-  x = MaxPooling2D(
-      (3, 3), strides=(2, 2), padding='same', name='block2_pool')(
-          x)
-  x = layers.add([x, residual])
-
-  residual = Conv2D(
-      256, (1, 1), strides=(2, 2), padding='same', use_bias=False)(
-          x)
-  residual = BatchNormalization()(residual)
-
-  x = Activation('relu', name='block3_sepconv1_act')(x)
-  x = SeparableConv2D(
-      256, (3, 3), padding='same', use_bias=False, name='block3_sepconv1')(
-          x)
-  x = BatchNormalization(name='block3_sepconv1_bn')(x)
-  x = Activation('relu', name='block3_sepconv2_act')(x)
-  x = SeparableConv2D(
-      256, (3, 3), padding='same', use_bias=False, name='block3_sepconv2')(
-          x)
-  x = BatchNormalization(name='block3_sepconv2_bn')(x)
-
-  x = MaxPooling2D(
-      (3, 3), strides=(2, 2), padding='same', name='block3_pool')(
-          x)
-  x = layers.add([x, residual])
-
-  residual = Conv2D(
-      728, (1, 1), strides=(2, 2), padding='same', use_bias=False)(
-          x)
-  residual = BatchNormalization()(residual)
-
-  x = Activation('relu', name='block4_sepconv1_act')(x)
-  x = SeparableConv2D(
-      728, (3, 3), padding='same', use_bias=False, name='block4_sepconv1')(
-          x)
-  x = BatchNormalization(name='block4_sepconv1_bn')(x)
-  x = Activation('relu', name='block4_sepconv2_act')(x)
-  x = SeparableConv2D(
-      728, (3, 3), padding='same', use_bias=False, name='block4_sepconv2')(
-          x)
-  x = BatchNormalization(name='block4_sepconv2_bn')(x)
-
-  x = MaxPooling2D(
-      (3, 3), strides=(2, 2), padding='same', name='block4_pool')(
-          x)
-  x = layers.add([x, residual])
-
-  for i in range(8):
-    residual = x
-    prefix = 'block' + str(i + 5)
-
-    x = Activation('relu', name=prefix + '_sepconv1_act')(x)
-    x = SeparableConv2D(
-        728, (3, 3), padding='same', use_bias=False, name=prefix + '_sepconv1')(
-            x)
-    x = BatchNormalization(name=prefix + '_sepconv1_bn')(x)
-    x = Activation('relu', name=prefix + '_sepconv2_act')(x)
-    x = SeparableConv2D(
-        728, (3, 3), padding='same', use_bias=False, name=prefix + '_sepconv2')(
-            x)
-    x = BatchNormalization(name=prefix + '_sepconv2_bn')(x)
-    x = Activation('relu', name=prefix + '_sepconv3_act')(x)
-    x = SeparableConv2D(
-        728, (3, 3), padding='same', use_bias=False, name=prefix + '_sepconv3')(
-            x)
-    x = BatchNormalization(name=prefix + '_sepconv3_bn')(x)
-
-    x = layers.add([x, residual])
-
-  residual = Conv2D(
-      1024, (1, 1), strides=(2, 2), padding='same', use_bias=False)(
-          x)
-  residual = BatchNormalization()(residual)
-
-  x = Activation('relu', name='block13_sepconv1_act')(x)
-  x = SeparableConv2D(
-      728, (3, 3), padding='same', use_bias=False, name='block13_sepconv1')(
-          x)
-  x = BatchNormalization(name='block13_sepconv1_bn')(x)
-  x = Activation('relu', name='block13_sepconv2_act')(x)
-  x = SeparableConv2D(
-      1024, (3, 3), padding='same', use_bias=False, name='block13_sepconv2')(
-          x)
-  x = BatchNormalization(name='block13_sepconv2_bn')(x)
-
-  x = MaxPooling2D(
-      (3, 3), strides=(2, 2), padding='same', name='block13_pool')(
-          x)
-  x = layers.add([x, residual])
-
-  x = SeparableConv2D(
-      1536, (3, 3), padding='same', use_bias=False, name='block14_sepconv1')(
-          x)
-  x = BatchNormalization(name='block14_sepconv1_bn')(x)
-  x = Activation('relu', name='block14_sepconv1_act')(x)
-
-  x = SeparableConv2D(
-      2048, (3, 3), padding='same', use_bias=False, name='block14_sepconv2')(
-          x)
-  x = BatchNormalization(name='block14_sepconv2_bn')(x)
-  x = Activation('relu', name='block14_sepconv2_act')(x)
-
-  if include_top:
-    x = GlobalAveragePooling2D(name='avg_pool')(x)
-    x = Dense(classes, activation='softmax', name='predictions')(x)
-  else:
-    if pooling == 'avg':
-      x = GlobalAveragePooling2D()(x)
-    elif pooling == 'max':
-      x = GlobalMaxPooling2D()(x)
-
-  # Ensure that the model takes into account
-  # any potential predecessors of `input_tensor`.
-  if input_tensor is not None:
-    inputs = layer_utils.get_source_inputs(input_tensor)
-  else:
-    inputs = img_input
-  # Create model.
-  model = Model(inputs, x, name='xception')
-
-  # load weights
-  if weights == 'imagenet':
-    if include_top:
-      weights_path = get_file(
-          'xception_weights_tf_dim_ordering_tf_kernels.h5',
-          TF_WEIGHTS_PATH,
-          cache_subdir='models',
-          file_hash='0a58e3b7378bc2990ea3b43d5981f1f6')
-    else:
-      weights_path = get_file(
-          'xception_weights_tf_dim_ordering_tf_kernels_notop.h5',
-          TF_WEIGHTS_PATH_NO_TOP,
-          cache_subdir='models',
-          file_hash='b0042744bf5b25fce3cb969f33bebb97')
-    model.load_weights(weights_path)
-  elif weights is not None:
-    model.load_weights(weights)
-
-  if old_data_format:
-    K.set_image_data_format(old_data_format)
-  return model
-
-
-@tf_export('keras.applications.xception.preprocess_input')
-def preprocess_input(x):
-  """Preprocesses a numpy array encoding a batch of images.
-
-  Arguments:
-      x: a 4D numpy array consists of RGB values within [0, 255].
-
-  Returns:
-      Preprocessed array.
-  """
-  return imagenet_utils.preprocess_input(x, mode='tf')
+tf_export('keras.applications.xception.Xception',
+          'keras.applications.Xception')(Xception)
+tf_export('keras.applications.xception.preprocess_input')(preprocess_input)
diff --git a/tensorflow/python/keras/applications/xception_test.py b/tensorflow/python/keras/applications/xception_test.py
deleted file mode 100644
index 7e2efd0..0000000
--- a/tensorflow/python/keras/applications/xception_test.py
+++ /dev/null
@@ -1,57 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for Xception application."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.python import keras
-from tensorflow.python.platform import test
-
-
-class XceptionTest(test.TestCase):
-
-  def test_with_top(self):
-    model = keras.applications.Xception(weights=None)
-    self.assertEqual(model.output_shape, (None, 1000))
-
-  def test_no_top(self):
-    model = keras.applications.Xception(weights=None, include_top=False)
-    self.assertEqual(model.output_shape, (None, None, None, 2048))
-
-  def test_with_pooling(self):
-    model = keras.applications.Xception(weights=None,
-                                        include_top=False,
-                                        pooling='avg')
-    self.assertEqual(model.output_shape, (None, 2048))
-
-  def test_weight_loading(self):
-    with self.assertRaises(ValueError):
-      keras.applications.Xception(weights='unknown',
-                                  include_top=False)
-    with self.assertRaises(ValueError):
-      keras.applications.Xception(weights='imagenet',
-                                  classes=2000)
-
-  def test_preprocess_input(self):
-    x = np.random.uniform(0, 255, (2, 300, 200, 3))
-    out1 = keras.applications.xception.preprocess_input(x)
-    self.assertAllClose(np.mean(out1), 0., atol=0.1)
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/python/keras/callbacks.py b/tensorflow/python/keras/callbacks.py
index 070d411..befe82f 100644
--- a/tensorflow/python/keras/callbacks.py
+++ b/tensorflow/python/keras/callbacks.py
@@ -22,6 +22,7 @@
 from collections import deque
 from collections import Iterable
 from collections import OrderedDict
+import copy
 import csv
 import json
 import math
@@ -31,10 +32,12 @@
 import numpy as np
 import six
 
+from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras.engine.training_utils import standardize_input_data
+from tensorflow.python.keras.utils.data_utils import Sequence
 from tensorflow.python.keras.utils.generic_utils import Progbar
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import state_ops
@@ -52,6 +55,110 @@
   requests = None
 
 
+def configure_callbacks(callbacks,
+                        model,
+                        do_validation=False,
+                        val_inputs=None,
+                        val_targets=None,
+                        val_sample_weights=None,
+                        batch_size=None,
+                        epochs=None,
+                        steps_per_epoch=None,
+                        samples=None,
+                        validation_steps=None,
+                        verbose=1,
+                        count_mode='steps'):
+  """Configures callbacks for use in various training loops.
+
+  Arguments:
+      callbacks: List of Callbacks.
+      model: Model being trained.
+      do_validation: Whether or not validation loop will be run.
+      val_inputs: Inputs to Model for validation loop. Can be any
+        data format Keras accepts.
+      val_targets: Targets for Model for validation loop. Can be any
+        data format Keras accepts.
+      val_sample_weights: Sample weights for Model for validation loop.
+        Can be any data format Keras accepts.
+      batch_size: Number of samples per batch.
+      epochs: Number of epochs to train.
+      steps_per_epoch: Number of batches to run per training epoch.
+      samples: Number of training samples.
+      validation_steps: Number of batches to run per validation epoch.
+      verbose: int, 0 or 1. Keras logging verbosity to pass to ProgbarLogger.
+      count_mode: One of 'steps' or 'samples'. Per-batch or per-sample count.
+
+  Returns:
+      Instance of CallbackList used to control all Callbacks.
+  """
+
+  # Add additional callbacks
+  model.history = History()
+  stateful_metric_names = None
+  if hasattr(model, 'stateful_metric_names'):
+    stateful_metric_names = model.stateful_metric_names
+  callbacks = [BaseLogger(stateful_metrics=stateful_metric_names)
+              ] + (callbacks or []) + [model.history]
+  if verbose:
+    callbacks.append(
+        ProgbarLogger(count_mode, stateful_metrics=stateful_metric_names))
+  callback_list = CallbackList(callbacks)
+
+  # Set callback model
+  callback_model = model._get_callback_model()  # pylint: disable=protected-access
+  if do_validation and val_inputs and not context.executing_eagerly():
+    # Need to create the test_function before start of the first epoch
+    # because TensorBoard callback on_epoch_begin adds summary to the
+    # list of fetches of the test_function
+    callback_model._make_test_function()  # pylint: disable=protected-access
+  callback_list.set_model(callback_model)
+
+  # Set callback parameters
+  callback_metrics = []
+  # When we have deferred build scenario with iterator input, we will compile
+  # when we standardize first batch of data.
+  if model._is_compiled:  # pylint: disable=protected-access
+    callback_metrics = copy.copy(model.metrics_names)
+    if do_validation:
+      callback_metrics += ['val_' + n for n in model.metrics_names]
+  if validation_steps is None and isinstance(val_inputs, Sequence):
+    validation_steps = len(val_inputs)
+  callback_params = {
+      'batch_size': batch_size,
+      'epochs': epochs,
+      'steps': steps_per_epoch,
+      'samples': samples,
+      'verbose': verbose,
+      'do_validation': do_validation,
+      'metrics': callback_metrics,
+      'validation_steps': validation_steps
+  }
+  callback_list.set_params(callback_params)
+
+  # Pass validation data to callbacks
+  if not val_inputs:
+    val_data = []
+  elif _is_generator_like(val_inputs):
+    val_data = val_inputs
+  else:
+    val_data = val_inputs + val_targets
+    if val_sample_weights:
+      val_data += val_sample_weights
+    if model.uses_learning_phase and not isinstance(K.learning_phase(), int):
+      val_data += [0.]
+  for cbk in callbacks:
+    cbk.validation_data = val_data
+
+  callback_list.model.stop_training = False
+  return callback_list
+
+
+def _is_generator_like(data):
+  """Checks if data is a generator, Sequence, or Iterator."""
+  return (hasattr(data, 'next') or hasattr(data, '__next__') or isinstance(
+      data, (Sequence, iterator_ops.Iterator, iterator_ops.EagerIterator)))
+
+
 class CallbackList(object):
   """Container abstracting a list of callbacks.
 
@@ -65,15 +172,19 @@
     callbacks = callbacks or []
     self.callbacks = [c for c in callbacks]
     self.queue_length = queue_length
+    self.params = {}
+    self.model = None
 
   def append(self, callback):
     self.callbacks.append(callback)
 
   def set_params(self, params):
+    self.params = params
     for callback in self.callbacks:
       callback.set_params(params)
 
   def set_model(self, model):
+    self.model = model
     for callback in self.callbacks:
       callback.set_model(model)
 
@@ -722,7 +833,7 @@
   Raises:
       ValueError: If histogram_freq is set and no validation data is provided.
 
-  @compatbility(eager)
+  @compatibility(eager)
   Using `Tensorboard` callback will work while eager execution is enabled,
   however outputting histogram summaries of weights and gradients is not
   supported, and thus `histogram_freq` will be ignored.
@@ -939,7 +1050,7 @@
     """Checks if histogram summaries can be run."""
     # will never be set when in eager
     if self.histogram_freq:
-      if 'validation_steps' in self.params:
+      if self.params.get('validation_steps', None) is not None:
         self._validation_batches = self.params['validation_steps']
       elif self.validation_data:
         self._validation_batches = math.ceil(
diff --git a/tensorflow/python/keras/callbacks_test.py b/tensorflow/python/keras/callbacks_test.py
index bd088a5..e84e023 100644
--- a/tensorflow/python/keras/callbacks_test.py
+++ b/tensorflow/python/keras/callbacks_test.py
@@ -728,6 +728,8 @@
           verbose=0)
 
       # fit generator without validation data
+      # histogram_freq must be zero
+      tsb.histogram_freq = 0
       model.fit_generator(
           data_generator(True),
           len(x_train),
@@ -736,6 +738,7 @@
           verbose=0)
 
       # fit generator with validation data and accuracy
+      tsb.histogram_freq = 1
       model.fit_generator(
           data_generator(True),
           len(x_train),
@@ -745,6 +748,7 @@
           verbose=0)
 
       # fit generator without validation data and accuracy
+      tsb.histogram_freq = 0
       model.fit_generator(
           data_generator(True), len(x_train), epochs=2, callbacks=cbks)
       assert os.path.exists(temp_dir)
diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py
index 33ad155..d6d3db2 100644
--- a/tensorflow/python/keras/engine/base_layer.py
+++ b/tensorflow/python/keras/engine/base_layer.py
@@ -500,13 +500,13 @@
       use_resource: Whether to use `ResourceVariable`.
       synchronization: Indicates when a distributed a variable will be
         aggregated. Accepted values are constants defined in the class
-        @{tf.VariableSynchronization}. By default the synchronization is set to
+        `tf.VariableSynchronization`. By default the synchronization is set to
         `AUTO` and the current `DistributionStrategy` chooses
         when to synchronize. If `synchronization` is set to `ON_READ`,
         `trainable` must not be set to `True`.
       aggregation: Indicates how a distributed variable will be aggregated.
         Accepted values are constants defined in the class
-        @{tf.VariableAggregation}.
+        `tf.VariableAggregation`.
       getter: Variable getter argument to be passed to the `Checkpointable` API.
 
     Returns:
@@ -1921,13 +1921,13 @@
     use_resource: Whether to use a `ResourceVariable`.
     synchronization: Indicates when a distributed a variable will be
       aggregated. Accepted values are constants defined in the class
-      @{tf.VariableSynchronization}. By default the synchronization is set to
+      `tf.VariableSynchronization`. By default the synchronization is set to
       `AUTO` and the current `DistributionStrategy` chooses
       when to synchronize. If `synchronization` is set to `ON_READ`,
       `trainable` must not be set to `True`.
     aggregation: Indicates how a distributed variable will be aggregated.
       Accepted values are constants defined in the class
-      @{tf.VariableAggregation}.
+      `tf.VariableAggregation`.
     partitioner: Not handled at this time.
 
   Returns:
diff --git a/tensorflow/python/keras/engine/distributed_training_utils.py b/tensorflow/python/keras/engine/distributed_training_utils.py
index c78e6fe..fcb0733 100644
--- a/tensorflow/python/keras/engine/distributed_training_utils.py
+++ b/tensorflow/python/keras/engine/distributed_training_utils.py
@@ -184,14 +184,16 @@
   """Validate all the components of a DistributedValue Dataset input.
 
   Args:
-    distribution_strategy: The current DistributionStrategy using to call
+    distribution_strategy: The current DistributionStrategy used to call
         `fit`/`evaluate`.
     x: Input Dataset DistributedValue object. For example, when we use
         `MirroredStrategy` this is a PerDevice object with a tensor for each
-        device set in the dict.
+        device set in the dict. x can also be a tuple or dict. The keys of the
+        dict should match the names of the input layers of the model.
     y: Target Dataset DistributedValue object. For example, when we use
         `MirroredStrategy` this is a PerDevice object with a tensor for each
-        device set in the dict.
+        device set in the dict. y can also be a tuple or dict. The keys of the
+        dict should match the names of the output layers of the model.
 
   Returns:
     The unwrapped values list of the x and y DistributedValues inputs.
@@ -206,30 +208,50 @@
   # and targets to a model should be from a `tf.data.Dataset`.
 
   # If each element of x and y are not tensors, we cannot standardize and
-  # validate the input and targets.`
-  if not tensor_util.is_tensor(x):
-    raise ValueError('Dataset input to the model should be tensors instead they'
-                     ' are of type {}'.format(type(x)))
+  # validate the input and targets.
+  x_values_list = validate_per_device_inputs(distribution_strategy, x)
 
-  if not tensor_util.is_tensor(y):
-    raise ValueError('Dataset input to the model should be tensors instead they'
-                     ' are of type {}'.format(type(y)))
-
-  # At this point both x and y contain tensors in the `DistributedValues`
-  # structure.
-  x_values = distribution_strategy.unwrap(x)
-  y_values = distribution_strategy.unwrap(y)
-
-  # Validate that the shape and dtype of all the elements in x are the same.
-  validate_all_tensor_shapes(x, x_values)
-  validate_all_tensor_types(x, x_values)
-
-  # Similarly for y, we perform the same validation
-  validate_all_tensor_shapes(y, y_values)
-  validate_all_tensor_types(y, y_values)
+  y_values_list = validate_per_device_inputs(distribution_strategy, y)
 
   # Return the unwrapped values to avoid calling `unwrap` a second time.
-  return x_values, y_values
+  return x_values_list, y_values_list
+
+
+def validate_per_device_inputs(distribution_strategy, x):
+  """Validates PerDevice dataset input list.
+
+  Args:
+    distribution_strategy: The current DistributionStrategy used to call
+      `fit`, `evaluate` and `predict`.
+    x: A list of PerDevice objects that represent the input or
+      target values.
+
+  Returns:
+    List containing the first element of each of the PerDevice objects in
+    the input list.
+
+  Raises:
+    ValueError: If any element of the flattened input `x` is not a tensor.
+
+  """
+  # Convert the inputs and targets into a list of PerDevice objects.
+  per_device_list = nest.flatten(x)
+  x_values_list = []
+  for x in per_device_list:
+    if not tensor_util.is_tensor(x):
+      raise ValueError('Dataset input to the model should be tensors instead '
+                       'they are of type {}'.format(type(x)))
+
+    # At this point both x and y contain tensors in the `DistributedValues`
+    # structure.
+    x_values = distribution_strategy.unwrap(x)
+
+    # Validate that the shape and dtype of all the elements in x are the same.
+    validate_all_tensor_shapes(x, x_values)
+    validate_all_tensor_types(x, x_values)
+
+    x_values_list.append(x_values[0])
+  return x_values_list
 
 
 def validate_all_tensor_types(x, x_values):
diff --git a/tensorflow/python/keras/engine/network.py b/tensorflow/python/keras/engine/network.py
index 8f35794..708fa1c 100644
--- a/tensorflow/python/keras/engine/network.py
+++ b/tensorflow/python/keras/engine/network.py
@@ -43,6 +43,7 @@
 from tensorflow.python.keras.utils.io_utils import ask_to_proceed_with_overwrite
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import checkpoint_management
 from tensorflow.python.training.checkpointable import base as checkpointable
 from tensorflow.python.training.checkpointable import data_structures
 from tensorflow.python.training.checkpointable import layer_utils as checkpointable_layer_utils
@@ -116,6 +117,16 @@
     # included in base_init to avoid excessive special casing when retrieving
     # the value).
     self._extra_variables = []
+    # In many internal cases one needs to compute both the model's output
+    # and its output mask without relying on `__call__` (which would do both and
+    # set mask metadata), but for models, computing the mask requires
+    # recomputing the output.
+    # Hence the pattern `output = model.call(); mask = model.compute_mask()`
+    # would be redundant, and internal logic
+    # (which might otherwise use `call` directly) should prefer the
+    # internal method `output, mask = _call_and_compute_mask()`.
+    # This is True for Sequential networks and graph networks.
+    self._compute_output_and_mask_jointly = False
 
     self.supports_masking = False
     if not hasattr(self, 'optimizer'):
@@ -219,6 +230,7 @@
     # A Network does not create weights of its own, thus it is already
     # built.
     self.built = True
+    self._compute_output_and_mask_jointly = True
     self._is_graph_network = True
 
     self._input_layers = []
@@ -819,6 +831,10 @@
         A tensor if there is a single output, or
         a list of tensors if there are more than one outputs.
     """
+    if not self._is_graph_network:
+      raise NotImplementedError('When subclassing the `Model` class, you should'
+                                ' implement a `call` method.')
+
     inputs = generic_utils.to_list(inputs)
     if mask is None:
       masks = [None for _ in range(len(inputs))]
@@ -1007,7 +1023,8 @@
                 kwargs.setdefault('mask', computed_mask)
 
               # Compute outputs and masks.
-              if isinstance(layer, Network) and layer._is_graph_network:
+              if (isinstance(layer, Network) and
+                  layer._compute_output_and_mask_jointly):
                 output_tensors, output_masks = layer._call_and_compute_mask(
                     computed_tensor, **kwargs)
               else:
@@ -1027,7 +1044,8 @@
                 kwargs.setdefault('mask', computed_masks)
 
               # Compute outputs and masks.
-              if isinstance(layer, Network) and layer._is_graph_network:
+              if (isinstance(layer, Network) and
+                  layer._compute_output_and_mask_jointly):
                 output_tensors, output_masks = layer._call_and_compute_mask(
                     computed_tensors, **kwargs)
               else:
@@ -1438,6 +1456,11 @@
              'saved.\n\nConsider using a TensorFlow optimizer from `tf.train`.')
             % (optimizer,))
       self._checkpointable_saver.save(filepath, session=session)
+      # Record this checkpoint so it's visible from tf.train.latest_checkpoint.
+      checkpoint_management.update_checkpoint_state(
+          save_dir=os.path.dirname(filepath),
+          model_checkpoint_path=filepath,
+          all_model_checkpoint_paths=[filepath])
 
   def load_weights(self, filepath, by_name=False):
     """Loads all layer weights, either from a TensorFlow or an HDF5 weight file.
diff --git a/tensorflow/python/keras/engine/saving.py b/tensorflow/python/keras/engine/saving.py
index d5ccd44..a2eed7c 100644
--- a/tensorflow/python/keras/engine/saving.py
+++ b/tensorflow/python/keras/engine/saving.py
@@ -127,6 +127,7 @@
                 },
                 'loss': model.loss,
                 'metrics': model.metrics,
+                'weighted_metrics': model.weighted_metrics,
                 'sample_weight_mode': model.sample_weight_mode,
                 'loss_weights': model.loss_weights,
             },
@@ -246,6 +247,8 @@
       # Recover loss functions and metrics.
       loss = convert_custom_objects(training_config['loss'])
       metrics = convert_custom_objects(training_config['metrics'])
+      weighted_metrics = convert_custom_objects(
+          training_config['weighted_metrics'])
       sample_weight_mode = training_config['sample_weight_mode']
       loss_weights = training_config['loss_weights']
 
@@ -254,6 +257,7 @@
           optimizer=optimizer,
           loss=loss,
           metrics=metrics,
+          weighted_metrics=weighted_metrics,
           loss_weights=loss_weights,
           sample_weight_mode=sample_weight_mode)
 
diff --git a/tensorflow/python/keras/engine/saving_test.py b/tensorflow/python/keras/engine/saving_test.py
index f2f8a27..b7c2e9c 100644
--- a/tensorflow/python/keras/engine/saving_test.py
+++ b/tensorflow/python/keras/engine/saving_test.py
@@ -36,6 +36,7 @@
 from tensorflow.python.ops import random_ops
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import checkpoint_management
 from tensorflow.python.training import training as training_module
 
 try:
@@ -337,10 +338,18 @@
       model.add(keras.layers.Dense(2, input_shape=(3,)))
       model.add(keras.layers.RepeatVector(3))
       model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
-      model.compile(loss=keras.losses.MSE,
-                    optimizer=keras.optimizers.RMSprop(lr=0.0001),
-                    metrics=[keras.metrics.categorical_accuracy],
-                    sample_weight_mode='temporal')
+      model.compile(
+          loss=keras.losses.MSE,
+          optimizer=keras.optimizers.RMSprop(lr=0.0001),
+          metrics=[
+              keras.metrics.categorical_accuracy,
+              keras.metrics.CategoricalAccuracy()
+          ],
+          weighted_metrics=[
+              keras.metrics.categorical_accuracy,
+              keras.metrics.CategoricalAccuracy()
+          ],
+          sample_weight_mode='temporal')
       x = np.random.random((1, 3))
       y = np.random.random((1, 3, 3))
       model.train_on_batch(x, y)
@@ -435,9 +444,17 @@
       output = keras.layers.Dense(3)(x)
 
       model = keras.models.Model(inputs, output)
-      model.compile(loss=keras.losses.MSE,
-                    optimizer=keras.optimizers.RMSprop(lr=0.0001),
-                    metrics=[keras.metrics.categorical_accuracy])
+      model.compile(
+          loss=keras.losses.MSE,
+          optimizer=keras.optimizers.RMSprop(lr=0.0001),
+          metrics=[
+              keras.metrics.categorical_accuracy,
+              keras.metrics.CategoricalAccuracy()
+          ],
+          weighted_metrics=[
+              keras.metrics.categorical_accuracy,
+              keras.metrics.CategoricalAccuracy()
+          ])
       x = np.random.random((1, 3))
       y = np.random.random((1, 3))
       model.train_on_batch(x, y)
@@ -623,9 +640,13 @@
       outputs = keras.layers.Dense(3)(x)
 
       model = keras.Model(inputs, outputs)
-      model.compile(loss=keras.losses.MSE,
-                    optimizer=keras.optimizers.Adam(),
-                    metrics=[keras.metrics.categorical_accuracy])
+      model.compile(
+          loss=keras.losses.MSE,
+          optimizer=keras.optimizers.Adam(),
+          metrics=[
+              keras.metrics.categorical_accuracy,
+              keras.metrics.CategoricalAccuracy()
+          ])
       x = np.random.random((1, 3))
       y = np.random.random((1, 3))
       model.train_on_batch(x, y)
@@ -744,7 +765,7 @@
       model.compile(
           loss='mse',
           optimizer=training_module.RMSPropOptimizer(0.1),
-          metrics=['acc'])
+          metrics=['acc', keras.metrics.CategoricalAccuracy()])
       temp_dir = self.get_temp_dir()
       prefix = os.path.join(temp_dir, 'ckpt')
       train_x = np.random.random((3, 2))
@@ -781,7 +802,7 @@
       load_model.compile(
           loss='mse',
           optimizer=training_module.RMSPropOptimizer(0.1),
-          metrics=['acc'])
+          metrics=['acc', keras.metrics.CategoricalAccuracy()])
       load_model.train_on_batch(train_x, train_y)
       self.assertAllClose(ref_y_after_train, self.evaluate(load_model(x)))
 
@@ -813,6 +834,9 @@
         session.run([v.initializer for v in model.variables])
       ref_y = self.evaluate(ref_y_tensor)
       model.save_weights(prefix)
+      self.assertEqual(
+          prefix,
+          checkpoint_management.latest_checkpoint(temp_dir))
       for v in model.variables:
         self.evaluate(
             v.assign(random_ops.random_normal(shape=array_ops.shape(v))))
diff --git a/tensorflow/python/keras/engine/sequential.py b/tensorflow/python/keras/engine/sequential.py
index 41cdfda..415b15f 100644
--- a/tensorflow/python/keras/engine/sequential.py
+++ b/tensorflow/python/keras/engine/sequential.py
@@ -21,15 +21,18 @@
 
 import copy
 
-from tensorflow.python.keras import backend as K
+from tensorflow.python.eager import context
+from tensorflow.python.framework import ops
 from tensorflow.python.keras import layers as layer_module
 from tensorflow.python.keras.engine import base_layer
 from tensorflow.python.keras.engine.input_layer import Input
 from tensorflow.python.keras.engine.input_layer import InputLayer
+from tensorflow.python.keras.engine.network import Network
 from tensorflow.python.keras.engine.training import Model
 from tensorflow.python.keras.utils import layer_utils
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.util import tf_inspect
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -92,8 +95,12 @@
   ```
   """
 
+  @checkpointable.no_automatic_dependency_tracking
   def __init__(self, layers=None, name=None):
     super(Sequential, self).__init__(name=name)
+    self.supports_masking = True
+    self._build_input_shape = None
+    self._compute_output_and_mask_jointly = True
 
     # Add to the model any layers passed to the constructor.
     if layers:
@@ -105,9 +112,12 @@
     # Historically, `sequential.layers` only returns layers that were added
     # via `add`, and omits the auto-generated `InputLayer` that comes at the
     # bottom of the stack.
-    if self._layers and isinstance(self._layers[0], InputLayer):
-      return self._layers[1:]
-    return self._layers
+    # `CheckpointableBase` manages the `_layers` attributes and does filtering
+    # over it.
+    layers = super(Sequential, self).layers
+    if layers and isinstance(layers[0], InputLayer):
+      return layers[1:]
+    return layers[:]
 
   @checkpointable.no_automatic_dependency_tracking
   def add(self, layer):
@@ -129,30 +139,16 @@
                       'an instance of class Layer. '
                       'Found: ' + str(layer))
     self.built = False
+    set_inputs = False
     if not self._layers:
-      set_inputs = False
-      # First layer in model: check that it is an input layer.
-      if not isinstance(layer, InputLayer):
-        # Create an input tensor and call `layer` on the input tensor.
-        # First, we need to infer the expected input shape and dtype.
-        first_layer = layer
-        if isinstance(layer, (Model, Sequential)):
-          # We were passed a model as first layer.
-          # This requires a specific way to figure out the
-          # input shape and dtype.
-          if not layer.layers:
-            raise ValueError('Cannot add an empty model '
-                             'to a `Sequential` model.')
-          # In case of nested models: recover the first layer
-          # of the deepest model to infer input shape and dtype.
-          first_layer = layer.layers[0]
-          while isinstance(first_layer, (Model, Sequential)):
-            first_layer = first_layer.layers[0]
-
-        if hasattr(first_layer, '_batch_input_shape'):
-          batch_shape = first_layer._batch_input_shape
-          dtype = first_layer.dtype
-          # Instantiate the input layer.
+      if isinstance(layer, InputLayer):
+        # Corner case where the user passes an InputLayer layer via `add`.
+        assert len(layer._inbound_nodes[-1].output_tensors) == 1
+        set_inputs = True
+      else:
+        batch_shape, dtype = get_input_shape_and_dtype(layer)
+        if batch_shape:
+          # Instantiate an input layer.
           x = Input(
               batch_shape=batch_shape,
               dtype=dtype,
@@ -162,25 +158,20 @@
           # to the input layer we just created.
           layer(x)
           set_inputs = True
-        else:
-          # The layer doesn't know about its expected shape. We will have to
-          # build the model lazily on `fit`/etc.
-          batch_shape = None
-      else:
-        # Corner case where the user passes an InputLayer layer via `add`.
-        assert len(layer._inbound_nodes[-1].output_tensors) == 1
-        set_inputs = True
 
       if set_inputs:
+        # If an input layer (placeholder) is available.
         if len(layer._inbound_nodes[-1].output_tensors) != 1:
           raise ValueError('All layers in a Sequential model '
                            'should have a single output tensor. '
                            'For multi-output layers, '
                            'use the functional API.')
-
         self.outputs = [layer._inbound_nodes[-1].output_tensors[0]]
         self.inputs = layer_utils.get_source_inputs(self.outputs[0])
+
     elif self.outputs:
+      # If the model is being built continuously on top of an input layer:
+      # refresh its output.
       output_tensor = layer(self.outputs[0])
       if isinstance(output_tensor, list):
         raise TypeError('All layers in a Sequential model '
@@ -188,10 +179,13 @@
                         'For multi-output layers, '
                         'use the functional API.')
       self.outputs = [output_tensor]
-    if self.inputs:
-      self.build()
+    if set_inputs or self._is_graph_network:
+      self._init_graph_network(self.inputs, self.outputs, name=self.name)
+      self.built = True
     else:
       self._layers.append(layer)
+    if self._layers:
+      self._track_layers(self._layers)
 
   @checkpointable.no_automatic_dependency_tracking
   def pop(self):
@@ -204,54 +198,73 @@
       raise TypeError('There are no layers in the model.')
 
     self._layers.pop()
-    self.built = False
     if not self.layers:
       self.outputs = None
       self.inputs = None
-    elif self.outputs:
+      self.built = False
+    elif self._is_graph_network:
       self.layers[-1]._outbound_nodes = []
       self.outputs = [self.layers[-1].output]
-      self.build()
-
-  def build(self, input_shape=None):
-    self._set_inputs_and_outputs(input_shape=input_shape)
-
-  def symbolic_set_inputs(self, inputs):
-    self._set_inputs_and_outputs(tensor=inputs)
-
-  @checkpointable.no_automatic_dependency_tracking
-  def _set_inputs_and_outputs(self, input_shape=None, tensor=None):
-    """Set model's input and output specs based on the input received.
-
-    If `tensor` is provided, `input_shape` is not required.
-
-    Args:
-      input_shape: Optional shape of input.
-      tensor: Optional existing tensor to wrap into the `Input` layer.
-    """
-    if not self.inputs:
-      dtype = K.floatx()
-      if tensor is not None:
-        batch_shape = (None,) + tuple(tensor.get_shape().as_list()[1:])
-        x = Input(dtype=dtype, name=self.name + '_input', tensor=tensor)
-      elif input_shape is not None:
-        batch_shape = tuple(input_shape)
-        x = Input(
-            batch_shape=batch_shape, dtype=dtype, name=self.name + '_input')
-      self.inputs = [x]
-      for layer in self._layers:
-        x = layer(x)
-      self.outputs = [x]
-      # Make sure that the model's input shape will be preserved during
-      # serialization.
-      if self._layers:
-        self._layers[0]._batch_input_shape = batch_shape
-
-    if self.inputs:
       self._init_graph_network(self.inputs, self.outputs, name=self.name)
       self.built = True
-    if self._layers:
-      self._track_layers(self._layers)
+
+  def build(self, input_shape=None):
+    if self._is_graph_network:
+      self._init_graph_network(self.inputs, self.outputs, name=self.name)
+    else:
+      if input_shape is None:
+        raise ValueError('You must provide an `input_shape` argument.')
+      self._build_input_shape = input_shape
+      shape = input_shape
+      for layer in self.layers:
+        if not layer.built:
+          with ops.name_scope(layer._name_scope()):
+            layer.build(shape)
+          layer.built = True
+        shape = layer.compute_output_shape(shape)
+    self.built = True
+
+  def call(self, inputs, training=None, mask=None):
+    if self._is_graph_network:
+      return super(Sequential, self).call(inputs, training=training, mask=mask)
+
+    outputs, _ = self._call_and_compute_mask(
+        inputs, training=training, mask=mask)
+    return outputs
+
+  def _call_and_compute_mask(self, inputs, training=None, mask=None):
+    if not self.built:
+      self.build(inputs.shape)
+
+    x = inputs
+    for layer in self.layers:
+      kwargs = {}
+      if 'mask' in tf_inspect.getargspec(layer.call).args:
+        kwargs['mask'] = mask
+      if 'training' in tf_inspect.getargspec(layer.call).args:
+        kwargs['training'] = training
+
+      if isinstance(layer, Network) and layer._compute_output_and_mask_jointly:
+        x, mask = layer._call_and_compute_mask(x, **kwargs)
+      else:
+        x = layer.call(x, **kwargs)
+        if layer.supports_masking:
+          mask = layer.compute_mask(x, mask)
+        else:
+          mask = None
+      if not context.executing_eagerly():
+        x._keras_mask = mask
+    return x, mask
+
+  def compute_output_shape(self, input_shape):
+    shape = input_shape
+    for layer in self.layers:
+      shape = layer.compute_output_shape(shape)
+    return shape
+
+  def compute_mask(self, inputs, mask):
+    _, mask = self._call_and_compute_mask(inputs, mask=mask)
+    return mask
 
   def predict_proba(self, x, batch_size=32, verbose=0):
     """Generates class probability predictions for the input samples.
@@ -296,18 +309,69 @@
       return (proba > 0.5).astype('int32')
 
   def get_config(self):
-    config = []
+    layer_configs = []
     for layer in self.layers:
-      config.append({
+      layer_configs.append({
           'class_name': layer.__class__.__name__,
           'config': layer.get_config()
       })
-    return copy.deepcopy(config)
+    config = {
+        'name': self.name,
+        'layers': copy.deepcopy(layer_configs)
+    }
+    if self._build_input_shape:
+      config['build_input_shape'] = self._build_input_shape
+    return config
 
   @classmethod
   def from_config(cls, config, custom_objects=None):
-    model = cls()
-    for conf in config:
-      layer = layer_module.deserialize(conf, custom_objects=custom_objects)
+    if 'name' in config:
+      name = config['name']
+      build_input_shape = config.get('build_input_shape')
+      layer_configs = config['layers']
+    else:
+      name = None
+      build_input_shape = None
+    model = cls(name=name)
+    for layer_config in layer_configs:
+      layer = layer_module.deserialize(layer_config,
+                                       custom_objects=custom_objects)
       model.add(layer)
+    if not model.inputs and build_input_shape:
+      model.build(build_input_shape)
     return model
+
+
+def get_input_shape_and_dtype(layer):
+  """Retrieve input shape and input dtype of layer if applicable.
+
+  Args:
+    layer: Layer (or model) instance.
+
+  Returns:
+    Tuple (input_shape, input_dtype). Both could be None if the layer
+      does not have a defined input shape.
+
+  Raises:
+    ValueError: in case an empty Sequential or Graph Network is passed.
+  """
+  if ((isinstance(layer, Model) and layer._is_graph_network)
+      or isinstance(layer, Sequential)):
+    # We were passed a model as first layer.
+    # This requires a specific way to figure out the
+    # input shape and dtype.
+    if not layer.layers:
+      raise ValueError('Cannot add an empty model '
+                       'to a `Sequential` model.')
+    # In case of nested models: recover the first layer
+    # of the deepest model to infer input shape and dtype.
+    layer = layer.layers[0]
+    while ((isinstance(layer, Model) and layer._is_graph_network)
+           or isinstance(layer, Sequential)):
+      layer = layer.layers[0]
+
+  if hasattr(layer, '_batch_input_shape'):
+    batch_shape = layer._batch_input_shape
+    dtype = layer.dtype
+    return batch_shape, dtype
+  return None, None
diff --git a/tensorflow/python/keras/engine/sequential_test.py b/tensorflow/python/keras/engine/sequential_test.py
index 4f4adca..3f8e120 100644
--- a/tensorflow/python/keras/engine/sequential_test.py
+++ b/tensorflow/python/keras/engine/sequential_test.py
@@ -18,17 +18,30 @@
 from __future__ import division
 from __future__ import print_function
 
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python import keras
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import function
 from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 from tensorflow.python.training import rmsprop
 
 
-class TestSequential(test.TestCase):
+def _get_small_mlp(num_hidden, num_classes, input_dim=None):
+  model = keras.models.Sequential()
+  if input_dim:
+    model.add(keras.layers.Dense(num_hidden, activation='relu',
+                                 input_dim=input_dim))
+  else:
+    model.add(keras.layers.Dense(num_hidden, activation='relu'))
+  model.add(keras.layers.Dense(num_classes, activation='softmax'))
+  return model
+
+
+class TestSequential(test.TestCase, parameterized.TestCase):
   """Most Sequential model API tests are covered in `training_test.py`.
   """
 
@@ -50,9 +63,7 @@
     batch_size = 5
     num_classes = 2
 
-    model = keras.models.Sequential()
-    model.add(keras.layers.Dense(num_hidden, input_dim=input_dim))
-    model.add(keras.layers.Dense(num_classes))
+    model = _get_small_mlp(num_hidden, num_classes, input_dim)
     model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3))
     x = np.random.random((batch_size, input_dim))
     y = np.random.random((batch_size, num_classes))
@@ -83,11 +94,11 @@
     batch_size = 5
     num_classes = 2
 
-    model = keras.models.Sequential()
-    # We don't specify the input shape.
-    model.add(keras.layers.Dense(num_hidden))
-    model.add(keras.layers.Dense(num_classes))
-    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3))
+    model = _get_small_mlp(num_hidden, num_classes)
+    model.compile(
+        loss='mse',
+        optimizer=rmsprop.RMSPropOptimizer(1e-3),
+        metrics=[keras.metrics.CategoricalAccuracy()])
     self.assertEqual(len(model.layers), 2)
     self.assertEqual(len(model.weights), 0)
     self.assertFalse(model.built)
@@ -96,9 +107,7 @@
     y = np.random.random((batch_size, num_classes))
     model.fit(x, y, epochs=1)
     self.assertTrue(model.built)
-    self.assertEqual(model.inputs[0].get_shape().as_list(), [None, input_dim])
-    self.assertEqual(model.outputs[0].get_shape().as_list(),
-                     [None, num_classes])
+    self.assertFalse(model._is_graph_network)
     self.assertEqual(len(model.weights), 2 * 2)
 
   @tf_test_util.run_in_graph_and_eager_modes
@@ -109,11 +118,11 @@
     num_samples = 50
     steps_per_epoch = 10
 
-    model = keras.models.Sequential()
-    # We don't specify the input shape.
-    model.add(keras.layers.Dense(num_hidden))
-    model.add(keras.layers.Dense(num_classes))
-    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3))
+    model = _get_small_mlp(num_hidden, num_classes)
+    model.compile(
+        loss='mse',
+        optimizer=rmsprop.RMSPropOptimizer(1e-3),
+        metrics=[keras.metrics.CategoricalAccuracy()])
     self.assertEqual(len(model.layers), 2)
     self.assertEqual(len(model.weights), 0)
     self.assertFalse(model.built)
@@ -127,19 +136,18 @@
 
     model.fit(iterator, epochs=1, steps_per_epoch=steps_per_epoch)
     self.assertTrue(model.built)
-    self.assertEqual(model.inputs[0].get_shape().as_list(), [None, input_dim])
-    self.assertEqual(model.outputs[0].get_shape().as_list(),
-                     [None, num_classes])
     self.assertEqual(len(model.weights), 2 * 2)
+    self.assertFalse(model._is_graph_network)
 
-  def test_training_and_eval_methods_on_symbolic_tensors(self):
+  @parameterized.parameters((True,), (False,))
+  def test_training_and_eval_methods_on_symbolic_tensors(self, deferred):
     with self.test_session():
 
-      def create_model():
-        model = keras.Sequential()
-        model.add(keras.layers.Dense(10, activation='relu'))
-        model.add(keras.layers.Dense(4, activation='softmax'))
-
+      def get_model():
+        if deferred:
+          model = _get_small_mlp(10, 4)
+        else:
+          model = _get_small_mlp(10, 4, input_dim=3)
         model.compile(
             optimizer=rmsprop.RMSPropOptimizer(1e-3),
             loss='categorical_crossentropy',
@@ -149,22 +157,22 @@
       inputs = keras.backend.zeros(shape=(10, 3))
       targets = keras.backend.zeros(shape=(10, 4))
 
-      model = create_model()
+      model = get_model()
       model.fit(inputs, targets, epochs=10, steps_per_epoch=30)
 
-      model = create_model()
+      model = get_model()
       model.evaluate(inputs, targets, steps=2, verbose=0)
 
-      model = create_model()
+      model = get_model()
       model.predict(inputs, steps=2)
 
-      model = create_model()
+      model = get_model()
       model.train_on_batch(inputs, targets)
 
-      model = create_model()
+      model = get_model()
       model.test_on_batch(inputs, targets)
 
-      model = create_model()
+      model = get_model()
       model.fit(
           inputs,
           targets,
@@ -247,17 +255,18 @@
       x2 = model.predict(val_a)
       assert np.abs(np.sum(x1 - x2)) > 1e-5
 
+  @tf_test_util.run_in_graph_and_eager_modes
   def test_sequential_deferred_build_serialization(self):
     num_hidden = 5
     input_dim = 3
     batch_size = 5
     num_classes = 2
 
-    model = keras.models.Sequential()
-    # We don't specify the input shape.
-    model.add(keras.layers.Dense(num_hidden))
-    model.add(keras.layers.Dense(num_classes))
-    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3))
+    model = _get_small_mlp(num_hidden, num_classes)
+    model.compile(
+        loss='mse',
+        optimizer=rmsprop.RMSPropOptimizer(1e-3),
+        metrics=[keras.metrics.CategoricalAccuracy()])
     self.assertFalse(model.built)
 
     x = np.random.random((batch_size, input_dim))
@@ -266,11 +275,93 @@
     self.assertTrue(model.built)
 
     config = model.get_config()
+    self.assertIn('build_input_shape', config)
+
     new_model = keras.models.Sequential.from_config(config)
     self.assertTrue(new_model.built)
     self.assertEqual(len(model.layers), 2)
     self.assertEqual(len(model.weights), 4)
 
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_sequential_shape_inference_deferred(self):
+    model = _get_small_mlp(4, 5)
+    output_shape = model.compute_output_shape((None, 7))
+    self.assertEqual(tuple(output_shape.as_list()), (None, 5))
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_sequential_build_deferred(self):
+    model = _get_small_mlp(4, 5)
+
+    model.build((None, 10))
+    self.assertTrue(model.built)
+    self.assertEqual(len(model.weights), 4)
+
+    # Test with nested model
+    model = _get_small_mlp(4, 3)
+    inner_model = _get_small_mlp(4, 5)
+    model.add(inner_model)
+
+    model.build((None, 10))
+    self.assertTrue(model.built)
+    self.assertTrue(model.layers[-1].built)
+    self.assertEqual(len(model.weights), 8)
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_sequential_nesting(self):
+    model = _get_small_mlp(4, 3)
+    inner_model = _get_small_mlp(4, 5)
+    model.add(inner_model)
+
+    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3))
+    x = np.random.random((2, 6))
+    y = np.random.random((2, 5))
+    model.fit(x, y, epochs=1)
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_variable_names(self):
+    model = keras.models.Sequential([keras.layers.Dense(3)])
+    model.add(keras.layers.Dense(2))
+    model(array_ops.ones([2, 4]))
+    self.assertEqual(
+        ['sequential/dense/kernel:0', 'sequential/dense/bias:0',
+         'sequential/dense_1/kernel:0', 'sequential/dense_1/bias:0'],
+        [v.name for v in model.variables])
+
+
+class TestSequentialEagerIntegration(test.TestCase):
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_defun_on_call(self):
+    # Check that one can subclass Sequential and place the `call` in a `defun`.
+
+    class MySequential(keras.Sequential):
+
+      def __init__(self, name=None):
+        super(MySequential, self).__init__(name=name)
+        self.call = function.defun(self.call)
+
+    model = MySequential()
+    model.add(keras.layers.Dense(4, activation='relu'))
+    model.add(keras.layers.Dense(5, activation='softmax'))
+
+    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3))
+
+    x = np.random.random((2, 6))
+    y = np.random.random((2, 5))
+    model.fit(x, y, epochs=1)
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_build_before_fit(self):
+    # Fix for b/112433577
+    model = _get_small_mlp(4, 5)
+    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3))
+
+    model.build((None, 6))
+
+    x = np.random.random((2, 6))
+    y = np.random.random((2, 5))
+    model.fit(x, y, epochs=1)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py
index 2cdd00a..f71388c 100644
--- a/tensorflow/python/keras/engine/training.py
+++ b/tensorflow/python/keras/engine/training.py
@@ -29,6 +29,7 @@
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import losses
+from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.keras import optimizers
 from tensorflow.python.keras.engine import base_layer
 from tensorflow.python.keras.engine import distributed_training_utils
@@ -39,6 +40,8 @@
 from tensorflow.python.keras.engine import training_utils
 from tensorflow.python.keras.engine.network import Network
 from tensorflow.python.keras.utils.generic_utils import slice_arrays
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import weights_broadcast_ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import optimizer as tf_optimizer_module
 from tensorflow.python.training.checkpointable import base as checkpointable
@@ -74,6 +77,7 @@
   class MyModel(tf.keras.Model):
 
     def __init__(self):
+      super(MyModel, self).__init__()
       self.dense1 = tf.keras.layers.Dense(4, activation=tf.nn.relu)
       self.dense2 = tf.keras.layers.Dense(5, activation=tf.nn.softmax)
 
@@ -94,6 +98,7 @@
   class MyModel(tf.keras.Model):
 
     def __init__(self):
+      super(MyModel, self).__init__()
       self.dense1 = tf.keras.layers.Dense(4, activation=tf.nn.relu)
       self.dense2 = tf.keras.layers.Dense(5, activation=tf.nn.softmax)
       self.dropout = tf.keras.layers.Dropout(0.5)
@@ -136,6 +141,167 @@
         if i not in skip_target_weighing_indices
     ]
 
+  def _get_metric_name(self, metric, output_index, weighted=False):
+    """Returns the metric name corresponding to the given metric input.
+
+    Arguments:
+        metric: Metric function name or reference.
+      output_index: Index of the current output.
+        weighted: Boolean indicating if the given metric is weighted.
+
+    Returns:
+        A metric name.
+    """
+    metric_name_prefix = 'weighted_' if weighted else ''
+    if metric in ('accuracy', 'acc', 'crossentropy', 'ce'):
+      if metric in ('accuracy', 'acc'):
+        suffix = 'acc'
+      elif metric in ('crossentropy', 'ce'):
+        suffix = 'ce'
+    else:
+      metric_fn = metrics_module.get(metric)
+      # Get metric name as string
+      if hasattr(metric_fn, 'name'):
+        suffix = metric_fn.name
+      else:
+        suffix = metric_fn.__name__
+    metric_name = metric_name_prefix + suffix
+
+    if len(self.output_names) > 1:
+      metric_name = '%s_%s' % (self.output_names[output_index], metric_name)
+    j = 1
+    base_metric_name = metric_name
+    while metric_name in self.metrics_names:
+      metric_name = '%s_%d' % (base_metric_name, j)
+      j += 1
+
+    return metric_name
+
+  def _handle_per_output_metrics(self,
+                                 metrics,
+                                 y_true,
+                                 y_pred,
+                                 output_index,
+                                 output_shape,
+                                 loss_fn,
+                                 mask,
+                                 weights=None):
+    """Calls metric functions and sets metric attributes for a single output.
+
+    Arguments:
+      metrics: List of metrics.
+      y_true: Target output.
+      y_pred: Predicted output.
+      output_index: Index of the current output.
+      output_shape: Shape of the current output.
+      loss_fn: Loss function corresponding to the current output.
+      mask: Computed mask value for the current output.
+      weights: Weights to be applied on the current output.
+
+    Returns:
+      A list of metric result tensors.
+    """
+    metric_results = []
+    for metric in metrics:
+      metric_fn = training_utils.get_metric_function(
+          metric, output_shape=output_shape, loss_fn=loss_fn)
+      metric_name = self._get_metric_name(
+          metric, output_index, weighted=weights is not None)
+
+      with K.name_scope(metric_name):
+        # If both outputs and targets are available, call the metric function.
+        if y_true is not None and y_pred is not None:
+          if isinstance(metric_fn, metrics_module.Metric):
+            # Call the stateful metric function.
+            if mask is not None:
+              mask = math_ops.cast(mask, y_pred.dtype)
+              # Update weights with mask.
+              if weights is None:
+                weights = mask
+              else:
+                # Update shape of weights if possible before adding mask.
+                # Update dimensions of weights to match with mask if possible.
+                mask, _, weights = metrics_module.squeeze_or_expand_dimensions(
+                    mask, None, weights)
+                try:
+                  # Broadcast weights if possible.
+                  weights = weights_broadcast_ops.broadcast_weights(
+                      weights, mask)
+                except ValueError:
+                  pass
+                  # TODO(psv): Handle case when mask and weight shapes are not
+                  # compatible.
+                weights *= mask
+
+            metric_result = metric_fn(y_true, y_pred, weights)
+          else:
+            # Call the stateless metric function.
+            weighted_metric_fn = training_utils.weighted_masked_objective(
+                metric_fn)
+            metric_result = weighted_metric_fn(
+                y_true, y_pred, weights=weights, mask=mask)
+
+          if not context.executing_eagerly():
+            # Keep track of metric result tensor.
+            self.metrics_tensors.append(metric_result)
+          metric_results.append(metric_result)
+
+      # Keep track of metric name.
+      self.metrics_names.append(metric_name)
+
+      # Keep track of stateful metric attributes (name and metric function).
+      if isinstance(metric_fn, base_layer.Layer) and metric_fn.stateful:
+        self.stateful_metric_names.append(metric_name)
+        self.stateful_metric_functions.append(metric_fn)
+        if not context.executing_eagerly():
+          # Keep track of updates created by stateful metrics.
+          self.metrics_updates += metric_fn.updates
+    return metric_results
+
+  def _handle_metrics(self,
+                      outputs,
+                      skip_target_indices=None,
+                      targets=None,
+                      sample_weights=None,
+                      masks=None):
+    """Handles calling metric functions and setting model metric attributes.
+
+    Arguments:
+      outputs: List of outputs (predictions).
+      skip_target_indices: Optional. List of target ids to skip.
+      targets: List of targets.
+      sample_weights: Optional list of sample weight arrays.
+      masks: List of computed output mask values.
+
+    Returns:
+      A list of metric result tensors.
+    """
+    skip_target_indices = skip_target_indices or []
+    metric_results = []
+    with K.name_scope('metrics'):
+      for i in range(len(outputs)):
+        if i in skip_target_indices:
+          continue
+        output = outputs[i] if outputs else None
+        target = targets[i] if targets else None
+        output_shape = None if output is None else output.get_shape().as_list()
+        output_mask = masks[i] if masks else None
+        metric_results.extend(
+            self._handle_per_output_metrics(
+                self.nested_metrics[i], target, output, i, output_shape,
+                self.loss_functions[i], output_mask))
+        metric_results.extend(
+            self._handle_per_output_metrics(
+                self.nested_weighted_metrics[i],
+                target,
+                output,
+                i,
+                output_shape,
+                self.loss_functions[i],
+                output_mask,
+                weights=sample_weights[i]))
+    return metric_results
+
   @checkpointable.no_automatic_dependency_tracking
   def compile(self,
               optimizer,
@@ -151,9 +317,9 @@
 
     Arguments:
         optimizer: String (name of optimizer) or optimizer instance.
-            See [optimizers](/optimizers).
+            See [optimizers](/api_docs/python/tf/keras/optimizers).
         loss: String (name of objective function) or objective function.
-            See [losses](/losses).
+            See [losses](/api_docs/python/tf/losses).
             If the model has multiple outputs, you can use a different loss
             on each output by passing a dictionary or a list of losses.
             The loss value that will be minimized by the model
@@ -231,8 +397,6 @@
     self.metrics = metrics or []
     self.loss_weights = loss_weights
     self.sample_weight_mode = sample_weight_mode
-    if context.executing_eagerly() and weighted_metrics is not None:
-      raise ValueError('weighted_metrics is not supported in Eager mode.')
     self.weighted_metrics = weighted_metrics
     if context.executing_eagerly() and target_tensors is not None:
       raise ValueError('target_tensors is not supported in Eager mode.')
@@ -335,6 +499,20 @@
                       str(loss_weights) + ' - expected a list of dicts.')
     self.loss_weights_list = loss_weights_list
 
+    # Initialize model metric attributes.
+    self.metrics_names = ['loss']
+    self.metrics_tensors = []
+    self.metrics_updates = []
+    self.stateful_metric_names = []
+    self.stateful_metric_functions = []
+
+    # Nested metrics is a list of list of metrics.
+    # One list per output of the model.
+    self.nested_metrics = training_utils.collect_metrics(
+        metrics, self.output_names)
+    self.nested_weighted_metrics = training_utils.collect_metrics(
+        weighted_metrics, self.output_names)
+
     # Initialization for Eager mode execution.
     if context.executing_eagerly():
       # Prepare sample weights.
@@ -345,19 +523,16 @@
         raise ValueError('target_tensors are not currently supported in Eager '
                          'mode.')
       self.total_loss = None
-      self.metrics_tensors = []
-      self.metrics_names = ['loss']
       for i in range(len(self.outputs)):
         if len(self.outputs) > 1:
           self.metrics_names.append(self.output_names[i] + '_loss')
-      self.nested_metrics = training_utils.collect_metrics(metrics,
-                                                           self.output_names)
-      # TODO(fchollet): support stateful metrics in eager execution.
-      self.stateful_metric_functions = []
-      self.stateful_metric_names = []
 
-      with K.name_scope('metrics'):
-        training_utils.populate_metric_names(self)
+      # Set metric attributes on model.
+      self._handle_metrics(
+          self.outputs,
+          skip_target_indices=skip_target_indices,
+          sample_weights=self.sample_weights)
+
       self.targets = []
       for i in range(len(self.outputs)):
         self._feed_output_names.append(self.output_names[i])
@@ -420,11 +595,6 @@
     self._set_sample_weight_attributes(sample_weight_mode,
                                        skip_target_weighing_indices)
 
-    # Prepare metrics.
-    self.weighted_metrics = weighted_metrics
-    self.metrics_names = ['loss']
-    self.metrics_tensors = []
-
     # Compute total loss.
     total_loss = None
     with K.name_scope('loss'):
@@ -458,55 +628,13 @@
       for loss_tensor in self.losses:
         total_loss += loss_tensor
 
-    # List of same size as output_names.
-    # contains tuples (metrics for output, names of metrics).
-    nested_metrics = training_utils.collect_metrics(metrics, self.output_names)
-    nested_weighted_metrics = training_utils.collect_metrics(weighted_metrics,
-                                                             self.output_names)
-    self.metrics_updates = []
-    self.stateful_metric_names = []
-    self.stateful_metric_functions = []
-    with K.name_scope('metrics'):
-      for i in range(len(self.outputs)):
-        if i in skip_target_indices:
-          continue
-
-        y_true = self.targets[i]
-        y_pred = self.outputs[i]
-        weights = self.sample_weights[i]
-        output_metrics = nested_metrics[i]
-        output_weighted_metrics = nested_weighted_metrics[i]
-        output_shape = self.outputs[i].get_shape().as_list()
-        loss_fn = self.loss_functions[i]
-
-        def handle_metrics(metrics, output_shape, loss_fn, weights=None):
-          """Invokes metric functions for the output."""
-
-          for metric in metrics:
-            metric_fn = training_utils.get_metric_function(
-                metric, output_shape=output_shape, loss_fn=loss_fn)
-            metric_name = training_utils.get_metric_name(
-                metric, weighted=weights is not None)
-
-            with K.name_scope(metric_name):
-              weighted_metric_fn = training_utils.weighted_masked_objective(
-                  metric_fn)
-              metric_result = weighted_metric_fn(
-                  y_true, y_pred, weights=weights, mask=masks[i])  # pylint: disable=undefined-loop-variable
-
-            metric_name = training_utils.add_metric_name(self, metric_name, i)  # pylint: disable=undefined-loop-variable
-            self.metrics_tensors.append(metric_result)
-
-            # Keep track of state updates created by
-            # stateful metrics (i.e. metrics layers).
-            if isinstance(metric_fn, base_layer.Layer) and metric_fn.stateful:
-              self.stateful_metric_names.append(metric_name)
-              self.stateful_metric_functions.append(metric_fn)
-              self.metrics_updates += metric_fn.updates
-
-        handle_metrics(output_metrics, output_shape, loss_fn)
-        handle_metrics(
-            output_weighted_metrics, output_shape, loss_fn, weights=weights)
+    # Invoke metric functions for all the outputs.
+    self._handle_metrics(
+        self.outputs,
+        masks=masks,
+        targets=self.targets,
+        skip_target_indices=skip_target_indices,
+        sample_weights=self.sample_weights)
 
     # Prepare gradient updates and state updates.
     self.total_loss = total_loss
@@ -717,8 +845,8 @@
       x_values, y_values = distributed_training_utils.\
         validate_distributed_dataset_inputs(self._distribution_strategy, x, y)
 
-    _, _, sample_weights = self._standardize_weights(x_values[0],
-                                                     y_values[0],
+    _, _, sample_weights = self._standardize_weights(x_values,
+                                                     y_values,
                                                      sample_weight,
                                                      class_weight,
                                                      batch_size)
@@ -856,7 +984,7 @@
     all_inputs = []
     is_build_called = False
     is_compile_called = False
-    if not self.built:
+    if not self.inputs:
       # We need to use `x` to set the model inputs.
       # We type-check that `x` and `y` are either single arrays
       # or lists of arrays.
@@ -1067,22 +1195,13 @@
           'in their call() signatures do not yet support shape inference. File '
           'a feature request if this limitation bothers you.')
     if self.__class__.__name__ == 'Sequential':
-      # Note: we can't test whether the model is `Sequential` via `isinstance`
-      # since `Sequential` depends on `Model`.
-      if isinstance(inputs, list):
-        assert len(inputs) == 1
-        inputs = inputs[0]
-
       if tensor_util.is_tensor(inputs):
-        if context.executing_eagerly():
-          input_shape = (None,) + tuple(inputs.get_shape().as_list()[1:])
-          self.build(input_shape=input_shape)
-        else:
-          self.symbolic_set_inputs(inputs)
+        input_shape = (None,) + tuple(inputs.get_shape().as_list()[1:])
+        self.build(input_shape=input_shape)
       else:
         input_shape = (None,) + inputs.shape[1:]
         self.build(input_shape=input_shape)
-    elif context.executing_eagerly():
+    if context.executing_eagerly():
       self._eager_set_inputs(inputs)
     else:
       self._symbolic_set_inputs(inputs, training=training)
@@ -1273,7 +1392,7 @@
             0 = silent, 1 = progress bar, 2 = one line per epoch.
         callbacks: List of `keras.callbacks.Callback` instances.
             List of callbacks to apply during training.
-            See [callbacks](/callbacks).
+            See [callbacks](/api_docs/python/tf/keras/callbacks).
         validation_split: Float between 0 and 1.
             Fraction of the training data to be used as validation data.
             The model will set apart this fraction of the training data,
@@ -1891,6 +2010,10 @@
     Raises:
         ValueError: In case the generator yields data in an invalid format.
     """
+    if self._distribution_strategy:
+      raise NotImplementedError('`fit_generator` is not supported for '
+                                'models compiled with DistributionStrategy.')
+
     if not self.built and not self._is_graph_network:
       raise NotImplementedError(
           '`fit_generator` is not yet enabled for unbuilt Model subclasses')
@@ -1958,6 +2081,10 @@
     Raises:
         ValueError: In case the generator yields data in an invalid format.
     """
+    if self._distribution_strategy:
+      raise NotImplementedError('`evaluate_generator` is not supported for '
+                                'models compiled with DistributionStrategy.')
+
     if not self.built and not self._is_graph_network:
       raise NotImplementedError(
           '`evaluate_generator` is not yet enabled for '
@@ -2012,6 +2139,10 @@
     Raises:
         ValueError: In case the generator yields data in an invalid format.
     """
+    if self._distribution_strategy:
+      raise NotImplementedError('`predict_generator` is not supported for '
+                                'models compiled with DistributionStrategy.')
+
     if not self.built and not self._is_graph_network:
       raise NotImplementedError(
           '`predict_generator` is not yet enabled for unbuilt Model subclasses')
@@ -2025,6 +2156,21 @@
         use_multiprocessing=use_multiprocessing,
         verbose=verbose)
 
+  def _get_callback_model(self):
+    """Returns the Callback Model for this Model."""
+
+    if hasattr(self, '_replicated_model') and self._replicated_model:
+      # When using training_distributed, we set the callback model
+      # to an instance of the `DistributedModel` that we create in
+      # the `compile` call. The `DistributedModel` is initialized
+      # with the first replicated model. We need to set the callback
+      # model to a DistributedModel to allow us to override saving
+      # and loading weights when we checkpoint the model during training.
+      return self._replicated_model
+    if hasattr(self, 'callback_model') and self.callback_model:
+      return self.callback_model
+    return self
+
 
 class DistributedCallbackModel(Model):
   """Model that is used for callbacks with DistributionStrategy."""
@@ -2065,4 +2211,3 @@
       logging.warning('You are accessing attribute ' + item + 'of the'
                       'DistributedCallbackModel that may not have been set'
                       'correctly.')
-
diff --git a/tensorflow/python/keras/engine/training_arrays.py b/tensorflow/python/keras/engine/training_arrays.py
index d24f4b6..e2c458c 100644
--- a/tensorflow/python/keras/engine/training_arrays.py
+++ b/tensorflow/python/keras/engine/training_arrays.py
@@ -19,8 +19,6 @@
 from __future__ import division
 from __future__ import print_function
 
-import copy
-
 import numpy as np
 
 from tensorflow.python.framework import errors
@@ -92,14 +90,8 @@
   val_sample_weights = val_sample_weights or []
   if model.uses_learning_phase and not isinstance(K.learning_phase(), int):
     ins = inputs + targets + sample_weights + [1]
-    if val_inputs:
-      val_ins = val_inputs + val_targets + val_sample_weights + [1]
   else:
     ins = inputs + targets + sample_weights
-    if val_inputs:
-      val_ins = val_inputs + val_targets + val_sample_weights
-  if not val_inputs:
-    val_ins = []
 
   do_validation = False
   if val_inputs:
@@ -116,65 +108,27 @@
                        'training, i.e. `steps_per_epoch` '
                        'must be set.')
 
-  out_labels = model.metrics_names
-  if do_validation:
-    callback_metrics = copy.copy(out_labels) + ['val_' + n for n in out_labels]
-    # need to create the test_function before start of the first epoch
-    # because TensorBoard callback on_epoch_begin adds summary to the
-    # list of fetches of the test_function
-    model._make_test_function()
-  else:
-    callback_metrics = copy.copy(out_labels)
-
   num_train_samples = training_utils.check_num_samples(
       ins, batch_size, steps_per_epoch, 'steps_per_epoch')
+  count_mode = 'steps' if steps_per_epoch else 'samples'
+  callbacks = cbks.configure_callbacks(
+      callbacks,
+      model,
+      do_validation=do_validation,
+      val_inputs=val_inputs,
+      val_targets=val_targets,
+      val_sample_weights=val_sample_weights,
+      batch_size=batch_size,
+      epochs=epochs,
+      steps_per_epoch=steps_per_epoch,
+      samples=num_train_samples,
+      validation_steps=validation_steps,
+      verbose=verbose,
+      count_mode=count_mode)
+
   if num_train_samples is not None:
     index_array = np.arange(num_train_samples)
 
-  model.history = cbks.History()
-  all_callbacks = [cbks.BaseLogger(
-      stateful_metrics=model.stateful_metric_names)]
-  if verbose:
-    if steps_per_epoch is not None:
-      count_mode = 'steps'
-    else:
-      count_mode = 'samples'
-    all_callbacks.append(
-        cbks.ProgbarLogger(
-            count_mode, stateful_metrics=model.stateful_metric_names))
-  all_callbacks += (callbacks or []) + [model.history]
-  callbacks = cbks.CallbackList(all_callbacks)
-  out_labels = out_labels or []
-
-  # it's possible to callback a different model than self
-  # (used by Sequential models)
-  if hasattr(model, 'callback_model') and model.callback_model:
-    callback_model = model.callback_model
-  else:
-    callback_model = model
-
-  callbacks.set_model(callback_model)
-
-  callback_params = {
-      'batch_size': batch_size,
-      'epochs': epochs,
-      'steps': steps_per_epoch,
-      'samples': num_train_samples,
-      'verbose': verbose,
-      'do_validation': do_validation,
-      'metrics': callback_metrics or [],
-  }
-  if validation_steps:
-    callback_params.update({'validation_steps': validation_steps})
-  callbacks.set_params(callback_params)
-
-  for cbk in callbacks:
-    cbk.validation_data = val_ins
-  # validation_data must be set before on_train_begin() is called
-  # so that TensorboardCallback can validate its input
-  callbacks.on_train_begin()
-  callback_model.stop_training = False
-
   # To prevent a slowdown, we find beforehand the arrays that need conversion.
   feed = model._feed_inputs + model._feed_targets + model._feed_sample_weights
   indices_for_conversion_to_dense = []
@@ -182,6 +136,7 @@
     if issparse is not None and issparse(ins[i]) and not K.is_sparse(feed[i]):
       indices_for_conversion_to_dense.append(i)
 
+  callbacks.on_train_begin()
   for epoch in range(initial_epoch, epochs):
     # Reset stateful metrics
     for m in model.stateful_metric_functions:
@@ -208,11 +163,11 @@
 
         if not isinstance(outs, list):
           outs = [outs]
-        for l, o in zip(out_labels, outs):
+        for l, o in zip(model.metrics_names, outs):
           batch_logs[l] = o
 
         callbacks.on_batch_end(step_index, batch_logs)
-        if callback_model.stop_training:
+        if callbacks.model.stop_training:
           break
 
       if do_validation:
@@ -226,7 +181,7 @@
         if not isinstance(val_outs, list):
           val_outs = [val_outs]
         # Same labels assumed.
-        for l, o in zip(out_labels, val_outs):
+        for l, o in zip(model.metrics_names, val_outs):
           epoch_logs['val_' + l] = o
     else:
       # Sample-wise fit loop.
@@ -259,11 +214,11 @@
         outs = f(ins_batch)
         if not isinstance(outs, list):
           outs = [outs]
-        for l, o in zip(out_labels, outs):
+        for l, o in zip(model.metrics_names, outs):
           batch_logs[l] = o
 
         callbacks.on_batch_end(batch_index, batch_logs)
-        if callback_model.stop_training:
+        if callbacks.model.stop_training:
           break
 
         if batch_index == len(batches) - 1:  # Last batch.
@@ -278,10 +233,10 @@
             if not isinstance(val_outs, list):
               val_outs = [val_outs]
             # Same labels assumed.
-            for l, o in zip(out_labels, val_outs):
+            for l, o in zip(model.metrics_names, val_outs):
               epoch_logs['val_' + l] = o
     callbacks.on_epoch_end(epoch, epoch_logs)
-    if callback_model.stop_training:
+    if callbacks.model.stop_training:
       break
   callbacks.on_train_end()
   return model.history
diff --git a/tensorflow/python/keras/engine/training_distributed.py b/tensorflow/python/keras/engine/training_distributed.py
index 5fa6c3c..5feedc4 100644
--- a/tensorflow/python/keras/engine/training_distributed.py
+++ b/tensorflow/python/keras/engine/training_distributed.py
@@ -18,7 +18,6 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-import copy
 import numpy as np
 from tensorflow.python.framework import errors
 from tensorflow.python.keras import backend as K
@@ -38,7 +37,6 @@
     callbacks=None,
     val_inputs=None,
     val_targets=None,
-    callback_metrics=None,
     initial_epoch=0,
     steps_per_epoch=None,
     validation_steps=None):
@@ -53,10 +51,6 @@
       callbacks: List of callbacks to be called during training
       val_inputs: List of input arrays.
       val_targets: List of target arrays.
-      callback_metrics: List of strings, the display names of the metrics
-          passed to the callbacks. They should be the
-          concatenation of list the display names of the outputs of
-           `f` and the list of display names of the outputs of `f_val`.
       initial_epoch: Epoch at which to start training
           (useful for resuming a previous training run)
       steps_per_epoch: Total number of steps (batches of samples)
@@ -126,50 +120,6 @@
                        'when doing step-wise '
                        'training, i.e. `steps_per_epoch` '
                        'must be set.')
-  out_labels = model.metrics_names
-  if do_validation:
-    callback_metrics = copy.copy(out_labels) + [
-        'val_' + n for n in out_labels
-    ]
-  else:
-    callback_metrics = copy.copy(out_labels)
-
-  model.history = cbks.History()
-  all_callbacks = [cbks.BaseLogger(
-      stateful_metrics=model.stateful_metric_names)]
-  if verbose:
-    # We assume that `steps_per_epoch` is always set since we have to use
-    # Datasets.
-    count_mode = 'steps'
-
-    all_callbacks.append(
-        cbks.ProgbarLogger(
-            count_mode, stateful_metrics=model.stateful_metric_names))
-  all_callbacks += (callbacks or []) + [model.history]
-  callbacks = cbks.CallbackList(all_callbacks)
-  out_labels = out_labels or []
-
-  # We set the callback model to an instance of the `DistributedModel` that we
-  # create in the  `compile` call. The `DistributedModel` is initialized with
-  # the first replicated model. We need to set the callback model to a
-  # DistributedModel to allow us to override saving and loading weights when
-  # we checkpoint the model during training.
-  callback_model = model._replicated_model
-
-  callbacks.set_model(callback_model)
-
-  callbacks.set_params({
-      'epochs': epochs,
-      'steps': steps_per_epoch,
-      'samples': None,
-      'verbose': verbose,
-      'do_validation': do_validation,
-      'metrics': callback_metrics or [],
-  })
-  callbacks.on_train_begin()
-  callback_model.stop_training = False
-
-  out_labels = out_labels or []
 
   # Copy the weights from the original model to each of the replicated models.
   orig_model_weights = model.get_weights()
@@ -178,6 +128,17 @@
     distributed_training_utils.set_weights(
         current_strategy, distributed_model, orig_model_weights)
 
+  callbacks = cbks.configure_callbacks(
+      callbacks,
+      model,
+      do_validation=do_validation,
+      val_inputs=None,
+      val_targets=None,
+      epochs=epochs,
+      steps_per_epoch=steps_per_epoch,
+      verbose=verbose)
+  out_labels = model.metrics_names or []
+  callbacks.on_train_begin()
   for epoch in range(initial_epoch, epochs):
     callbacks.on_epoch_begin(epoch)
     if steps_per_epoch is not None:
@@ -203,7 +164,7 @@
         for l, o in zip(out_labels, outs):
           batch_logs[l] = o
         callbacks.on_batch_end(step_index, batch_logs)
-        if callback_model.stop_training:
+        if callbacks.model.stop_training:
           break
       if do_validation:
         val_outs = test_loop(
@@ -219,7 +180,7 @@
           epoch_logs['val_' + l] = o
 
     callbacks.on_epoch_end(epoch, epoch_logs)
-    if callback_model.stop_training:
+    if callbacks.model.stop_training:
       break
   callbacks.on_train_end()
 
diff --git a/tensorflow/python/keras/engine/training_eager.py b/tensorflow/python/keras/engine/training_eager.py
index 774d2e4..1e37714 100644
--- a/tensorflow/python/keras/engine/training_eager.py
+++ b/tensorflow/python/keras/engine/training_eager.py
@@ -41,39 +41,25 @@
   return loss
 
 
-def _eager_metrics_fn(model, outputs, targets):
+def _eager_metrics_fn(model, outputs, targets, sample_weights=None, masks=None):
   """Calculates the metrics for each output of the given model.
 
   Arguments:
       model: The model on which metrics are being calculated.
       outputs: The outputs of the given model.
       targets: The predictions or targets of the given model.
+      sample_weights: Optional list of sample weights for each output.
+      masks: Optional list of masks for each output.
 
   Returns:
       Returns the metric results for each output of the model.
   """
-  metric_results = []
-  if not isinstance(outputs, list):
-    outputs = [outputs]
-
-  if not isinstance(targets, list):
-    targets = [targets]
-
-  for i in range(len(model.outputs)):
-    output_metrics = model.nested_metrics[i]
-    for nested_output_metric in output_metrics:
-      metric_fn = training_utils.get_metric_function(
-          nested_output_metric, backend.int_shape(model.outputs[i]),
-          model.loss_functions[i])
-      # weighted metrics are not supported in eager mode
-      metric_name = training_utils.get_metric_name(
-          nested_output_metric, weighted=False)
-
-      with backend.name_scope(metric_name):
-        metric_result = metric_fn(targets[i], outputs[i])
-        metric_results.append(backend.mean(metric_result))
-
-  return metric_results
+  outputs = generic_utils.to_list(outputs)
+  targets = generic_utils.to_list(targets)
+  # TODO(psv): Consider supporting skip target indices in eager mode?
+  metric_results = model._handle_metrics(
+      outputs, targets=targets, sample_weights=sample_weights, masks=masks)
+  return [backend.mean(t) for t in metric_results]
 
 
 def _model_loss(model, inputs, targets, sample_weights=None, training=False):
@@ -87,9 +73,10 @@
       training: Whether the model should be run in inference or training mode.
 
   Returns:
-     Returns the model output, total loss and loss value calculated using the
-     specified loss function. The total loss includes regularization losses and
-     applies masking and sample weighting to the loss value.
+     Returns the model output, total loss, loss value calculated using the
+     specified loss function and masks for each output. The total loss includes
+     regularization losses and applies masking and sample weighting
+     to the loss value.
   """
   total_loss = 0
   kwargs = {}
@@ -98,7 +85,7 @@
   if len(inputs) == 1:
     inputs = inputs[0]
 
-  if model._is_graph_network:
+  if model._compute_output_and_mask_jointly:
     outs, masks = model._call_and_compute_mask(inputs, **kwargs)
     masks = generic_utils.to_list(masks)
   else:
@@ -146,15 +133,13 @@
     if custom_losses:
       total_loss += sum(custom_losses)
 
-  return outs, total_loss, loss_metrics
+  return outs, total_loss, loss_metrics, masks
 
 
 def iterator_fit_loop(model,
                       inputs,
                       class_weight,
                       steps_per_epoch,
-                      callback_model,
-                      out_labels,
                       epoch_logs,
                       val_inputs=None,
                       val_targets=None,
@@ -162,7 +147,6 @@
                       epochs=1,
                       verbose=1,
                       callbacks=None,
-                      callback_metrics=None,
                       validation_steps=None,
                       do_validation=False,
                       batch_size=None):
@@ -179,19 +163,13 @@
       steps_per_epoch: Total number of steps (batches of samples)
           before declaring one epoch finished and starting the
           next epoch.
-      callback_model: Instance of `Model` to callback.
-      out_labels: Output labels generated from model metric names.
       epoch_logs: Dictionary of logs from every epoch.
       val_inputs: Input data for validation.
       val_targets: Target data for validation.
       val_sample_weights: Sample weight data for validation.
       epochs: Number of times to iterate over the data
       verbose: Verbosity mode, 0, 1 or 2
-      callbacks: List of callbacks to be called during training
-      callback_metrics: List of strings, the display names of the metrics
-          passed to the callbacks. They should be the
-          concatenation of list the display names of the outputs of
-           `f` and the list of display names of the outputs of `f_val`.
+      callbacks: CallbackList instance. Controls callbacks during training.
       validation_steps: Number of steps to run validation for (only if doing
         validation from data tensors). Ignored with default value of `None`.
       do_validation: Boolean value indicating whether we should do validation.
@@ -244,40 +222,47 @@
           if val is not None else None for val in sample_weights
       ]
 
-    if step_index == 0 and not callback_metrics:
-      out_labels = model.metrics_names
+    # Set stateful_metrics in callbacks. We do not do this before the
+    # `steps_per_epoch` loop because model will be compiled only in the first
+    # iteration of this loop in the deferred build scenario.
+    if step_index == 0:
+      for cbk in callbacks:
+        if (isinstance(cbk, cbks.BaseLogger) or
+            isinstance(cbk, cbks.ProgbarLogger)):
+          cbk.stateful_metrics = model.stateful_metric_names
+
+    if step_index == 0 and not callbacks.params['metrics']:
+      callback_metrics = copy.copy(model.metrics_names)
       if do_validation:
-        callback_metrics = copy.copy(out_labels) + [
-            'val_' + n for n in out_labels
-        ]
-      else:
-        callback_metrics = copy.copy(out_labels)
+        callback_metrics += ['val_' + n for n in model.metrics_names]
       callbacks.set_params({
+          'batch_size': batch_size,
           'epochs': epochs,
           'steps': steps_per_epoch,
           'verbose': verbose,
           'do_validation': do_validation,
           'metrics': callback_metrics or [],
+          'validation_steps': validation_steps
       })
 
     # Train model.
-    outs, loss, loss_metrics = _process_single_batch(
+    outs, loss, loss_metrics, masks = _process_single_batch(
         model, x, y, sample_weights=sample_weights, training=True)
-    if not isinstance(outs, list):
-      outs = [outs]
+    outs = generic_utils.to_list(outs)
 
     # Calculate metrics.
-    for l, o in zip(out_labels, outs):
+    for l, o in zip(model.metrics_names, outs):
       batch_logs[l] = o
     # Required for eager execution
-    metrics_results = _eager_metrics_fn(model, outs, y)
+    metrics_results = _eager_metrics_fn(
+        model, outs, y, sample_weights=sample_weights, masks=masks)
     batch_logs['loss'] = tensor_util.constant_value(backend.mean(loss))
 
     for k, v in zip(model.metrics_names,
                     [backend.mean(loss)] + loss_metrics + metrics_results):
       batch_logs[k] = tensor_util.constant_value(v)
     callbacks.on_batch_end(step_index, batch_logs)
-    if callback_model.stop_training:
+    if callbacks.model.stop_training:
       break
 
     if step_index == steps_per_epoch - 1:
@@ -293,7 +278,7 @@
         if not isinstance(val_outs, list):
           val_outs = [val_outs]
         # Same labels assumed.
-        for l, o in zip(out_labels, val_outs):
+        for l, o in zip(model.metrics_names, val_outs):
           epoch_logs['val_' + l] = o
 
 
@@ -357,10 +342,25 @@
           if val is not None else None for val in sample_weights
       ]
 
+    if step_index == 0:
+      # Get stateful metrics indices. We do not do this before the `steps` loop
+      # because model will be compiled only in the first iteration of this loop
+      # in the deferred build scenario.
+      if hasattr(model, 'metrics'):
+        for m in model.stateful_metric_functions:
+          m.reset_states()
+        stateful_metric_indices = [
+            i for i, name in enumerate(model.metrics_names)
+            if str(name) in model.stateful_metric_names
+        ]
+      else:
+        stateful_metric_indices = []
+
     # Calculate model output, loss values.
-    loss_outs, loss, loss_metrics = _model_loss(
+    loss_outs, loss, loss_metrics, masks = _model_loss(
         model, x, y, sample_weights=sample_weights, training=False)
-    metrics_results = _eager_metrics_fn(model, loss_outs, y)
+    metrics_results = _eager_metrics_fn(
+        model, loss_outs, y, sample_weights=sample_weights, masks=masks)
     batch_outs = []
     for _, v in zip(model.metrics_names,
                     [backend.mean(loss)] + loss_metrics + metrics_results):
@@ -379,7 +379,10 @@
       for _ in enumerate(batch_outs):
         outs.append(0.)
     for i, batch_out in enumerate(batch_outs):
-      outs[i] += batch_out * step_size
+      if i in stateful_metric_indices:
+        outs[i] = batch_out
+      else:
+        outs[i] += batch_out * step_size
 
     # Calculate sample size.
     num_samples += step_size
@@ -387,7 +390,8 @@
       progbar.update(step_index + 1)
 
   for i in range(len(outs)):
-    outs[i] /= num_samples
+    if i not in stateful_metric_indices:
+      outs[i] /= num_samples
   if len(outs) == 1:
     return outs[0]
   return outs
@@ -484,16 +488,20 @@
               set this to False.
 
   Returns:
-      output of the model, total loss and the loss associated with each output.
+      output of the model, total loss, the loss and the mask
+      associated with each output.
 
   Raises:
       ValueError: If the model has no loss to optimize.
   """
   with backend.learning_phase_scope(1 if training else 0):
     with GradientTape() as tape:
-      outs, loss, loss_metrics = _model_loss(model, inputs, targets,
-                                             sample_weights=sample_weights,
-                                             training=training)
+      outs, loss, loss_metrics, masks = _model_loss(
+          model,
+          inputs,
+          targets,
+          sample_weights=sample_weights,
+          training=training)
       if loss is None:
         raise ValueError('The model cannot be run '
                          'because it has no loss to optimize.')
@@ -506,7 +514,7 @@
         grads = tape.gradient(loss, model._collected_trainable_weights)
         model.optimizer.apply_gradients(zip(grads,
                                             model._collected_trainable_weights))
-    return outs, loss, loss_metrics
+    return outs, loss, loss_metrics, masks
 
 
 def train_on_batch(model, inputs, targets, sample_weights=None):
@@ -537,14 +545,18 @@
         if val is not None else None for val in sample_weights
     ]
 
-  outs, loss, _ = _process_single_batch(
+  outs, loss, loss_metrics, masks = _process_single_batch(
       model, inputs, targets, sample_weights=sample_weights, training=True)
   if not isinstance(outs, list):
     outs = [outs]
-  metrics_results = _eager_metrics_fn(model, outs, targets)
-  if not isinstance(loss, list):
-    loss = [loss]
-  return loss + metrics_results
+  metrics_results = _eager_metrics_fn(
+      model, outs, targets, sample_weights=sample_weights, masks=masks)
+  loss = generic_utils.to_list(loss)
+
+  return [
+      tensor_util.constant_value(v)
+      for v in loss + loss_metrics + metrics_results
+  ]
 
 
 def test_on_batch(model, inputs, targets, sample_weights=None):
@@ -574,14 +586,18 @@
         ops.convert_to_tensor(val, dtype=backend.floatx())
         if val is not None else None for val in sample_weights
     ]
-  outs, loss, loss_metrics = _model_loss(
+  outs, loss, loss_metrics, masks = _model_loss(
       model, inputs, targets, sample_weights=sample_weights, training=False)
   if not isinstance(outs, list):
     outs = [outs]
-  metrics_results = _eager_metrics_fn(model, outs, targets)
-  if not isinstance(loss, list):
-    loss = [loss]
-  return loss + loss_metrics + metrics_results
+  metrics_results = _eager_metrics_fn(
+      model, outs, targets, sample_weights=sample_weights, masks=masks)
+  loss = generic_utils.to_list(loss)
+
+  return [
+      tensor_util.constant_value(v)
+      for v in loss + loss_metrics + metrics_results
+  ]
 
 
 def fit_loop(model,
@@ -643,65 +659,26 @@
       shuffle=shuffle)
   # Required for eager execution
   with backend.learning_phase_scope(1):
-    do_validation = False
-    if val_inputs:
-      do_validation = True
+    do_validation = val_inputs is not None
+    callbacks = cbks.configure_callbacks(
+        callbacks,
+        model,
+        do_validation=do_validation,
+        batch_size=batch_size,
+        epochs=epochs,
+        steps_per_epoch=steps_per_epoch,
+        val_inputs=val_inputs,
+        val_targets=val_targets,
+        val_sample_weights=val_sample_weights,
+        validation_steps=validation_steps,
+        verbose=verbose)
 
-    num_train_samples = None
-    out_labels = None
-    callback_metrics = None
-    if model._is_compiled:
-      out_labels = model.metrics_names
-      if do_validation:
-        callback_metrics = copy.copy(out_labels) + [
-            'val_' + n for n in out_labels
-        ]
-      else:
-        callback_metrics = copy.copy(out_labels)
-
-    model.history = cbks.History()
-    callbacks = [cbks.BaseLogger()] + (callbacks or []) + [model.history]
-    if verbose:
-      callbacks += [cbks.ProgbarLogger('steps')]
-    callbacks = cbks.CallbackList(callbacks)
-
-    # it's possible to callback a different model than self
-    # (used by Sequential models)
-    if hasattr(model, 'callback_model') and model.callback_model:
-      callback_model = model.callback_model
-    else:
-      callback_model = model
-
-    callbacks.set_model(callback_model)
-
-    callback_params = {
-        'batch_size': batch_size,
-        'epochs': epochs,
-        'steps': steps_per_epoch,
-        'samples': num_train_samples,
-        'verbose': verbose,
-        'do_validation': do_validation,
-        'metrics': callback_metrics or [],
-    }
-    if validation_steps:
-      callback_params.update({'validation_steps': validation_steps})
-    callbacks.set_params(callback_params)
-
-    for cbk in callbacks:
-      if not val_inputs:
-        cbk.validation_data = []
-      elif isinstance(val_inputs, iterator_ops.EagerIterator):
-        cbk.validation_data = val_inputs
-      elif val_sample_weights:
-        cbk.validation_data = val_inputs + val_targets + val_sample_weights
-      else:
-        cbk.validation_data = val_inputs + val_targets
-    # validation_data must be set before on_train_begin() is called
-    # so that TensorboardCallback can validate its input
     callbacks.on_train_begin()
-    callback_model.stop_training = False
-
     for epoch in range(initial_epoch, epochs):
+      if model._is_compiled:  # Model may not be compiled the first time.
+        # Reset stateful metrics
+        for m in model.stateful_metric_functions:
+          m.reset_states()
       callbacks.on_epoch_begin(epoch)
       epoch_logs = {}
       iterator_fit_loop(
@@ -709,8 +686,6 @@
           inputs,
           class_weight,
           steps_per_epoch=steps_per_epoch,
-          callback_model=callback_model,
-          out_labels=out_labels,
           epoch_logs=epoch_logs,
           val_inputs=val_inputs,
           val_targets=val_targets,
@@ -718,12 +693,11 @@
           epochs=epochs,
           verbose=verbose,
           callbacks=callbacks,
-          callback_metrics=callback_metrics,
           validation_steps=validation_steps,
           do_validation=do_validation,
           batch_size=batch_size)
       callbacks.on_epoch_end(epoch, epoch_logs)
-      if callback_model.stop_training:
+      if callbacks.model.stop_training:
         break
   callbacks.on_train_end()
   return model.history
@@ -763,10 +737,7 @@
     return iterator_test_loop(model, inputs, steps, verbose=verbose)
 
 
-def predict_loop(model, inputs,
-                 batch_size=32,
-                 verbose=0,
-                 steps=None):
+def predict_loop(model, inputs, batch_size=32, verbose=0, steps=None):
   """Predict function for eager execution.
 
   Arguments:
diff --git a/tensorflow/python/keras/engine/training_eager_test.py b/tensorflow/python/keras/engine/training_eager_test.py
index 56f3217..db7ccb1 100644
--- a/tensorflow/python/keras/engine/training_eager_test.py
+++ b/tensorflow/python/keras/engine/training_eager_test.py
@@ -24,6 +24,7 @@
 from tensorflow.python import keras
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util as tf_test_util
+from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.platform import test
 from tensorflow.python.training.rmsprop import RMSPropOptimizer
 
@@ -44,7 +45,7 @@
     optimizer = RMSPropOptimizer(learning_rate=0.001)
     loss = 'mse'
     loss_weights = [1., 0.5]
-    metrics = ['mae']
+    metrics = ['mae', metrics_module.CategoricalAccuracy()]
     model.compile(
         optimizer,
         loss,
@@ -109,7 +110,7 @@
 
     optimizer = RMSPropOptimizer(learning_rate=0.001)
     loss = 'mse'
-    metrics = ['mae']
+    metrics = ['mae', metrics_module.CategoricalAccuracy()]
     model.compile(optimizer, loss, metrics=metrics)
 
     inputs = keras.backend.zeros(shape=(10, 3))
@@ -128,7 +129,9 @@
     model = keras.Sequential()
     model.add(keras.layers.Dense(4, input_shape=(3,)))
     optimizer = RMSPropOptimizer(learning_rate=0.001)
-    model.compile(optimizer, 'mse', metrics=['mae'])
+    model.compile(
+        optimizer, 'mse', metrics=['mae',
+                                   metrics_module.CategoricalAccuracy()])
 
     x = np.random.random((10, 3))
     y = np.random.random((10, 4))
diff --git a/tensorflow/python/keras/engine/training_generator.py b/tensorflow/python/keras/engine/training_generator.py
index 432cf2b..413c1f4 100644
--- a/tensorflow/python/keras/engine/training_generator.py
+++ b/tensorflow/python/keras/engine/training_generator.py
@@ -21,7 +21,6 @@
 
 import numpy as np
 
-from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import callbacks as cbks
 from tensorflow.python.keras.utils.data_utils import GeneratorEnqueuer
 from tensorflow.python.keras.utils.data_utils import OrderedEnqueuer
@@ -79,66 +78,37 @@
                      ' class. Please specify `validation_steps` or use'
                      ' the `keras.utils.Sequence` class.')
 
-  # Prepare display labels.
-  out_labels = model.metrics_names
-  callback_metrics = out_labels + ['val_%s' % n for n in out_labels]
-
-  # prepare callbacks
-  model.history = cbks.History()
-  callbacks = [cbks.BaseLogger()] + (callbacks or []) + [model.history]
-  if verbose:
-    callbacks += [cbks.ProgbarLogger(count_mode='steps')]
-  callbacks = cbks.CallbackList(callbacks)
-
-  # it's possible to callback a different model than self:
-  if hasattr(model, 'callback_model') and model.callback_model:
-    callback_model = model.callback_model
-  else:
-    callback_model = model
-  callbacks.set_model(callback_model)
-
-  callback_params = {
-      'epochs': epochs,
-      'steps': steps_per_epoch,
-      'verbose': verbose,
-      'do_validation': do_validation,
-      'metrics': callback_metrics,
-  }
-  if do_validation:
-    # need to create the test_function before start of the first epoch
-    # because TensorBoard callback on_epoch_begin adds summary to the
-    # list of fetches of the test_function
-    model._make_test_function()
-    # determine the number of validation batches given a generator
-    if validation_steps:
-      callback_params.update({'validation_steps': validation_steps})
-    elif isinstance(validation_data, Sequence):
-      callback_params.update({'validation_steps': len(validation_data)})
-  callbacks.set_params(callback_params)
-
   enqueuer = None
   val_enqueuer = None
 
   try:
+    val_x, val_y, val_sample_weights = validation_data, None, None
     if do_validation and not val_gen:
       # Prepare data for validation
       if len(validation_data) == 2:
         val_x, val_y = validation_data  # pylint: disable=unpacking-non-sequence
-        val_sample_weight = None
+        val_sample_weights = None
       elif len(validation_data) == 3:
-        val_x, val_y, val_sample_weight = validation_data  # pylint: disable=unpacking-non-sequence
+        val_x, val_y, val_sample_weights = validation_data  # pylint: disable=unpacking-non-sequence
       else:
         raise ValueError(
             '`validation_data` should be a tuple '
             '`(val_x, val_y, val_sample_weight)` '
             'or `(val_x, val_y)`. Found: ' + str(validation_data))
       val_x, val_y, val_sample_weights = model._standardize_user_data(
-          val_x, val_y, val_sample_weight)
-      val_data = val_x + val_y + val_sample_weights
-      if model.uses_learning_phase and not isinstance(K.learning_phase(), int):
-        val_data += [0.]
-      for cbk in callbacks:
-        cbk.validation_data = val_data
+          val_x, val_y, val_sample_weights)
+
+    callbacks = cbks.configure_callbacks(
+        callbacks,
+        model,
+        do_validation=do_validation,
+        val_inputs=val_x,
+        val_targets=val_y,
+        val_sample_weights=val_sample_weights,
+        epochs=epochs,
+        validation_steps=validation_steps,
+        steps_per_epoch=steps_per_epoch,
+        verbose=verbose)
 
     if workers > 0:
       if is_sequence:
@@ -159,9 +129,6 @@
       else:
         output_generator = generator
 
-    callback_model.stop_training = False
-    # validation_data must be set before on_train_begin() is called
-    # so that TensorboardCallback can validate its input
     callbacks.on_train_begin()
     # Construct epoch logs.
     epoch_logs = {}
@@ -205,7 +172,7 @@
 
         if not isinstance(outs, list):
           outs = [outs]
-        for l, o in zip(out_labels, outs):
+        for l, o in zip(model.metrics_names, outs):
           batch_logs[l] = o
 
         callbacks.on_batch_end(batch_index, batch_logs)
@@ -235,15 +202,15 @@
           if not isinstance(val_outs, list):
             val_outs = [val_outs]
           # Same labels assumed.
-          for l, o in zip(out_labels, val_outs):
+          for l, o in zip(model.metrics_names, val_outs):
             epoch_logs['val_' + l] = o
 
-        if callback_model.stop_training:
+        if callbacks.model.stop_training:
           break
 
       callbacks.on_epoch_end(epoch, epoch_logs)
       epoch += 1
-      if callback_model.stop_training:
+      if callbacks.model.stop_training:
         break
 
   finally:
@@ -266,7 +233,6 @@
                        use_multiprocessing=False,
                        verbose=0):
   """See docstring for `Model.evaluate_generator`."""
-  stateful_metric_indices = []
   if hasattr(model, 'metrics'):
     for m in model.stateful_metric_functions:
       m.reset_states()
@@ -364,7 +330,7 @@
         averages.append(
             np.average([out[i] for out in all_outs], weights=batch_sizes))
       else:
-        averages.append(float(all_outs[-1][i]))
+        averages.append(np.float64(all_outs[-1][i]))
     return averages
 
 
diff --git a/tensorflow/python/keras/engine/training_test.py b/tensorflow/python/keras/engine/training_test.py
index 753519f..15e7d72 100644
--- a/tensorflow/python/keras/engine/training_test.py
+++ b/tensorflow/python/keras/engine/training_test.py
@@ -30,6 +30,7 @@
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util as tf_test_util
+from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras.engine.training_utils import weighted_masked_objective
 from tensorflow.python.keras.utils.generic_utils import slice_arrays
@@ -62,8 +63,11 @@
       optimizer = RMSPropOptimizer(learning_rate=0.001)
       loss = 'mse'
       loss_weights = [1., 0.5]
-      metrics = ['mae']
-      model.compile(optimizer, loss, metrics=metrics, loss_weights=loss_weights)
+      model.compile(
+          optimizer,
+          loss,
+          metrics=[metrics_module.CategoricalAccuracy(), 'mae'],
+          loss_weights=loss_weights)
 
       input_a_np = np.random.random((10, 3))
       input_b_np = np.random.random((10, 3))
@@ -178,8 +182,10 @@
 
       # Test with lists for loss, metrics
       loss = ['mae', 'mse']
-      metrics = ['acc', 'mae']
-      model.compile(optimizer, loss, metrics=metrics)
+      model.compile(
+          optimizer,
+          loss,
+          metrics=[metrics_module.CategoricalAccuracy(), 'mae'])
       model.fit(
           [input_a_np, input_b_np], [output_d_np, output_e_np],
           epochs=1,
@@ -189,7 +195,10 @@
       # Test with dictionaries for loss, metrics, loss weights
       loss = {'dense': 'mse', 'dropout': 'mae'}
       loss_weights = {'dense': 1., 'dropout': 0.5}
-      metrics = {'dense': 'mse', 'dropout': 'mae'}
+      metrics = {
+          'dense': 'mse',
+          'dropout': metrics_module.CategoricalAccuracy()
+      }
       model.compile(optimizer, loss, metrics=metrics, loss_weights=loss_weights)
       model.fit(
           [input_a_np, input_b_np], [output_d_np, output_e_np],
@@ -258,11 +267,10 @@
       optimizer = RMSPropOptimizer(learning_rate=0.001)
       loss = 'mse'
       loss_weights = [1., 0.5]
-      metrics = ['mae']
       model.compile(
           optimizer,
           loss,
-          metrics=metrics,
+          metrics=['mae', metrics_module.CategoricalAccuracy()],
           loss_weights=loss_weights,
           sample_weight_mode=None)
 
@@ -277,20 +285,20 @@
           [input_a_np, input_b_np], [output_d_np, output_e_np],
           batch_size=5,
           verbose=0)
-      self.assertEqual(len(out), 5)
+      self.assertEqual(len(out), 7)
       out = model.evaluate(
           [input_a_np, input_b_np], [output_d_np, output_e_np],
           batch_size=5,
           verbose=1)
-      self.assertEqual(len(out), 5)
+      self.assertEqual(len(out), 7)
       out = model.evaluate(
           [input_a_np, input_b_np], [output_d_np, output_e_np],
           batch_size=5,
           verbose=2)
-      self.assertEqual(len(out), 5)
+      self.assertEqual(len(out), 7)
       out = model.test_on_batch([input_a_np, input_b_np],
                                 [output_d_np, output_e_np])
-      self.assertEqual(len(out), 5)
+      self.assertEqual(len(out), 7)
 
       # Test evaluate with dictionary inputs
       model.evaluate(
@@ -326,7 +334,7 @@
       self.assertEqual(len(out), 2)
 
   @tf_test_util.run_in_graph_and_eager_modes
-  def test_invalid_loss_or_metrics(self):
+  def test_invalid_loss(self):
     num_classes = 5
     train_samples = 1000
     test_samples = 1000
@@ -350,10 +358,6 @@
       with self.assertRaises(ValueError):
         model.fit(x_train, np.concatenate([y_train, y_train], axis=-1))
 
-      with self.assertRaises(TypeError):
-        model.compile(
-            optimizer, loss='categorical_crossentropy', metrics=set(0))
-
       if not context.executing_eagerly():
         # TODO(psv): Investigate these use cases in eager mode.
         with self.assertRaises(ValueError):
@@ -379,7 +383,11 @@
       out2 = keras.layers.Dense(4, name='dense_1')(in2)
       model = keras.Model([in1, in2], [out1, out2])
       model.predict(test_inputs, batch_size=2)
-      model.compile('rmsprop', 'mse')
+      optimizer = RMSPropOptimizer(learning_rate=0.001)
+      model.compile(
+          optimizer,
+          'mse',
+          metrics=['mae', metrics_module.CategoricalAccuracy()])
       model.fit(test_inputs, test_outputs,
                 epochs=1, batch_size=2, validation_split=0.5)
       model.evaluate(test_inputs, test_outputs, batch_size=2)
@@ -422,22 +430,24 @@
       x2 = model.predict(val_a)
       self.assertAllClose(x1, x2, atol=1e-7)
 
+  @tf_test_util.run_in_graph_and_eager_modes
   def test_compile_warning_for_loss_missing_output(self):
     with self.test_session():
       inp = keras.layers.Input(shape=(16,), name='input_a')
       out_1 = keras.layers.Dense(8, name='dense_1')(inp)
       out_2 = keras.layers.Dense(3, activation='softmax', name='dense_2')(out_1)
       model = keras.models.Model(inputs=[inp], outputs=[out_1, out_2])
+      optimizer = RMSPropOptimizer(learning_rate=0.001)
 
       with test.mock.patch.object(logging, 'warning') as mock_log:
         model.compile(
+            optimizer,
             loss={
                 'dense_2': 'categorical_crossentropy',
             },
-            optimizer='rmsprop',
             metrics={
                 'dense_2': 'categorical_accuracy',
-                'dense_1': 'categorical_accuracy',
+                'dense_1': metrics_module.CategoricalAccuracy(),
             })
         msg = ('Output "dense_1" missing from loss dictionary. We assume this '
                'was done on purpose. The fit and evaluate APIs will not be '
@@ -466,6 +476,8 @@
       model.add(keras.layers.Activation('softmax'))
       model.compile(
           loss='categorical_crossentropy',
+          metrics=['acc'],
+          weighted_metrics=['mae'],
           optimizer=RMSPropOptimizer(learning_rate=learning_rate))
 
       np.random.seed(1337)
@@ -516,7 +528,7 @@
       ref_score = model.evaluate(x_test, y_test, verbose=0)
       score = model.evaluate(
           x_test[test_ids, :], y_test[test_ids, :], verbose=0)
-      self.assertLess(score, ref_score)
+      self.assertLess(score[0], ref_score[0])
 
   @tf_test_util.run_in_graph_and_eager_modes
   def test_sample_weights(self):
@@ -537,6 +549,8 @@
       model.add(keras.layers.Activation('softmax'))
       model.compile(
           RMSPropOptimizer(learning_rate=learning_rate),
+          metrics=['acc'],
+          weighted_metrics=['mae'],
           loss='categorical_crossentropy')
 
       np.random.seed(43)
@@ -583,7 +597,7 @@
       if not context.executing_eagerly():
         score = model.evaluate(
             x_test[test_ids, :], y_test[test_ids, :], verbose=0)
-        self.assertLess(score, ref_score)
+        self.assertLess(score[0], ref_score[0])
 
   @tf_test_util.run_in_graph_and_eager_modes
   def test_temporal_sample_weights(self):
@@ -641,6 +655,8 @@
       model.compile(
           RMSPropOptimizer(learning_rate=learning_rate),
           loss='binary_crossentropy',
+          metrics=['acc'],
+          weighted_metrics=['mae'],
           sample_weight_mode='temporal')
 
       model.fit(
@@ -671,7 +687,7 @@
       if not context.executing_eagerly():
         score = model.evaluate(
             temporal_x_test[test_ids], temporal_y_test[test_ids], verbose=0)
-        self.assertLess(score, ref_score)
+        self.assertLess(score[0], ref_score[0])
 
   @tf_test_util.run_in_graph_and_eager_modes
   def test_class_weight_invalid_use_case(self):
@@ -794,7 +810,7 @@
 class LossMaskingTest(test.TestCase):
 
   @tf_test_util.run_in_graph_and_eager_modes
-  def test_masking(self):
+  def test_masking_graph_sequential(self):
     with self.test_session():
       x = np.array([[[1], [1]], [[0], [0]]])
       model = keras.models.Sequential()
@@ -808,6 +824,34 @@
       self.assertEqual(float(loss), 0.)
 
   @tf_test_util.run_in_graph_and_eager_modes
+  def test_masking_deferred_sequential(self):
+    with self.test_session():
+      x = np.array([[[1], [1]], [[0], [0]]])
+      model = keras.models.Sequential()
+      model.add(keras.layers.Masking(mask_value=0))
+      model.add(
+          keras.layers.TimeDistributed(
+              keras.layers.Dense(1, kernel_initializer='one')))
+      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+      y = np.array([[[1], [1]], [[1], [1]]])
+      loss = model.train_on_batch(x, y)
+      self.assertEqual(float(loss), 0.)
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_masking_functional(self):
+    with self.test_session():
+      x = np.array([[[1], [1]], [[0], [0]]])
+      inputs = keras.layers.Input((2, 1))
+      outputs = keras.layers.Masking(mask_value=0)(inputs)
+      outputs = keras.layers.TimeDistributed(
+          keras.layers.Dense(1, kernel_initializer='one'))(outputs)
+      model = keras.Model(inputs, outputs)
+      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+      y = np.array([[[1], [1]], [[1], [1]]])
+      loss = model.train_on_batch(x, y)
+      self.assertEqual(float(loss), 0.)
+
+  @tf_test_util.run_in_graph_and_eager_modes
   def test_mask_argument_in_layer(self):
     # Test that the mask argument gets correctly passed to a layer in the
     # functional API.
@@ -1038,7 +1082,10 @@
       x = keras.Input((2,))
       y = keras.layers.Dense(1)(x)
       fn_model = keras.models.Model(x, y)
-      fn_model.compile(loss='mse', optimizer='sgd')
+      fn_model.compile(
+          loss='mse',
+          optimizer='sgd',
+          metrics=['mae', metrics_module.CategoricalAccuracy()])
 
       seq_model = keras.models.Sequential()
       seq_model.add(keras.layers.Dense(1, input_shape=(2,)))
@@ -1120,7 +1167,10 @@
     with self.test_session():
       model = keras.models.Sequential()
       model.add(keras.layers.Dense(1, input_shape=(2,)))
-      model.compile(loss='mse', optimizer='sgd')
+      model.compile(
+          loss='mse',
+          optimizer='sgd',
+          metrics=['mae', metrics_module.CategoricalAccuracy()])
 
       model.fit_generator(custom_generator(),
                           steps_per_epoch=5,
@@ -1272,10 +1322,12 @@
       y = keras.layers.Dense(4, name='dense')(x)
       model = keras.Model(x, y)
 
-      optimizer = 'rmsprop'
+      optimizer = RMSPropOptimizer(learning_rate=0.001)
       loss = 'mse'
-      metrics = ['mae']
-      model.compile(optimizer, loss, metrics=metrics)
+      model.compile(
+          optimizer,
+          loss,
+          metrics=['mae', metrics_module.CategoricalAccuracy()])
 
       inputs = keras.backend.zeros(shape=(10, 3))
       targets = keras.backend.zeros(shape=(10, 4))
@@ -1319,8 +1371,11 @@
       optimizer = 'rmsprop'
       loss = 'mse'
       loss_weights = [1., 0.5]
-      metrics = ['mae']
-      model.compile(optimizer, loss, metrics=metrics, loss_weights=loss_weights)
+      model.compile(
+          optimizer,
+          loss,
+          metrics=['mae', metrics_module.CategoricalAccuracy()],
+          loss_weights=loss_weights)
 
       input_a_tf = keras.backend.zeros(shape=(10, 3))
       input_b_tf = keras.backend.zeros(shape=(10, 3))
@@ -1758,8 +1813,11 @@
       model.train_on_batch(input_val, None)
 
       # test with sample weights
-      model.compile(optimizer='rmsprop', loss='mse',
-                    target_tensors=[target_a, target_b])
+      model.compile(
+          optimizer='rmsprop',
+          loss='mse',
+          metrics=['mae', metrics_module.CategoricalAccuracy()],
+          target_tensors=[target_a, target_b])
       model.train_on_batch(input_val, None,
                            sample_weight={'dense_a': np.random.random((10,))})
 
@@ -1823,30 +1881,6 @@
       model.train_on_batch([input_a_np, input_b_np],
                            [output_a_np, output_b_np])
 
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_metric_names_are_identical_in_graph_and_eager(self):
-    a = keras.layers.Input(shape=(3,), name='input_a')
-    b = keras.layers.Input(shape=(3,), name='input_b')
-
-    dense = keras.layers.Dense(4, name='dense')
-    c = dense(a)
-    d = dense(b)
-    e = keras.layers.Dropout(0.5, name='dropout')(c)
-
-    model = keras.models.Model([a, b], [d, e])
-
-    optimizer = RMSPropOptimizer(learning_rate=0.001)
-    loss = 'mse'
-    loss_weights = [1., 0.5]
-    metrics = ['mae', 'acc']
-    model.compile(optimizer, loss, metrics=metrics, loss_weights=loss_weights)
-    reference_metric_names = ['loss', 'dense_loss', 'dropout_loss',
-                              'dense_mean_absolute_error',
-                              'dense_acc',
-                              'dropout_mean_absolute_error',
-                              'dropout_acc']
-    self.assertEqual(reference_metric_names, model.metrics_names)
-
 
 class TestTrainingWithDatasetIterators(test.TestCase):
 
@@ -1859,7 +1893,7 @@
 
       optimizer = RMSPropOptimizer(learning_rate=0.001)
       loss = 'mse'
-      metrics = ['mae']
+      metrics = ['mae', metrics_module.CategoricalAccuracy()]
       model.compile(optimizer, loss, metrics=metrics)
 
       inputs = np.zeros((10, 3))
@@ -1916,6 +1950,7 @@
                                    'you should specify the `steps` argument'):
         model.predict(iterator, verbose=0)
 
+  @tf_test_util.run_in_graph_and_eager_modes
   def test_get_next_op_created_once(self):
     with self.test_session():
       x = keras.layers.Input(shape=(3,), name='input')
@@ -1968,6 +2003,7 @@
 
 class TestTrainingWithDataset(test.TestCase):
 
+  @tf_test_util.run_in_graph_and_eager_modes
   def test_calling_model_on_same_dataset(self):
     with self.test_session():
       x = keras.layers.Input(shape=(3,), name='input')
@@ -2003,7 +2039,7 @@
 
       optimizer = RMSPropOptimizer(learning_rate=0.001)
       loss = 'mse'
-      metrics = ['mae']
+      metrics = ['mae', metrics_module.CategoricalAccuracy()]
       model.compile(optimizer, loss, metrics=metrics)
 
       inputs = np.zeros((10, 3))
@@ -2094,6 +2130,28 @@
   """Training tests related to metrics."""
 
   @tf_test_util.run_in_graph_and_eager_modes
+  def test_metrics_names(self):
+    a = keras.layers.Input(shape=(3,), name='input_a')
+    b = keras.layers.Input(shape=(3,), name='input_b')
+
+    dense = keras.layers.Dense(4, name='dense')
+    c = dense(a)
+    d = dense(b)
+    e = keras.layers.Dropout(0.5, name='dropout')(c)
+
+    model = keras.models.Model([a, b], [d, e])
+
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    metrics = ['mse', metrics_module.BinaryAccuracy()]
+    model.compile(optimizer, loss='mae', metrics=metrics)
+    reference_metric_names = [
+        'loss', 'dense_loss', 'dropout_loss', 'dense_mean_squared_error',
+        'dense_binary_accuracy', 'dropout_mean_squared_error',
+        'dropout_binary_accuracy'
+    ]
+    self.assertEqual(reference_metric_names, model.metrics_names)
+
+  @tf_test_util.run_in_graph_and_eager_modes
   def test_metrics_correctness(self):
     with self.test_session():
       model = keras.Sequential()
@@ -2105,7 +2163,7 @@
               1, activation='sigmoid', kernel_initializer='ones'))
       model.compile(
           loss='mae',
-          metrics=['accuracy'],
+          metrics=['accuracy', metrics_module.BinaryAccuracy()],
           optimizer=RMSPropOptimizer(learning_rate=0.001))
 
       # verify correctness of stateful and stateless metrics.
@@ -2113,41 +2171,48 @@
       y = np.ones((100, 1))
       outs = model.evaluate(x, y)
       self.assertEqual(outs[1], 1.)
+      self.assertEqual(outs[2], 1.)
 
       y = np.zeros((100, 1))
       outs = model.evaluate(x, y)
       self.assertEqual(outs[1], 0.)
+      self.assertEqual(outs[2], 0.)
 
   @tf_test_util.run_in_graph_and_eager_modes
   def test_metrics_correctness_with_iterator(self):
-    model = keras.Sequential()
-    model.add(
-        keras.layers.Dense(
-            8, activation='relu', input_dim=4, kernel_initializer='ones'))
-    model.add(
-        keras.layers.Dense(1, activation='sigmoid', kernel_initializer='ones'))
-    model.compile(
-        loss='binary_crossentropy',
-        metrics=['accuracy'],
-        optimizer=RMSPropOptimizer(learning_rate=0.001))
+    with self.test_session():
+      model = keras.Sequential()
+      model.add(
+          keras.layers.Dense(
+              8, activation='relu', input_dim=4, kernel_initializer='ones'))
+      model.add(
+          keras.layers.Dense(
+              1, activation='sigmoid', kernel_initializer='ones'))
+      model.compile(
+          loss='binary_crossentropy',
+          metrics=['accuracy', metrics_module.BinaryAccuracy()],
+          optimizer=RMSPropOptimizer(learning_rate=0.001))
 
-    np.random.seed(123)
-    x = np.random.randint(10, size=(100, 4)).astype(np.float32)
-    y = np.random.randint(2, size=(100, 1)).astype(np.float32)
-    dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
-    dataset = dataset.batch(10)
-    iterator = dataset.make_one_shot_iterator()
-    outs = model.evaluate(iterator, steps=10)
-    self.assertEqual(np.around(outs[1], decimals=1), 0.5)
+      np.random.seed(123)
+      x = np.random.randint(10, size=(100, 4)).astype(np.float32)
+      y = np.random.randint(2, size=(100, 1)).astype(np.float32)
+      dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
+      dataset = dataset.batch(10)
+      iterator = dataset.make_one_shot_iterator()
+      outs = model.evaluate(iterator, steps=10)
+      self.assertEqual(np.around(outs[1], decimals=1), 0.5)
+      self.assertEqual(np.around(outs[2], decimals=1), 0.5)
 
-    y = np.zeros((100, 1), dtype=np.float32)
-    dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
-    dataset = dataset.repeat(100)
-    dataset = dataset.batch(10)
-    iterator = dataset.make_one_shot_iterator()
-    outs = model.evaluate(iterator, steps=10)
-    self.assertEqual(outs[1], 0.)
+      y = np.zeros((100, 1), dtype=np.float32)
+      dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
+      dataset = dataset.repeat(100)
+      dataset = dataset.batch(10)
+      iterator = dataset.make_one_shot_iterator()
+      outs = model.evaluate(iterator, steps=10)
+      self.assertEqual(outs[1], 0.)
+      self.assertEqual(outs[2], 0.)
 
+  @tf_test_util.run_in_graph_and_eager_modes
   def test_metrics_correctness_with_weighted_metrics(self):
     with self.test_session():
       np.random.seed(1337)
@@ -2161,19 +2226,87 @@
           RMSPropOptimizer(learning_rate=0.001),
           loss='mse',
           sample_weight_mode='temporal',
-          weighted_metrics=['accuracy'])
+          weighted_metrics=['accuracy',
+                            metrics_module.BinaryAccuracy()])
       y = np.array([[[1.], [1.]], [[1.], [1.]]])
 
       outs = model.evaluate(x, y)
-      self.assertEqual(outs, [0.5, 0.5])
+      self.assertEqual(outs, [0.5, 0.5, 0.5])
 
       w = np.array([[0., 0.], [0., 0.]])
       outs = model.evaluate(x, y, sample_weight=w)
-      self.assertEqual(outs, [0., 0.])
+      self.assertEqual(outs, [0., 0., 0.])
 
       w = np.array([[3., 4.], [1., 2.]])
       outs = model.evaluate(x, y, sample_weight=w)
-      self.assertArrayNear(outs, [0.3, 0.7], .001)
+      self.assertArrayNear(outs, [0.3, 0.7, 0.7], .001)
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_metric_state_reset_between_fit_and_evaluate(self):
+    with self.test_session():
+      model = keras.Sequential()
+      model.add(keras.layers.Dense(3, activation='relu', input_dim=4))
+      model.add(keras.layers.Dense(1, activation='sigmoid'))
+      acc_obj = metrics_module.BinaryAccuracy()
+      model.compile(
+          loss='mae',
+          metrics=[acc_obj],
+          optimizer=RMSPropOptimizer(learning_rate=0.001))
+
+      x_train = np.random.random((100, 4))
+      y_train = np.random.random((100, 1))
+      model.fit(x_train, y_train, batch_size=5, epochs=2)
+      self.assertEqual(self.evaluate(acc_obj.count), 100)
+
+      x_test = np.random.random((10, 4))
+      y_test = np.random.random((10, 1))
+      model.evaluate(x_test, y_test, batch_size=5)
+      self.assertEqual(self.evaluate(acc_obj.count), 10)
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_invalid_metrics(self):
+    num_classes = 5
+    input_dim = 5
+
+    with self.test_session():
+      model = keras.models.Sequential()
+      model.add(
+          keras.layers.Dense(10, activation='relu', input_shape=(input_dim,)))
+      model.add(keras.layers.Dense(num_classes, activation='softmax'))
+
+      with self.assertRaisesRegexp(
+          TypeError, 'Type of `metrics` argument not understood. '
+          'Expected a list or dictionary, found: '):
+        model.compile(
+            RMSPropOptimizer(learning_rate=0.001),
+            loss='categorical_crossentropy',
+            metrics=metrics_module.CategoricalAccuracy())
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_metrics_masking(self):
+    with self.test_session():
+      np.random.seed(1337)
+      model = keras.models.Sequential()
+      model.add(keras.layers.Masking(mask_value=0, input_shape=(2, 1)))
+      model.add(
+          keras.layers.TimeDistributed(
+              keras.layers.Dense(1, kernel_initializer='ones')))
+      model.compile(
+          RMSPropOptimizer(learning_rate=0.001),
+          loss='mse',
+          weighted_metrics=['accuracy',
+                            metrics_module.BinaryAccuracy()])
+
+      # verify that masking is applied for stateless and stateful metrics.
+      x = np.array([[[1], [1]], [[1], [1]], [[0], [0]]])
+      y = np.array([[[1], [1]], [[0], [1]], [[1], [1]]])
+      scores = model.train_on_batch(x, y)
+      self.assertArrayNear(scores, [0.25, 0.75, 0.75], 0.1)
+
+      # verify that masking is combined with sample weights.
+      w = np.array([3, 2, 4])
+      scores = model.train_on_batch(x, y, sample_weight=w)
+      self.assertArrayNear(scores, [0.2, 0.8, 0.8], 0.1)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/keras/engine/training_utils.py b/tensorflow/python/keras/engine/training_utils.py
index 38b64e6..f94697c 100644
--- a/tensorflow/python/keras/engine/training_utils.py
+++ b/tensorflow/python/keras/engine/training_utils.py
@@ -570,13 +570,24 @@
     # score_array has ndim >= 2
     score_array = fn(y_true, y_pred)
     if mask is not None:
-      # Cast the mask to floatX to avoid float64 upcasting in theano
-      mask = math_ops.cast(mask, K.floatx())
-      # mask should have the same shape as score_array
-      score_array *= mask
-      #  the loss per batch should be proportional
-      #  to the number of unmasked samples.
-      score_array /= K.mean(mask)
+      mask = math_ops.cast(mask, y_pred.dtype)
+      # Update weights with mask.
+      if weights is None:
+        weights = mask
+      else:
+        # Update shape of weights if possible before adding mask.
+        # Update dimensions of weights to match with mask if possible.
+        mask, _, weights = metrics_module.squeeze_or_expand_dimensions(
+            mask, None, weights)
+        try:
+          # Broadcast weights if possible.
+          weights = weights_broadcast_ops.broadcast_weights(weights, mask)
+          weights *= mask
+        except ValueError:
+          score_array *= mask
+          score_array /= K.mean(mask)
+          # TODO(psv): Handle case when mask and weight shapes are not
+          # compatible.
 
     # Apply sample weighting.
     if weights is not None:
@@ -709,43 +720,6 @@
   return tensor_util.is_tensor(ls)
 
 
-def populate_metric_names(model):
-  for i in range(len(model.outputs)):
-    metrics = model.nested_metrics[i]
-    for metric in metrics:
-      base_metric_name = get_metric_name(metric)
-      add_metric_name(model, base_metric_name, i)
-
-
-def get_metric_name(metric, weighted=False):
-  """Returns the metric name corresponding to the given metric input.
-
-  Arguments:
-      metric: Metric function name or reference.
-      weighted: Boolean indicating if the given metric is weighted.
-
-  Returns:
-      a metric name.
-  """
-  metric_name_prefix = 'weighted_' if weighted else ''
-  if metric in ('accuracy', 'acc', 'crossentropy', 'ce'):
-    if metric in ('accuracy', 'acc'):
-      suffix = 'acc'
-    elif metric in ('crossentropy', 'ce'):
-      suffix = 'ce'
-    metric_name = metric_name_prefix + suffix
-  else:
-    metric_fn = metrics_module.get(metric)
-    # Get metric name as string
-    if hasattr(metric_fn, 'name'):
-      metric_name = metric_fn.name
-    else:
-      metric_name = metric_fn.__name__
-    metric_name = metric_name_prefix + metric_name
-
-  return metric_name
-
-
 def get_metric_function(metric, output_shape=None, loss_fn=None):
   """Returns the metric function corresponding to the given metric input.
 
@@ -776,33 +750,6 @@
   return metrics_module.get(metric)
 
 
-def add_metric_name(model, metric_name, index):
-  """Makes the metric name unique and adds it to the model's metric name list.
-
-    If there are multiple outputs for which the metrics are calculated, the
-    metric names have to be made unique by appending an integer.
-
-  Arguments:
-    model: Model to which we are adding metric names.
-    metric_name: Metric name that corresponds to the metric specified by the
-        user. For example: 'acc'
-    index: The index of the model output for which the metric name is being
-        added.
-
-  Returns:
-    string, name of the model's unique metric name
-  """
-  if len(model.output_names) > 1:
-    metric_name = '%s_%s' % (model.output_names[index], metric_name)
-  j = 1
-  base_metric_name = metric_name
-  while metric_name in model.metrics_names:
-    metric_name = '%s_%d' % (base_metric_name, j)
-    j += 1
-  model.metrics_names.append(metric_name)
-  return metric_name
-
-
 def validate_iterator_input(x, y, sample_weight, validation_split=None):
   """Validates user input arguments when a dataset iterator is passed.
 
diff --git a/tensorflow/python/keras/integration_test.py b/tensorflow/python/keras/integration_test.py
index 2a05699..a103b9f 100644
--- a/tensorflow/python/keras/integration_test.py
+++ b/tensorflow/python/keras/integration_test.py
@@ -21,9 +21,11 @@
 import numpy as np
 
 from tensorflow.python import keras
+from tensorflow.python.framework import dtypes
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.layers import core as tf_core_layers
 from tensorflow.python.ops import nn
+from tensorflow.python.ops import rnn_cell
 from tensorflow.python.platform import test
 
 
@@ -103,6 +105,30 @@
                           verbose=2)
       self.assertGreater(history.history['val_acc'][-1], 0.7)
 
+  def test_temporal_classification_sequential_tf_rnn(self):
+    with self.test_session():
+      np.random.seed(1337)
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=100,
+          test_samples=0,
+          input_shape=(4, 10),
+          num_classes=2)
+      y_train = keras.utils.to_categorical(y_train)
+
+      model = keras.models.Sequential()
+      model.add(keras.layers.RNN(rnn_cell.LSTMCell(5), return_sequences=True,
+                                 input_shape=x_train.shape[1:]))
+      model.add(keras.layers.RNN(rnn_cell.GRUCell(y_train.shape[-1],
+                                                  activation='softmax',
+                                                  dtype=dtypes.float32)))
+      model.compile(loss='categorical_crossentropy',
+                    optimizer=keras.optimizers.Adam(lr=0.1),
+                    metrics=['accuracy'])
+      history = model.fit(x_train, y_train, epochs=15, batch_size=16,
+                          validation_data=(x_train, y_train),
+                          verbose=2)
+      self.assertGreater(history.history['val_acc'][-1], 0.7)
+
   def test_image_classification_sequential(self):
     with self.test_session():
       np.random.seed(1337)
diff --git a/tensorflow/python/keras/layers/core.py b/tensorflow/python/keras/layers/core.py
index 2a42816..4032202 100644
--- a/tensorflow/python/keras/layers/core.py
+++ b/tensorflow/python/keras/layers/core.py
@@ -681,9 +681,8 @@
                         'must be a list, a tuple, or a function.')
       self._output_shape = output_shape
 
+  @tf_utils.shape_type_conversion
   def compute_output_shape(self, input_shape):
-    input_shape = tuple(tensor_shape.TensorShape(input_shape).as_list())
-
     if self._output_shape is None:
       if context.executing_eagerly():
         raise NotImplementedError
diff --git a/tensorflow/python/keras/layers/core_test.py b/tensorflow/python/keras/layers/core_test.py
index 0ff392e..49ca68e 100644
--- a/tensorflow/python/keras/layers/core_test.py
+++ b/tensorflow/python/keras/layers/core_test.py
@@ -188,6 +188,14 @@
     ld = keras.layers.Lambda.from_config(config)
 
   @tf_test_util.run_in_graph_and_eager_modes
+  def test_lambda_multiple_inputs(self):
+    ld = keras.layers.Lambda(lambda x: x[0], output_shape=lambda x: x[0])
+    x1 = np.ones([3, 2], np.float32)
+    x2 = np.ones([3, 5], np.float32)
+    out = ld([x1, x2])
+    self.assertAllEqual(out.shape, [3, 2])
+
+  @tf_test_util.run_in_graph_and_eager_modes
   def test_dense(self):
     testing_utils.layer_test(
         keras.layers.Dense, kwargs={'units': 3}, input_shape=(3, 2))
diff --git a/tensorflow/python/keras/layers/local.py b/tensorflow/python/keras/layers/local.py
index 0ebafe0..33d09a1 100644
--- a/tensorflow/python/keras/layers/local.py
+++ b/tensorflow/python/keras/layers/local.py
@@ -85,6 +85,28 @@
           the output of the layer (its "activation")..
       kernel_constraint: Constraint function applied to the kernel matrix.
       bias_constraint: Constraint function applied to the bias vector.
+      implementation: implementation mode, either `1` or `2`.
+          `1` loops over input spatial locations to perform the forward pass.
+          It is memory-efficient but performs a lot of (small) ops.
+
+          `2` stores layer weights in a dense but sparsely-populated 2D matrix
+          and implements the forward pass as a single matrix-multiply. It uses
+          a lot of RAM but performs few (large) ops.
+
+          Depending on the inputs, layer parameters, hardware, and
+          `tf.executing_eagerly()` one implementation can be dramatically faster
+          (e.g. 50X) than another.
+
+          It is recommended to benchmark both in the setting of interest to pick
+          the most efficient one (in terms of speed and memory usage).
+
+          Following scenarios could benefit from setting `implementation=2`:
+              - eager execution;
+              - inference;
+              - running on CPU;
+              - large amount of RAM available;
+              - small models (few filters, small kernel);
+              - using `padding=same` (only possible with `implementation=2`).
 
   Input shape:
       3D tensor with shape: `(batch_size, steps, input_dim)`
@@ -109,15 +131,17 @@
                activity_regularizer=None,
                kernel_constraint=None,
                bias_constraint=None,
+               implementation=1,
                **kwargs):
     super(LocallyConnected1D, self).__init__(**kwargs)
     self.filters = filters
     self.kernel_size = conv_utils.normalize_tuple(kernel_size, 1, 'kernel_size')
     self.strides = conv_utils.normalize_tuple(strides, 1, 'strides')
     self.padding = conv_utils.normalize_padding(padding)
-    if self.padding != 'valid':
+    if self.padding != 'valid' and implementation == 1:
       raise ValueError('Invalid border mode for LocallyConnected1D '
-                       '(only "valid" is supported): ' + padding)
+                       '(only "valid" is supported if implementation is 1): '
+                       + padding)
     self.data_format = conv_utils.normalize_data_format(data_format)
     self.activation = activations.get(activation)
     self.use_bias = use_bias
@@ -128,6 +152,7 @@
     self.activity_regularizer = regularizers.get(activity_regularizer)
     self.kernel_constraint = constraints.get(kernel_constraint)
     self.bias_constraint = constraints.get(bias_constraint)
+    self.implementation = implementation
     self.input_spec = InputSpec(ndim=3)
 
   @tf_utils.shape_type_conversion
@@ -142,14 +167,45 @@
                        'Found shape:', input_shape)
     self.output_length = conv_utils.conv_output_length(
         input_length, self.kernel_size[0], self.padding, self.strides[0])
-    self.kernel_shape = (self.output_length, self.kernel_size[0] * input_dim,
-                         self.filters)
-    self.kernel = self.add_weight(
-        shape=self.kernel_shape,
-        initializer=self.kernel_initializer,
-        name='kernel',
-        regularizer=self.kernel_regularizer,
-        constraint=self.kernel_constraint)
+
+    if self.implementation == 1:
+      self.kernel_shape = (self.output_length, self.kernel_size[0] * input_dim,
+                           self.filters)
+
+      self.kernel = self.add_weight(
+          shape=self.kernel_shape,
+          initializer=self.kernel_initializer,
+          name='kernel',
+          regularizer=self.kernel_regularizer,
+          constraint=self.kernel_constraint)
+
+    elif self.implementation == 2:
+      if self.data_format == 'channels_first':
+        self.kernel_shape = (input_dim, input_length,
+                             self.filters, self.output_length)
+      else:
+        self.kernel_shape = (input_length, input_dim,
+                             self.output_length, self.filters)
+
+      self.kernel = self.add_weight(shape=self.kernel_shape,
+                                    initializer=self.kernel_initializer,
+                                    name='kernel',
+                                    regularizer=self.kernel_regularizer,
+                                    constraint=self.kernel_constraint)
+
+      self.kernel_mask = get_locallyconnected_mask(
+          input_shape=(input_length,),
+          kernel_shape=self.kernel_size,
+          strides=self.strides,
+          padding=self.padding,
+          data_format=self.data_format,
+          dtype=self.kernel.dtype
+      )
+
+    else:
+      raise ValueError('Unrecognized implementation mode: %d.'
+                       % self.implementation)
+
     if self.use_bias:
       self.bias = self.add_weight(
           shape=(self.output_length, self.filters),
@@ -182,8 +238,17 @@
       return (input_shape[0], length, self.filters)
 
   def call(self, inputs):
-    output = K.local_conv(inputs, self.kernel, self.kernel_size, self.strides,
-                          (self.output_length,), self.data_format)
+    if self.implementation == 1:
+      output = K.local_conv(inputs, self.kernel, self.kernel_size, self.strides,
+                            (self.output_length,), self.data_format)
+
+    elif self.implementation == 2:
+      output = local_conv_matmul(inputs, self.kernel, self.kernel_mask,
+                                 self.compute_output_shape(inputs.shape))
+
+    else:
+      raise ValueError('Unrecognized implementation mode: %d.'
+                       % self.implementation)
 
     if self.use_bias:
       output = K.bias_add(output, self.bias, data_format=self.data_format)
@@ -220,7 +285,9 @@
         'kernel_constraint':
             constraints.serialize(self.kernel_constraint),
         'bias_constraint':
-            constraints.serialize(self.bias_constraint)
+            constraints.serialize(self.bias_constraint),
+        'implementation':
+            self.implementation
     }
     base_config = super(LocallyConnected1D, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
@@ -284,9 +351,31 @@
           the `kernel` weights matrix.
       bias_regularizer: Regularizer function applied to the bias vector.
       activity_regularizer: Regularizer function applied to
-          the output of the layer (its "activation")..
+          the output of the layer (its "activation").
       kernel_constraint: Constraint function applied to the kernel matrix.
       bias_constraint: Constraint function applied to the bias vector.
+      implementation: implementation mode, either `1` or `2`.
+          `1` loops over input spatial locations to perform the forward pass.
+          It is memory-efficient but performs a lot of (small) ops.
+
+          `2` stores layer weights in a dense but sparsely-populated 2D matrix
+          and implements the forward pass as a single matrix-multiply. It uses
+          a lot of RAM but performs few (large) ops.
+
+          Depending on the inputs, layer parameters, hardware, and
+          `tf.executing_eagerly()` one implementation can be dramatically faster
+          (e.g. 50X) than another.
+
+          It is recommended to benchmark both in the setting of interest to pick
+          the most efficient one (in terms of speed and memory usage).
+
+          Following scenarios could benefit from setting `implementation=2`:
+              - eager execution;
+              - inference;
+              - running on CPU;
+              - large amount of RAM available;
+              - small models (few filters, small kernel);
+              - using `padding=same` (only possible with `implementation=2`).
 
   Input shape:
       4D tensor with shape:
@@ -317,15 +406,17 @@
                activity_regularizer=None,
                kernel_constraint=None,
                bias_constraint=None,
+               implementation=1,
                **kwargs):
     super(LocallyConnected2D, self).__init__(**kwargs)
     self.filters = filters
     self.kernel_size = conv_utils.normalize_tuple(kernel_size, 2, 'kernel_size')
     self.strides = conv_utils.normalize_tuple(strides, 2, 'strides')
     self.padding = conv_utils.normalize_padding(padding)
-    if self.padding != 'valid':
+    if self.padding != 'valid' and implementation == 1:
       raise ValueError('Invalid border mode for LocallyConnected2D '
-                       '(only "valid" is supported): ' + padding)
+                       '(only "valid" is supported if implementation is 1): '
+                       + padding)
     self.data_format = conv_utils.normalize_data_format(data_format)
     self.activation = activations.get(activation)
     self.use_bias = use_bias
@@ -336,6 +427,7 @@
     self.activity_regularizer = regularizers.get(activity_regularizer)
     self.kernel_constraint = constraints.get(kernel_constraint)
     self.bias_constraint = constraints.get(bias_constraint)
+    self.implementation = implementation
     self.input_spec = InputSpec(ndim=4)
 
   @tf_utils.shape_type_conversion
@@ -357,15 +449,47 @@
                                                self.padding, self.strides[1])
     self.output_row = output_row
     self.output_col = output_col
-    self.kernel_shape = (
-        output_row * output_col,
-        self.kernel_size[0] * self.kernel_size[1] * input_filter, self.filters)
-    self.kernel = self.add_weight(
-        shape=self.kernel_shape,
-        initializer=self.kernel_initializer,
-        name='kernel',
-        regularizer=self.kernel_regularizer,
-        constraint=self.kernel_constraint)
+
+    if self.implementation == 1:
+      self.kernel_shape = (
+          output_row * output_col,
+          self.kernel_size[0] * self.kernel_size[1] * input_filter,
+          self.filters)
+
+      self.kernel = self.add_weight(
+          shape=self.kernel_shape,
+          initializer=self.kernel_initializer,
+          name='kernel',
+          regularizer=self.kernel_regularizer,
+          constraint=self.kernel_constraint)
+
+    elif self.implementation == 2:
+      if self.data_format == 'channels_first':
+        self.kernel_shape = (input_filter, input_row, input_col,
+                             self.filters, self.output_row, self.output_col)
+      else:
+        self.kernel_shape = (input_row, input_col, input_filter,
+                             self.output_row, self.output_col, self.filters)
+
+      self.kernel = self.add_weight(shape=self.kernel_shape,
+                                    initializer=self.kernel_initializer,
+                                    name='kernel',
+                                    regularizer=self.kernel_regularizer,
+                                    constraint=self.kernel_constraint)
+
+      self.kernel_mask = get_locallyconnected_mask(
+          input_shape=(input_row, input_col),
+          kernel_shape=self.kernel_size,
+          strides=self.strides,
+          padding=self.padding,
+          data_format=self.data_format,
+          dtype=self.kernel.dtype
+      )
+
+    else:
+      raise ValueError('Unrecognized implementation mode: %d.'
+                       % self.implementation)
+
     if self.use_bias:
       self.bias = self.add_weight(
           shape=(output_row, output_col, self.filters),
@@ -401,8 +525,18 @@
       return (input_shape[0], rows, cols, self.filters)
 
   def call(self, inputs):
-    output = K.local_conv(inputs, self.kernel, self.kernel_size, self.strides,
-                          (self.output_row, self.output_col), self.data_format)
+    if self.implementation == 1:
+      output = K.local_conv(inputs, self.kernel, self.kernel_size, self.strides,
+                            (self.output_row, self.output_col),
+                            self.data_format)
+
+    elif self.implementation == 2:
+      output = local_conv_matmul(inputs, self.kernel, self.kernel_mask,
+                                 self.compute_output_shape(inputs.shape))
+
+    else:
+      raise ValueError('Unrecognized implementation mode: %d.'
+                       % self.implementation)
 
     if self.use_bias:
       output = K.bias_add(output, self.bias, data_format=self.data_format)
@@ -439,7 +573,157 @@
         'kernel_constraint':
             constraints.serialize(self.kernel_constraint),
         'bias_constraint':
-            constraints.serialize(self.bias_constraint)
+            constraints.serialize(self.bias_constraint),
+        'implementation':
+            self.implementation
     }
     base_config = super(LocallyConnected2D, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
+
+
+def get_locallyconnected_mask(input_shape,
+                              kernel_shape,
+                              strides,
+                              padding,
+                              data_format,
+                              dtype):
+  """Return a mask representing connectivity of a locally-connected operation.
+
+  This method returns a masking tensor of 0s and 1s (of type `dtype`) that,
+  when element-wise multiplied with a fully-connected weight tensor, masks out
+  the weights between disconnected input-output pairs and thus implements local
+  connectivity through a sparse fully-connected weight tensor.
+
+  Assume an unshared convolution with given parameters is applied to an input
+  having N spatial dimensions with `input_shape = (d_in1, ..., d_inN)`
+  to produce an output with spatial shape `(d_out1, ..., d_outN)` (determined
+  by layer parameters such as `strides`).
+
+  This method returns a mask which can be broadcast-multiplied (element-wise)
+  with a 2*(N+1)-D weight matrix (equivalent to a fully-connected layer between
+  (N+1)-D activations (N spatial + 1 channel dimensions for input and output)
+  to make it perform an unshared convolution with given `kernel_shape`,
+  `strides`, `padding` and `data_format`.
+
+  Arguments:
+    input_shape: tuple of size N: `(d_in1, ..., d_inN)`
+                 spatial shape of the input.
+    kernel_shape: tuple of size N, spatial shape of the convolutional kernel
+                  / receptive field.
+    strides: tuple of size N, strides along each spatial dimension.
+    padding: type of padding, string `"same"` or `"valid"`.
+    data_format: a string, `"channels_first"` or `"channels_last"`.
+    dtype: type of the layer operation, e.g. `tf.float64`.
+
+  Returns:
+    a `dtype`-tensor of shape
+    `(1, d_in1, ..., d_inN, 1, d_out1, ..., d_outN)`
+    if `data_format == `"channels_first"`, or
+    `(d_in1, ..., d_inN, 1, d_out1, ..., d_outN, 1)`
+    if `data_format == "channels_last"`.
+
+  Raises:
+    ValueError: if `data_format` is neither `"channels_first"` nor
+                `"channels_last"`.
+  """
+  mask = conv_utils.conv_kernel_mask(
+      input_shape=input_shape,
+      kernel_shape=kernel_shape,
+      strides=strides,
+      padding=padding
+  )
+
+  ndims = int(mask.ndim / 2)
+  mask = K.variable(mask, dtype)
+
+  if data_format == 'channels_first':
+    mask = K.expand_dims(mask, 0)
+    mask = K.expand_dims(mask, - ndims - 1)
+
+  elif data_format == 'channels_last':
+    mask = K.expand_dims(mask, ndims)
+    mask = K.expand_dims(mask, -1)
+
+  else:
+    raise ValueError('Unrecognized data_format: ' + str(data_format))
+
+  return mask
+
+
+def local_conv_matmul(inputs, kernel, kernel_mask, output_shape):
+  """Apply N-D convolution with un-shared weights using a single matmul call.
+
+  This method outputs `inputs . (kernel * kernel_mask)`
+  (with `.` standing for matrix-multiply and `*` for element-wise multiply)
+  and requires a precomputed `kernel_mask` to zero-out weights in `kernel` and
+  hence perform the same operation as a convolution with un-shared
+  (the remaining entries in `kernel`) weights. It also does the necessary
+  reshapes to make `inputs` and `kernel` 2-D and `output` (N+2)-D.
+
+  Arguments:
+      inputs: (N+2)-D tensor with shape
+          `(batch_size, channels_in, d_in1, ..., d_inN)`
+          or
+          `(batch_size, d_in1, ..., d_inN, channels_in)`.
+      kernel: the unshared weights for N-D convolution,
+          an (N+2)-D tensor of shape:
+          `(d_in1, ..., d_inN, channels_in, d_out2, ..., d_outN, channels_out)`
+          or
+          `(channels_in, d_in1, ..., d_inN, channels_out, d_out2, ..., d_outN)`,
+          with the ordering of channels and spatial dimensions matching
+          that of the input.
+          Each entry is the weight between a particular input and
+          output location, similarly to a fully-connected weight matrix.
+      kernel_mask: a float 0/1 mask tensor of shape:
+           `(d_in1, ..., d_inN, 1, d_out2, ..., d_outN, 1)`
+           or
+           `(1, d_in1, ..., d_inN, 1, d_out2, ..., d_outN)`,
+           with the ordering of singleton and spatial dimensions
+           matching that of the input.
+           Mask represents the connectivity pattern of the layer and is
+           precomputed elsewhere based on layer parameters: stride,
+           padding, and the receptive field shape.
+      output_shape: a tuple of (N+2) elements representing the output shape:
+          `(batch_size, channels_out, d_out1, ..., d_outN)`
+          or
+          `(batch_size, d_out1, ..., d_outN, channels_out)`,
+          with the ordering of channels and spatial dimensions matching that of
+          the input.
+
+  Returns:
+      Output (N+2)-D tensor with shape `output_shape`.
+  """
+  inputs_flat = K.reshape(inputs, (K.shape(inputs)[0], -1))
+
+  kernel = kernel_mask * kernel
+  kernel = make_2d(kernel, split_dim=K.ndim(kernel) // 2)
+
+  output_flat = K.math_ops.sparse_matmul(inputs_flat, kernel, b_is_sparse=True)
+  output = K.reshape(output_flat,
+                     [K.shape(output_flat)[0],] + output_shape.as_list()[1:])
+  return output
+
+
+def make_2d(tensor, split_dim):
+  """Reshapes an N-dimensional tensor into a 2D tensor.
+
+  Dimensions before (excluding) and after (including) `split_dim` are grouped
+  together.
+
+  Arguments:
+    tensor: a tensor of shape `(d0, ..., d(N-1))`.
+    split_dim: an integer from 1 to N-1, index of the dimension to group
+        dimensions before (excluding) and after (including).
+
+  Returns:
+    Tensor of shape
+    `(d0 * ... * d(split_dim-1), d(split_dim) * ... * d(N-1))`.
+  """
+  shape = K.array_ops.shape(tensor)
+  in_dims = shape[:split_dim]
+  out_dims = shape[split_dim:]
+
+  in_size = K.math_ops.reduce_prod(in_dims)
+  out_size = K.math_ops.reduce_prod(out_dims)
+
+  return K.array_ops.reshape(tensor, (in_size, out_size))
diff --git a/tensorflow/python/keras/layers/local_test.py b/tensorflow/python/keras/layers/local_test.py
index 9639e02..4781bca 100644
--- a/tensorflow/python/keras/layers/local_test.py
+++ b/tensorflow/python/keras/layers/local_test.py
@@ -24,6 +24,7 @@
 from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
+from tensorflow.python.training.rmsprop import RMSPropOptimizer
 
 
 class LocallyConnectedLayersTest(test.TestCase):
@@ -36,21 +37,30 @@
     filter_length = 3
     filters = 4
 
-    for padding in ['valid']:
+    for padding in ['valid', 'same']:
       for strides in [1]:
         if padding == 'same' and strides != 1:
           continue
         for data_format in ['channels_first', 'channels_last']:
-          testing_utils.layer_test(
-              keras.layers.LocallyConnected1D,
-              kwargs={
-                  'filters': filters,
-                  'kernel_size': filter_length,
-                  'padding': padding,
-                  'strides': strides,
-                  'data_format': data_format
-              },
-              input_shape=(num_samples, num_steps, input_dim))
+          for implementation in [1, 2]:
+            kwargs = {
+                'filters': filters,
+                'kernel_size': filter_length,
+                'padding': padding,
+                'strides': strides,
+                'data_format': data_format,
+                'implementation': implementation
+            }
+
+            if padding == 'same' and implementation == 1:
+              self.assertRaises(ValueError,
+                                keras.layers.LocallyConnected1D,
+                                **kwargs)
+            else:
+              testing_utils.layer_test(
+                  keras.layers.LocallyConnected1D,
+                  kwargs=kwargs,
+                  input_shape=(num_samples, num_steps, input_dim))
 
   def test_locallyconnected_1d_regularization(self):
     num_samples = 2
@@ -59,38 +69,47 @@
     filter_length = 3
     filters = 4
     for data_format in ['channels_first', 'channels_last']:
-      kwargs = {
-          'filters': filters,
-          'kernel_size': filter_length,
-          'kernel_regularizer': 'l2',
-          'bias_regularizer': 'l2',
-          'activity_regularizer': 'l2',
-          'data_format': data_format
-      }
+      for padding in ['valid', 'same']:
+        for implementation in [1, 2]:
+          kwargs = {
+              'filters': filters,
+              'kernel_size': filter_length,
+              'kernel_regularizer': 'l2',
+              'bias_regularizer': 'l2',
+              'activity_regularizer': 'l2',
+              'data_format': data_format,
+              'implementation': implementation,
+              'padding': padding
+          }
 
-      with self.test_session():
-        layer = keras.layers.LocallyConnected1D(**kwargs)
-        layer.build((num_samples, num_steps, input_dim))
-        self.assertEqual(len(layer.losses), 2)
-        layer(
-            keras.backend.variable(np.ones((num_samples,
-                                            num_steps,
-                                            input_dim))))
-        self.assertEqual(len(layer.losses), 3)
+          if padding == 'same' and implementation == 1:
+            self.assertRaises(ValueError,
+                              keras.layers.LocallyConnected1D,
+                              **kwargs)
+          else:
+            with self.test_session():
+              layer = keras.layers.LocallyConnected1D(**kwargs)
+              layer.build((num_samples, num_steps, input_dim))
+              self.assertEqual(len(layer.losses), 2)
+              layer(
+                  keras.backend.variable(np.ones((num_samples,
+                                                  num_steps,
+                                                  input_dim))))
+              self.assertEqual(len(layer.losses), 3)
 
-      k_constraint = keras.constraints.max_norm(0.01)
-      b_constraint = keras.constraints.max_norm(0.01)
-      kwargs = {
-          'filters': filters,
-          'kernel_size': filter_length,
-          'kernel_constraint': k_constraint,
-          'bias_constraint': b_constraint,
-      }
-      with self.test_session():
-        layer = keras.layers.LocallyConnected1D(**kwargs)
-        layer.build((num_samples, num_steps, input_dim))
-        self.assertEqual(layer.kernel.constraint, k_constraint)
-        self.assertEqual(layer.bias.constraint, b_constraint)
+            k_constraint = keras.constraints.max_norm(0.01)
+            b_constraint = keras.constraints.max_norm(0.01)
+            kwargs = {
+                'filters': filters,
+                'kernel_size': filter_length,
+                'kernel_constraint': k_constraint,
+                'bias_constraint': b_constraint,
+            }
+            with self.test_session():
+              layer = keras.layers.LocallyConnected1D(**kwargs)
+              layer.build((num_samples, num_steps, input_dim))
+              self.assertEqual(layer.kernel.constraint, k_constraint)
+              self.assertEqual(layer.bias.constraint, b_constraint)
 
   @tf_test_util.run_in_graph_and_eager_modes
   def test_locallyconnected_2d(self):
@@ -100,23 +119,32 @@
     num_row = 6
     num_col = 10
 
-    for padding in ['valid']:
+    for padding in ['valid', 'same']:
       for strides in [(1, 1), (2, 2)]:
-        if padding == 'same' and strides != (1, 1):
-          continue
+        for implementation in [1, 2]:
+          if padding == 'same' and strides != (1, 1):
+            continue
 
-        testing_utils.layer_test(
-            keras.layers.LocallyConnected2D,
-            kwargs={
-                'filters': filters,
-                'kernel_size': 3,
-                'padding': padding,
-                'kernel_regularizer': 'l2',
-                'bias_regularizer': 'l2',
-                'strides': strides,
-                'data_format': 'channels_last'
-            },
-            input_shape=(num_samples, num_row, num_col, stack_size))
+          kwargs = {
+              'filters': filters,
+              'kernel_size': 3,
+              'padding': padding,
+              'kernel_regularizer': 'l2',
+              'bias_regularizer': 'l2',
+              'strides': strides,
+              'data_format': 'channels_last',
+              'implementation': implementation
+          }
+
+          if padding == 'same' and implementation == 1:
+            self.assertRaises(ValueError,
+                              keras.layers.LocallyConnected2D,
+                              **kwargs)
+          else:
+            testing_utils.layer_test(
+                keras.layers.LocallyConnected2D,
+                kwargs=kwargs,
+                input_shape=(num_samples, num_row, num_col, stack_size))
 
   @tf_test_util.run_in_graph_and_eager_modes
   def test_locallyconnected_2d_channels_first(self):
@@ -126,14 +154,25 @@
     num_row = 6
     num_col = 10
 
-    testing_utils.layer_test(
-        keras.layers.LocallyConnected2D,
-        kwargs={
+    for implementation in [1, 2]:
+      for padding in ['valid', 'same']:
+        kwargs = {
             'filters': filters,
             'kernel_size': 3,
-            'data_format': 'channels_first'
-        },
-        input_shape=(num_samples, num_row, num_col, stack_size))
+            'data_format': 'channels_first',
+            'implementation': implementation,
+            'padding': padding
+        }
+
+        if padding == 'same' and implementation == 1:
+          self.assertRaises(ValueError,
+                            keras.layers.LocallyConnected2D,
+                            **kwargs)
+        else:
+          testing_utils.layer_test(
+              keras.layers.LocallyConnected2D,
+              kwargs=kwargs,
+              input_shape=(num_samples, num_row, num_col, stack_size))
 
   def test_locallyconnected_2d_regularization(self):
     num_samples = 8
@@ -141,35 +180,271 @@
     stack_size = 4
     num_row = 6
     num_col = 10
-    kwargs = {
-        'filters': filters,
-        'kernel_size': 3,
-        'kernel_regularizer': 'l2',
-        'bias_regularizer': 'l2',
-        'activity_regularizer': 'l2',
-    }
-    with self.test_session():
-      layer = keras.layers.LocallyConnected2D(**kwargs)
-      layer.build((num_samples, num_row, num_col, stack_size))
-      self.assertEqual(len(layer.losses), 2)
-      layer(
-          keras.backend.variable(
-              np.ones((num_samples, num_row, num_col, stack_size))))
-      self.assertEqual(len(layer.losses), 3)
+    for implementation in [1, 2]:
+      for padding in ['valid', 'same']:
+        kwargs = {
+            'filters': filters,
+            'kernel_size': 3,
+            'kernel_regularizer': 'l2',
+            'bias_regularizer': 'l2',
+            'activity_regularizer': 'l2',
+            'implementation': implementation,
+            'padding': padding
+        }
 
-    k_constraint = keras.constraints.max_norm(0.01)
-    b_constraint = keras.constraints.max_norm(0.01)
-    kwargs = {
-        'filters': filters,
-        'kernel_size': 3,
-        'kernel_constraint': k_constraint,
-        'bias_constraint': b_constraint,
-    }
-    with self.test_session():
-      layer = keras.layers.LocallyConnected2D(**kwargs)
-      layer.build((num_samples, num_row, num_col, stack_size))
-      self.assertEqual(layer.kernel.constraint, k_constraint)
-      self.assertEqual(layer.bias.constraint, b_constraint)
+        if padding == 'same' and implementation == 1:
+          self.assertRaises(ValueError,
+                            keras.layers.LocallyConnected2D,
+                            **kwargs)
+        else:
+          with self.test_session():
+            layer = keras.layers.LocallyConnected2D(**kwargs)
+            layer.build((num_samples, num_row, num_col, stack_size))
+            self.assertEqual(len(layer.losses), 2)
+            layer(
+                keras.backend.variable(
+                    np.ones((num_samples, num_row, num_col, stack_size))))
+            self.assertEqual(len(layer.losses), 3)
+
+          k_constraint = keras.constraints.max_norm(0.01)
+          b_constraint = keras.constraints.max_norm(0.01)
+          kwargs = {
+              'filters': filters,
+              'kernel_size': 3,
+              'kernel_constraint': k_constraint,
+              'bias_constraint': b_constraint,
+          }
+          with self.test_session():
+            layer = keras.layers.LocallyConnected2D(**kwargs)
+            layer.build((num_samples, num_row, num_col, stack_size))
+            self.assertEqual(layer.kernel.constraint, k_constraint)
+            self.assertEqual(layer.bias.constraint, b_constraint)
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_locallyconnected_implementation(self):
+    n_train = 4
+    n_classes = 3
+    n_epochs = 2
+
+    np.random.seed(1)
+    targets = np.random.randint(0, n_classes, (n_train,))
+
+    for width in [1, 17]:
+      for height in [16]:
+        for filters in [2]:
+          for data_format in ['channels_first', 'channels_last']:
+            inputs = get_inputs(data_format, filters, height, n_train, width)
+
+            for kernel_x in [(3,)]:
+              for kernel_y in [()] if width == 1 else [(2,)]:
+                for stride_x in [(1,)]:
+                  for stride_y in [()] if width == 1 else [(3,)]:
+                    for layers in [2]:
+                      kwargs = {
+                          'layers': layers,
+                          'filters': filters,
+                          'kernel_size': kernel_x + kernel_y,
+                          'strides': stride_x + stride_y,
+                          'data_format': data_format,
+                          'n_classes': n_classes,
+                          'input_shape': inputs.shape
+                      }
+
+                      model_1 = get_model(implementation=1, **kwargs)
+                      model_2 = get_model(implementation=2, **kwargs)
+
+                      copy_model_weights(model_2, model_1)
+
+                      # Compare outputs at initialization.
+                      out_1 = model_1.call(inputs)
+                      out_2 = model_2.call(inputs)
+                      self.assertAllCloseAccordingToType(out_1, out_2,
+                                                         rtol=1e-5, atol=1e-5)
+
+                      # Train.
+                      model_1.fit(x=inputs,
+                                  y=targets,
+                                  epochs=n_epochs,
+                                  batch_size=n_train)
+
+                      model_2.fit(x=inputs,
+                                  y=targets,
+                                  epochs=n_epochs,
+                                  batch_size=n_train)
+
+                      # Compare outputs after a few training steps.
+                      out_1 = model_1.call(inputs)
+                      out_2 = model_2.call(inputs)
+                      self.assertAllCloseAccordingToType(out_1, out_2,
+                                                         rtol=1e-5, atol=1e-5)
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_make_2d(self):
+    input_shapes = [
+        (0,),
+        (0, 0),
+        (1,),
+        (2,),
+        (3,),
+        (1, 0),
+        (0, 3),
+        (1, 1),
+        (1, 2),
+        (3, 1),
+        (2, 2),
+        (3, 3),
+        (1, 0, 1),
+        (5, 2, 3),
+        (3, 5, 6, 7, 0),
+        (3, 2, 2, 4, 4),
+        (1, 2, 3, 4, 7, 2),
+    ]
+    np.random.seed(1)
+
+    for input_shape in input_shapes:
+      inputs = np.random.normal(0, 1, input_shape)
+      inputs_tf = keras.backend.variable(inputs)
+
+      split_dim = np.random.randint(0, inputs.ndim + 1)
+      shape_2d = (int(np.prod(inputs.shape[:split_dim])),
+                  int(np.prod(inputs.shape[split_dim:])))
+      inputs_2d = np.reshape(inputs, shape_2d)
+
+      inputs_2d_tf = keras.layers.local.make_2d(inputs_tf, split_dim)
+      inputs_2d_tf = keras.backend.get_value(inputs_2d_tf)
+
+      self.assertAllCloseAccordingToType(inputs_2d, inputs_2d_tf)
+
+
+def get_inputs(data_format, filters, height, n_train, width):
+  if data_format == 'channels_first':
+    if width == 1:
+      input_shape = (filters, height)
+    else:
+      input_shape = (filters, height, width)
+
+  elif data_format == 'channels_last':
+    if width == 1:
+      input_shape = (height, filters)
+    else:
+      input_shape = (height, width, filters)
+
+  else:
+    raise NotImplementedError(data_format)
+
+  inputs = np.random.normal(0, 1,
+                            (n_train,) + input_shape).astype(np.float32)
+  return inputs
+
+
+def xent(y_true, y_pred):
+  y_true = keras.backend.cast(
+      keras.backend.reshape(y_true, (-1,)),
+      keras.backend.dtypes_module.int32)
+
+  return keras.backend.nn.sparse_softmax_cross_entropy_with_logits(
+      labels=y_true,
+      logits=y_pred)
+
+
+def get_model(implementation,
+              filters,
+              kernel_size,
+              strides,
+              layers,
+              n_classes,
+              data_format,
+              input_shape):
+  model = keras.Sequential()
+
+  if len(kernel_size) == 1:
+    lc_layer = keras.layers.LocallyConnected1D
+  elif len(kernel_size) == 2:
+    lc_layer = keras.layers.LocallyConnected2D
+  else:
+    raise NotImplementedError(kernel_size)
+
+  for _ in range(layers):
+    model.add(lc_layer(
+        padding='valid',
+        kernel_initializer=keras.initializers.random_normal(),
+        bias_initializer=keras.initializers.random_normal(),
+        filters=filters,
+        strides=strides,
+        kernel_size=kernel_size,
+        activation=keras.activations.relu,
+        data_format=data_format,
+        implementation=implementation))
+
+  model.add(keras.layers.Flatten())
+  model.add(keras.layers.Dense(n_classes))
+  model.compile(
+      optimizer=RMSPropOptimizer(0.01),
+      metrics=[keras.metrics.categorical_accuracy],
+      loss=xent
+  )
+  model.build(input_shape)
+  return model
+
+
+def copy_lc_weights(lc_layer_2_from, lc_layer_1_to):
+  lc_2_kernel, lc_2_bias = lc_layer_2_from.weights
+  lc_2_kernel_masked = lc_2_kernel * lc_layer_2_from.kernel_mask
+
+  data_format = lc_layer_2_from.data_format
+
+  if data_format == 'channels_first':
+    if isinstance(lc_layer_2_from, keras.layers.LocallyConnected1D):
+      permutation = (3, 0, 1, 2)
+    elif isinstance(lc_layer_2_from, keras.layers.LocallyConnected2D):
+      permutation = (4, 5, 0, 1, 2, 3)
+    else:
+      raise NotImplementedError(lc_layer_2_from)
+
+  elif data_format == 'channels_last':
+    if isinstance(lc_layer_2_from, keras.layers.LocallyConnected1D):
+      permutation = (2, 0, 1, 3)
+    elif isinstance(lc_layer_2_from, keras.layers.LocallyConnected2D):
+      permutation = (3, 4, 0, 1, 2, 5)
+    else:
+      raise NotImplementedError(lc_layer_2_from)
+
+  else:
+    raise NotImplementedError(data_format)
+
+  lc_2_kernel_masked = keras.backend.permute_dimensions(
+      lc_2_kernel_masked, permutation)
+
+  lc_2_kernel_mask = keras.backend.math_ops.not_equal(
+      lc_2_kernel_masked, 0)
+  lc_2_kernel_flat = keras.backend.array_ops.boolean_mask(
+      lc_2_kernel_masked, lc_2_kernel_mask)
+  lc_2_kernel_reshaped = keras.backend.reshape(lc_2_kernel_flat,
+                                               lc_layer_1_to.kernel.shape)
+
+  lc_2_kernel_reshaped = keras.backend.get_value(lc_2_kernel_reshaped)
+  lc_2_bias = keras.backend.get_value(lc_2_bias)
+
+  lc_layer_1_to.set_weights([lc_2_kernel_reshaped, lc_2_bias])
+
+
+def copy_model_weights(model_2_from, model_1_to):
+  for l in range(len(model_2_from.layers)):
+    layer_2_from = model_2_from.layers[l]
+    layer_1_to = model_1_to.layers[l]
+
+    if isinstance(layer_2_from, (keras.layers.LocallyConnected2D,
+                                 keras.layers.LocallyConnected1D)):
+      copy_lc_weights(layer_2_from, layer_1_to)
+
+    elif isinstance(layer_2_from, keras.layers.Dense):
+      weights_2, bias_2 = layer_2_from.weights
+      weights_2 = keras.backend.get_value(weights_2)
+      bias_2 = keras.backend.get_value(bias_2)
+      layer_1_to.set_weights([weights_2, bias_2])
+
+    else:
+      continue
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/keras/layers/normalization.py b/tensorflow/python/keras/layers/normalization.py
index a7835bc..cd26e04 100644
--- a/tensorflow/python/keras/layers/normalization.py
+++ b/tensorflow/python/keras/layers/normalization.py
@@ -36,7 +36,7 @@
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -345,16 +345,16 @@
               aggregation=variable_scope.VariableAggregation.MEAN)
           return var
 
-        with distribute_lib.get_distribution_strategy().colocate_vars_with(
-            self.moving_mean):
+        with distribution_strategy_context.get_distribution_strategy(
+        ).colocate_vars_with(self.moving_mean):
           self.renorm_mean = _renorm_variable('renorm_mean', param_shape)
           self.renorm_mean_weight = _renorm_variable('renorm_mean_weight', ())
         # We initialize renorm_stddev to 0, and maintain the (0-initialized)
         # renorm_stddev_weight. This allows us to (1) mix the average
         # stddev with the minibatch stddev early in training, and (2) compute
         # the unbiased average stddev by dividing renorm_stddev by the weight.
-        with distribute_lib.get_distribution_strategy().colocate_vars_with(
-            self.moving_variance):
+        with distribution_strategy_context.get_distribution_strategy(
+        ).colocate_vars_with(self.moving_variance):
           self.renorm_stddev = _renorm_variable('renorm_stddev', param_shape)
           self.renorm_stddev_weight = _renorm_variable('renorm_stddev_weight',
                                                        ())
diff --git a/tensorflow/python/keras/layers/recurrent.py b/tensorflow/python/keras/layers/recurrent.py
index a8bfdf2..12c82a5 100644
--- a/tensorflow/python/keras/layers/recurrent.py
+++ b/tensorflow/python/keras/layers/recurrent.py
@@ -19,10 +19,10 @@
 from __future__ import division
 from __future__ import print_function
 
-import numbers
 import numpy as np
 
 from tensorflow.python.eager import context
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras import activations
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import constraints
@@ -37,6 +37,7 @@
 from tensorflow.python.ops import state_ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -86,17 +87,24 @@
     # (assuming one LSTM has states [h, c])
     state_size = []
     for cell in self.cells[::-1]:
-      if hasattr(cell.state_size, '__len__'):
+      if _is_multiple_state(cell.state_size):
         state_size += list(cell.state_size)
       else:
         state_size.append(cell.state_size)
     return tuple(state_size)
 
+  @property
+  def output_size(self):
+    if hasattr(self.cells[-1], 'output_size'):
+      return self.cells[-1].output_size
+    else:
+      return self.state_size[0]
+
   def call(self, inputs, states, constants=None, **kwargs):
     # Recover per-cell states.
     nested_states = []
     for cell in self.cells[::-1]:
-      if hasattr(cell.state_size, '__len__'):
+      if _is_multiple_state(cell.state_size):
         nested_states.append(states[:len(cell.state_size)])
         states = states[len(cell.state_size):]
       else:
@@ -133,11 +141,12 @@
           cell.build([input_shape] + constants_shape)
         else:
           cell.build(input_shape)
-      if hasattr(cell.state_size, '__len__'):
+      if _is_multiple_state(cell.state_size):
         output_dim = cell.state_size[0]
       else:
         output_dim = cell.state_size
-      input_shape = (input_shape[0], output_dim)
+      input_shape = tuple([input_shape[0]] +
+                          tensor_shape.as_shape(output_dim).as_list())
     self.built = True
 
   def get_config(self):
@@ -242,13 +251,16 @@
               cell can also take the optional argument `constants`, see
               section "Note on passing external constants" below.
           - a `state_size` attribute. This can be a single integer
-              (single state) in which case it is
-              the size of the recurrent state
-              (which should be the same as the size of the cell output).
-              This can also be a list/tuple of integers
-              (one size per state). In this case, the first entry
-              (`state_size[0]`) should be the same as
-              the size of the cell output.
+              (single state) in which case it is the size of the recurrent
+              state. This can also be a list/tuple of integers (one size per
+              state).
+              The `state_size` can also be a TensorShape or a tuple/list of
+              TensorShapes, to represent a high dimension state.
+          - an `output_size` attribute. This can be a single integer or a
+              TensorShape, which represents the shape of the output. For
+              backward compatibility, if this attribute is not available for
+              the cell, the value will be inferred from the first element of
+              the `state_size`.
           In the case that `cell` is a list of RNN cell instances, the cells
           will be stacked on after the other in the RNN, implementing an
           efficient stacked RNN.
@@ -268,9 +280,8 @@
           Unrolling can speed-up a RNN,
           although it tends to be more memory-intensive.
           Unrolling is only suitable for short sequences.
-      input_dim: dimensionality of the input (integer).
-          This argument (or alternatively,
-          the keyword argument `input_shape`)
+      input_dim: dimensionality of the input (integer or tuple of integers).
+          This argument (or alternatively, the keyword argument `input_shape`)
           is required when using this layer as the first layer in a model.
       input_length: Length of input sequences, to be specified
           when it is constant.
@@ -283,15 +294,18 @@
           (e.g. via the `input_shape` argument)
 
   Input shape:
-      3D tensor with shape `(batch_size, timesteps, input_dim)`.
+      N-D tensor with shape `(batch_size, timesteps, ...)`.
 
   Output shape:
       - if `return_state`: a list of tensors. The first tensor is
           the output. The remaining tensors are the last states,
-          each with shape `(batch_size, units)`.
-      - if `return_sequences`: 3D tensor with shape
-          `(batch_size, timesteps, units)`.
-      - else, 2D tensor with shape `(batch_size, units)`.
+          each with shape `(batch_size, state_size)`, where `state_size` could
+          be a high dimension tensor shape.
+      - if `return_sequences`: N-D tensor with shape
+          `(batch_size, timesteps, output_size)`, where `output_size` could
+          be a high dimension tensor shape.
+      - else, N-D tensor with shape `(batch_size, output_size)`, where
+          `output_size` could be a high dimension tensor shape.
 
   # Masking
       This layer supports masking for input data with a variable number
@@ -412,7 +426,7 @@
     self.unroll = unroll
 
     self.supports_masking = True
-    self.input_spec = [InputSpec(ndim=3)]
+    self.input_spec = [None]  # The input shape is not yet known; it is at least rank 3.
     self.state_spec = None
     self._states = None
     self.constants_spec = None
@@ -421,11 +435,8 @@
   @property
   def states(self):
     if self._states is None:
-      if isinstance(self.cell.state_size, numbers.Integral):
-        num_states = 1
-      else:
-        num_states = len(self.cell.state_size)
-      return [None for _ in range(num_states)]
+      state = nest.map_structure(lambda _: None, self.cell.state_size)
+      return state if nest.is_sequence(self.cell.state_size) else [state]
     return self._states
 
   @states.setter
@@ -437,19 +448,27 @@
     if isinstance(input_shape, list):
       input_shape = input_shape[0]
 
-    if hasattr(self.cell.state_size, '__len__'):
+    if _is_multiple_state(self.cell.state_size):
       state_size = self.cell.state_size
     else:
       state_size = [self.cell.state_size]
-    output_dim = state_size[0]
+
+    if hasattr(self.cell, 'output_size'):
+      output_dim = tensor_shape.as_shape(self.cell.output_size).as_list()
+    else:
+      # Note that state_size[0] could be a tensor_shape or int.
+      output_dim = tensor_shape.as_shape(state_size[0]).as_list()
 
     if self.return_sequences:
-      output_shape = (input_shape[0], input_shape[1], output_dim)
+      output_shape = tuple([input_shape[0], input_shape[1]] + output_dim)
     else:
-      output_shape = (input_shape[0], output_dim)
+      output_shape = tuple([input_shape[0]] + output_dim)
 
     if self.return_state:
-      state_shape = [(input_shape[0], dim) for dim in state_size]
+      state_shape = [
+          tuple([input_shape[0]] + tensor_shape.as_shape(dim).as_list())
+          for dim in state_size
+      ]
       return [output_shape] + state_shape
     else:
       return output_shape
@@ -477,49 +496,83 @@
       input_shape = input_shape[0]
 
     batch_size = input_shape[0] if self.stateful else None
-    input_dim = input_shape[-1]
-    self.input_spec[0] = InputSpec(shape=(batch_size, None, input_dim))
+    input_dim = input_shape[2:]
+    self.input_spec[0] = InputSpec(shape=(batch_size, None) + input_dim)
 
     # allow cell (if layer) to build before we set or validate state_spec
     if isinstance(self.cell, Layer):
-      step_input_shape = (input_shape[0],) + input_shape[2:]
+      step_input_shape = (input_shape[0],) + input_dim
       if constants_shape is not None:
         self.cell.build([step_input_shape] + constants_shape)
       else:
         self.cell.build(step_input_shape)
 
     # set or validate state_spec
-    if hasattr(self.cell.state_size, '__len__'):
+    if _is_multiple_state(self.cell.state_size):
       state_size = list(self.cell.state_size)
     else:
       state_size = [self.cell.state_size]
 
     if self.state_spec is not None:
       # initial_state was passed in call, check compatibility
-      if [spec.shape[-1] for spec in self.state_spec] != state_size:
-        raise ValueError(
-            'An `initial_state` was passed that is not compatible with '
-            '`cell.state_size`. Received `state_spec`={}; '
-            'however `cell.state_size` is '
-            '{}'.format(self.state_spec, self.cell.state_size))
+      self._validate_state_spec(state_size, self.state_spec)
     else:
-      self.state_spec = [InputSpec(shape=(None, dim)) for dim in state_size]
+      self.state_spec = [
+          InputSpec(shape=[None] + tensor_shape.as_shape(dim).as_list())
+          for dim in state_size
+      ]
     if self.stateful:
       self.reset_states()
     self.built = True
 
-  def get_initial_state(self, inputs):
-    # build an all-zero tensor of shape (samples, output_dim)
-    initial_state = array_ops.zeros_like(inputs)
-    # shape of initial_state = (samples, timesteps, input_dim)
-    initial_state = math_ops.reduce_sum(initial_state, axis=(1, 2))
-    # shape of initial_state = (samples,)
-    initial_state = array_ops.expand_dims(initial_state, axis=-1)
-    # shape of initial_state = (samples, 1)
-    if hasattr(self.cell.state_size, '__len__'):
-      return [K.tile(initial_state, [1, dim]) for dim in self.cell.state_size]
+  @staticmethod
+  def _validate_state_spec(cell_state_sizes, init_state_specs):
+    """Validate the state spec between the initial_state and the state_size.
+
+    Args:
+      cell_state_sizes: list, the `state_size` attribute from the cell.
+      init_state_specs: list, the `state_spec` from the initial_state that is
+        passed in call()
+
+    Raises:
+      ValueError: When initial state spec is not compatible with the state size.
+    """
+    validation_error = ValueError(
+        'An `initial_state` was passed that is not compatible with '
+        '`cell.state_size`. Received `state_spec`={}; '
+        'however `cell.state_size` is '
+        '{}'.format(init_state_specs, cell_state_sizes))
+    if len(cell_state_sizes) == len(init_state_specs):
+      for i in range(len(cell_state_sizes)):
+        if not tensor_shape.TensorShape(
+            # Ignore the first axis for init_state which is for batch
+            init_state_specs[i].shape[1:]).is_compatible_with(
+                tensor_shape.TensorShape(cell_state_sizes[i])):
+          raise validation_error
     else:
-      return [K.tile(initial_state, [1, self.cell.state_size])]
+      raise validation_error
+
+  def get_initial_state(self, inputs):
+    # build an all-zero tensor of shape (batch, cell.state_size)
+    initial_state = array_ops.zeros_like(inputs)
+    # shape of initial_state = (batch, timesteps, ...)
+    initial_state = math_ops.reduce_sum(
+        initial_state, axis=list(range(1, len(inputs.shape))))
+    # shape of initial_state = (batch,)
+    if _is_multiple_state(self.cell.state_size):
+      states = []
+      for dims in self.cell.state_size:
+        state = initial_state
+        flat_dims = tensor_shape.as_shape(dims).as_list()
+        # reshape the state to (batch, 1, 1, ....) and then expand each state.
+        state = array_ops.reshape(state, [-1,] + [1] * len(flat_dims))
+        states.append(K.tile(state, [1] + flat_dims))
+      return states
+    else:
+      flat_dims = tensor_shape.as_shape(self.cell.state_size).as_list()
+      initial_state = array_ops.reshape(
+          initial_state, [-1] + [1] * len(flat_dims))
+      return [K.tile(initial_state, [1] + flat_dims)]
 
   def __call__(self, inputs, initial_state=None, constants=None, **kwargs):
     inputs, initial_state, constants = _standardize_args(inputs,
@@ -617,6 +670,8 @@
     if generic_utils.has_arg(self.cell.call, 'training'):
       kwargs['training'] = training
 
+    # TF RNN cells expect a single tensor as state instead of a list-wrapped tensor.
+    is_tf_rnn_cell = getattr(self.cell, '_is_tf_rnn_cell', None) is not None
     if constants:
       if not generic_utils.has_arg(self.cell.call, 'constants'):
         raise ValueError('RNN cell does not support constants')
@@ -624,11 +679,21 @@
       def step(inputs, states):
         constants = states[-self._num_constants:]  # pylint: disable=invalid-unary-operand-type
         states = states[:-self._num_constants]  # pylint: disable=invalid-unary-operand-type
-        return self.cell.call(inputs, states, constants=constants, **kwargs)
+
+        states = states[0] if len(states) == 1 and is_tf_rnn_cell else states
+        output, new_states = self.cell.call(
+            inputs, states, constants=constants, **kwargs)
+        if not nest.is_sequence(new_states):
+          new_states = [new_states]
+        return output, new_states
     else:
 
       def step(inputs, states):
-        return self.cell.call(inputs, states, **kwargs)
+        states = states[0] if len(states) == 1 and is_tf_rnn_cell else states
+        output, new_states = self.cell.call(inputs, states, **kwargs)
+        if not nest.is_sequence(new_states):
+          new_states = [new_states]
+        return output, new_states
 
     last_output, outputs, states = K.rnn(
         step,
@@ -682,19 +747,26 @@
                        '`batch_shape` argument to your Input layer.')
     # initialize state if None
     if self.states[0] is None:
-      if hasattr(self.cell.state_size, '__len__'):
+      if _is_multiple_state(self.cell.state_size):
         self.states = [
-            K.zeros((batch_size, dim)) for dim in self.cell.state_size
+            K.zeros([batch_size] + tensor_shape.as_shape(dim).as_list())
+            for dim in self.cell.state_size
         ]
       else:
-        self.states = [K.zeros((batch_size, self.cell.state_size))]
+        self.states = [
+            K.zeros([batch_size] +
+                    tensor_shape.as_shape(self.cell.state_size).as_list())
+        ]
     elif states is None:
-      if hasattr(self.cell.state_size, '__len__'):
+      if _is_multiple_state(self.cell.state_size):
         for state, dim in zip(self.states, self.cell.state_size):
-          K.set_value(state, np.zeros((batch_size, dim)))
+          K.set_value(state,
+                      np.zeros([batch_size] +
+                               tensor_shape.as_shape(dim).as_list()))
       else:
-        K.set_value(self.states[0], np.zeros((batch_size,
-                                              self.cell.state_size)))
+        K.set_value(self.states[0], np.zeros(
+            [batch_size] +
+            tensor_shape.as_shape(self.cell.state_size).as_list()))
     else:
       if not isinstance(states, (list, tuple)):
         states = [states]
@@ -704,11 +776,12 @@
                          'but it received ' + str(len(states)) +
                          ' state values. Input received: ' + str(states))
       for index, (value, state) in enumerate(zip(states, self.states)):
-        if hasattr(self.cell.state_size, '__len__'):
+        if _is_multiple_state(self.cell.state_size):
           dim = self.cell.state_size[index]
         else:
           dim = self.cell.state_size
-        if value.shape != (batch_size, dim):
+        if value.shape != tuple([batch_size] +
+                                tensor_shape.as_shape(dim).as_list()):
           raise ValueError(
               'State ' + str(index) + ' is incompatible with layer ' +
               self.name + ': expected shape=' + str(
@@ -846,6 +919,7 @@
     self.dropout = min(1., max(0., dropout))
     self.recurrent_dropout = min(1., max(0., recurrent_dropout))
     self.state_size = self.units
+    self.output_size = self.units
     self._dropout_mask = None
     self._recurrent_dropout_mask = None
 
@@ -1249,6 +1323,7 @@
     self.implementation = implementation
     self.reset_after = reset_after
     self.state_size = self.units
+    self.output_size = self.units
     self._dropout_mask = None
     self._recurrent_dropout_mask = None
 
@@ -1794,6 +1869,7 @@
     self.recurrent_dropout = min(1., max(0., recurrent_dropout))
     self.implementation = implementation
     self.state_size = (self.units, self.units)
+    self.output_size = self.units
     self._dropout_mask = None
     self._recurrent_dropout_mask = None
 
@@ -2272,3 +2348,9 @@
   constants = to_list_or_none(constants)
 
   return inputs, initial_state, constants
+
+
+def _is_multiple_state(state_size):
+  """Check whether the state_size contains multiple states."""
+  return (hasattr(state_size, '__len__') and
+          not isinstance(state_size, tensor_shape.TensorShape))
diff --git a/tensorflow/python/keras/layers/recurrent_test.py b/tensorflow/python/keras/layers/recurrent_test.py
index fefb928..13bd070 100644
--- a/tensorflow/python/keras/layers/recurrent_test.py
+++ b/tensorflow/python/keras/layers/recurrent_test.py
@@ -24,8 +24,10 @@
 import numpy as np
 
 from tensorflow.python import keras
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import special_math_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.platform import test
 from tensorflow.python.training.checkpointable import util as checkpointable_util
@@ -573,6 +575,163 @@
       for v in model.variables:
         self.assertIn(v, checkpointed_objects)
 
+  def test_high_dimension_RNN(self):
+    with self.test_session():
+      # Basic test case.
+      unit_a = 10
+      unit_b = 20
+      input_a = 5
+      input_b = 10
+      batch = 32
+      time_step = 4
+
+      cell = Minimal2DRNNCell(unit_a, unit_b)
+      x = keras.Input((None, input_a, input_b))
+      layer = keras.layers.RNN(cell)
+      y = layer(x)
+
+      self.assertEqual(cell.state_size.as_list(), [unit_a, unit_b])
+      init_state = layer.get_initial_state(x)
+      self.assertEqual(len(init_state), 1)
+      self.assertEqual(init_state[0].get_shape().as_list(),
+                       [None, unit_a, unit_b])
+
+      model = keras.models.Model(x, y)
+      model.compile(optimizer='rmsprop', loss='mse')
+      model.train_on_batch(
+          np.zeros((batch, time_step, input_a, input_b)),
+          np.zeros((batch, unit_a, unit_b)))
+      self.assertEqual(model.output_shape, (None, unit_a, unit_b))
+
+      # Test stacking.
+      cells = [
+          Minimal2DRNNCell(unit_a, unit_b),
+          Minimal2DRNNCell(unit_a * 2, unit_b * 2),
+          Minimal2DRNNCell(unit_a * 4, unit_b * 4)
+      ]
+      layer = keras.layers.RNN(cells)
+      y = layer(x)
+      model = keras.models.Model(x, y)
+      model.compile(optimizer='rmsprop', loss='mse')
+      model.train_on_batch(
+          np.zeros((batch, time_step, input_a, input_b)),
+          np.zeros((batch, unit_a * 4, unit_b * 4)))
+      self.assertEqual(model.output_shape, (None, unit_a * 4, unit_b * 4))
+
+  def test_high_dimension_RNN_with_init_state(self):
+    unit_a = 10
+    unit_b = 20
+    input_a = 5
+    input_b = 10
+    batch = 32
+    time_step = 4
+
+    with self.test_session():
+      # Basic test case.
+      cell = Minimal2DRNNCell(unit_a, unit_b)
+      x = keras.Input((None, input_a, input_b))
+      s = keras.Input((unit_a, unit_b))
+      layer = keras.layers.RNN(cell)
+      y = layer(x, initial_state=s)
+
+      model = keras.models.Model([x, s], y)
+      model.compile(optimizer='rmsprop', loss='mse')
+      model.train_on_batch([
+          np.zeros((batch, time_step, input_a, input_b)),
+          np.zeros((batch, unit_a, unit_b))
+      ], np.zeros((batch, unit_a, unit_b)))
+      self.assertEqual(model.output_shape, (None, unit_a, unit_b))
+
+    with self.test_session():
+      # Bad init state shape.
+      bad_shape_a = unit_a * 2
+      bad_shape_b = unit_b * 2
+      cell = Minimal2DRNNCell(unit_a, unit_b)
+      x = keras.Input((None, input_a, input_b))
+      s = keras.Input((bad_shape_a, bad_shape_b))
+      layer = keras.layers.RNN(cell)
+      with self.assertRaisesWithPredicateMatch(ValueError,
+                                               'however `cell.state_size` is'):
+        layer(x, initial_state=s)
+
+  def test_inconsistent_output_state_size(self):
+    with self.test_session():
+      batch = 32
+      time_step = 4
+      state_size = 5
+      input_size = 6
+      cell = PlusOneRNNCell(state_size)
+      x = keras.Input((None, input_size))
+      layer = keras.layers.RNN(cell)
+      y = layer(x)
+
+      self.assertEqual(cell.state_size, state_size)
+      init_state = layer.get_initial_state(x)
+      self.assertEqual(len(init_state), 1)
+      self.assertEqual(init_state[0].get_shape().as_list(),
+                       [None, state_size])
+
+      model = keras.models.Model(x, y)
+      model.compile(optimizer='rmsprop', loss='mse')
+      model.train_on_batch(
+          np.zeros((batch, time_step, input_size)),
+          np.zeros((batch, input_size)))
+      self.assertEqual(model.output_shape, (None, input_size))
+
+
+class Minimal2DRNNCell(keras.layers.Layer):
+  """The minimal 2D RNN cell is a simple combination of two 1-D RNN cells.
+
+  Both internal state and output have 2 dimensions and are orthogonal
+  to each other.
+  """
+
+  def __init__(self, unit_a, unit_b, **kwargs):
+    self.unit_a = unit_a
+    self.unit_b = unit_b
+    self.state_size = tensor_shape.as_shape([unit_a, unit_b])
+    self.output_size = tensor_shape.as_shape([unit_a, unit_b])
+    super(Minimal2DRNNCell, self).__init__(**kwargs)
+
+  def build(self, input_shape):
+    input_a = input_shape[-2]
+    input_b = input_shape[-1]
+    self.kernel = self.add_weight(
+        shape=(input_a, input_b, self.unit_a, self.unit_b),
+        initializer='uniform',
+        name='kernel')
+    self.recurring_kernel = self.add_weight(
+        shape=(self.unit_a, self.unit_b, self.unit_a, self.unit_b),
+        initializer='uniform',
+        name='recurring_kernel')
+    self.bias = self.add_weight(
+        shape=(self.unit_a, self.unit_b), initializer='uniform', name='bias')
+    self.built = True
+
+  def call(self, inputs, states):
+    prev_output = states[0]
+    h = special_math_ops.einsum('bij,ijkl->bkl', inputs, self.kernel)
+    h += array_ops.expand_dims(self.bias, axis=0)
+    output = h + special_math_ops.einsum('bij,ijkl->bkl', prev_output,
+                                         self.recurring_kernel)
+    return output, [output]
+
+
+class PlusOneRNNCell(keras.layers.Layer):
+  """Add one to the input and state.
+
+  This cell is used for testing state_size and output_size."""
+
+  def __init__(self, num_unit, **kwargs):
+    self.state_size = num_unit
+    super(PlusOneRNNCell, self).__init__(**kwargs)
+
+  def build(self, input_shape):
+    self.output_size = input_shape[-1]
+
+  def call(self, inputs, states):
+    return inputs + 1, [states[0] + 1]
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/layers/wrappers.py b/tensorflow/python/keras/layers/wrappers.py
index f0c1e76..9b8d5fc 100644
--- a/tensorflow/python/keras/layers/wrappers.py
+++ b/tensorflow/python/keras/layers/wrappers.py
@@ -331,7 +331,7 @@
       inner_mask_shape = self._get_shape_tuple((-1,), mask, 2)
       inner_mask = K.reshape(inner_mask, inner_mask_shape)
     input_uid = generic_utils.object_list_uid(inputs)
-    inner_inputs = self._input_map[input_uid]
+    inner_inputs = self._input_map.get(input_uid, inputs)
     output_mask = self.layer.compute_mask(inner_inputs, inner_mask)
     if output_mask is None:
       if mask is None:
diff --git a/tensorflow/python/keras/metrics.py b/tensorflow/python/keras/metrics.py
index b18f126..9b87170 100644
--- a/tensorflow/python/keras/metrics.py
+++ b/tensorflow/python/keras/metrics.py
@@ -55,7 +55,7 @@
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.ops import weights_broadcast_ops
-from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util.tf_export import tf_export
 
@@ -68,25 +68,19 @@
 
 
 def update_state_wrapper(update_state_fn):
-  """Decorator to wrap metric `update_state()` with `defun()`, `add_update()`.
+  """Decorator to wrap metric `update_state()` with `add_update()`.
 
   Args:
     update_state_fn: function that accumulates metric statistics.
 
   Returns:
-    If eager execution is enabled, returns None.
-    If graph execution is enabled, returns an update op. This op should be
-      executed to update the metric state with the given inputs.
+    Decorated function that wraps `update_state_fn()` with `add_update()`.
   """
 
   def decorated(metric_obj, *args, **kwargs):
-    """Decorated function with `defun()` and `add_update()`."""
+    """Decorated function with `add_update()`."""
 
-    # Converting update_state_fn() into a graph function, so that
-    # we can return a single op that performs all of the variable updates.
-    # Assigning to a different method name to avoid reference cycle.
-    defuned_update_state_fn = function.defun(update_state_fn)
-    update_op = defuned_update_state_fn(*args, **kwargs)
+    update_op = update_state_fn(*args, **kwargs)
     if update_op is not None:  # update_op will be None in eager execution.
       metric_obj.add_update(update_op, inputs=True)
       check_is_tensor_or_operation(
@@ -111,12 +105,13 @@
     result_fn: function that computes the metric result.
 
   Returns:
-    The metric result tensor.
+    Decorated function that wraps `result_fn()` in distribution strategy
+    `merge_call()`.
   """
 
   def decorated(metric_obj, *args):
     """Decorated function with merge_call."""
-    tower_context = distribute_lib.get_tower_context()
+    tower_context = distribution_strategy_context.get_tower_context()
     if tower_context is None:  # if in cross tower context already
       result_t = result_fn(*args)
     else:
@@ -255,6 +250,28 @@
     print('Final result: ', sess.run(m.result()))
   ```
 
+  Usage with tf.keras API:
+
+  ```python
+  model = tf.keras.Sequential()
+  model.add(tf.keras.layers.Dense(64, activation='relu'))
+  model.add(tf.keras.layers.Dense(64, activation='relu'))
+  model.add(tf.keras.layers.Dense(10, activation='softmax'))
+
+  model.compile(optimizer=tf.train.RMSPropOptimizer(0.01),
+                loss=tf.keras.losses.categorical_crossentropy,
+                metrics=[tf.keras.metrics.CategoricalAccuracy()])
+
+  data = np.random.random((1000, 32))
+  labels = np.random.random((1000, 10))
+
+  dataset = tf.data.Dataset.from_tensor_slices((data, labels))
+  dataset = dataset.batch(32)
+  dataset = dataset.repeat()
+
+  model.fit(dataset, epochs=10, steps_per_epoch=30)
+  ```
+
   To be implemented by subclasses:
   * `__init__()`: All state variables should be created in this method by
     calling `self.add_weight()` like: `self.var = self.add_weight(...)`
@@ -267,7 +284,7 @@
 
   ```
   class BinaryTruePositives(Metric):
-    def __init__(self, name='binary-true-positives', dtype=None):
+    def __init__(self, name='binary_true_positives', dtype=None):
       super(BinaryTruePositives, self).__init__(name=name, dtype=dtype)
       self.true_positives = self.add_weight(
           'true_positives', initializer=init_ops.zeros_initializer)
@@ -299,9 +316,14 @@
     self._dtype = K.floatx() if dtype is None else dtypes.as_dtype(dtype).name
 
   def __new__(cls, *args, **kwargs):
-    obj = super(Metric, cls).__new__(cls, *args, **kwargs)
+    obj = super(Metric, cls).__new__(cls)
+    # TODO(psv): Fix reference cycle issue here.
+
+    # Converting update_state_fn() into a graph function, so that
+    # we can return a single op that performs all of the variable updates.
+    defuned_update_state_fn = function.defun(obj.update_state)
     obj.update_state = types.MethodType(
-        update_state_wrapper(obj.update_state), obj)
+        update_state_wrapper(defuned_update_state_fn), obj)
     obj.result = types.MethodType(result_wrapper(obj.result), obj)
     return obj
 
@@ -359,6 +381,12 @@
     """
     NotImplementedError('Must be implemented in subclasses.')
 
+  @classmethod
+  def from_config(cls, config):
+    if 'trainable' in config:
+      config.pop('trainable')
+    return cls(**config)
+
   ### For use by subclasses ###
   def add_weight(self,
                  name,
@@ -502,7 +530,7 @@
   Use `sample_weight` of 0 to mask values.
   """
 
-  def __init__(self, name='binary-accuracy', dtype=None, threshold=0.5):
+  def __init__(self, name='binary_accuracy', dtype=None, threshold=0.5):
     """Creates a `BinaryAccuracy` instance.
 
     Args:
@@ -515,6 +543,29 @@
         binary_accuracy, name, dtype=dtype, threshold=threshold)
 
 
+class CategoricalAccuracy(MeanMetricWrapper):
+  """Calculates how often predictions matches labels.
+
+  This metric creates two local variables, `total` and `count` that are used to
+  compute the frequency with which `y_pred` matches `y_true`. This frequency is
+  ultimately returned as `categorical accuracy`: an idempotent operation that
+  simply divides `total` by `count`.
+
+  If `sample_weight` is `None`, weights default to 1.
+  Use `sample_weight` of 0 to mask values.
+  """
+
+  def __init__(self, name='categorical_accuracy', dtype=None):
+    """Creates a `CategoricalAccuracy` instance.
+
+    Args:
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+    """
+    super(CategoricalAccuracy, self).__init__(
+        categorical_accuracy, name, dtype=dtype)
+
+
 @tf_export('keras.metrics.binary_accuracy')
 def binary_accuracy(y_true, y_pred, threshold=0.5):
   threshold = math_ops.cast(threshold, y_pred.dtype)
@@ -578,8 +629,7 @@
 @tf_export('keras.metrics.get')
 def get(identifier):
   if isinstance(identifier, dict):
-    config = {'class_name': str(identifier), 'config': {}}
-    return deserialize(config)
+    return deserialize(identifier)
   elif isinstance(identifier, six.string_types):
     return deserialize(str(identifier))
   elif callable(identifier):
diff --git a/tensorflow/python/keras/metrics_test.py b/tensorflow/python/keras/metrics_test.py
index 49f3ae4..2ac7421 100644
--- a/tensorflow/python/keras/metrics_test.py
+++ b/tensorflow/python/keras/metrics_test.py
@@ -363,6 +363,30 @@
     self.assertAlmostEqual(result, 0.5, 2)
 
   @test_util.run_in_graph_and_eager_modes
+  def test_categorical_accuracy(self):
+    acc_obj = metrics.CategoricalAccuracy(name='my acc')
+
+    # check config
+    self.assertEqual(acc_obj.name, 'my acc')
+    self.assertTrue(acc_obj.stateful)
+    self.assertEqual(len(acc_obj.variables), 2)
+    self.assertEqual(acc_obj.dtype, dtypes.float32)
+    self.evaluate(variables.global_variables_initializer())
+
+    # verify that correct value is returned
+    update_op = acc_obj.update_state([[0, 0, 1], [0, 1, 0]],
+                                     [[0.1, 0.1, 0.8], [0.05, 0.95, 0]])
+    self.evaluate(update_op)
+    result = self.evaluate(acc_obj.result())
+    self.assertEqual(result, 1)  # 2/2
+
+    # check with sample_weight
+    result_t = acc_obj([[0, 0, 1], [0, 1, 0]],
+                       [[0.1, 0.1, 0.8], [0.05, 0, 0.95]], [[0.5], [0.2]])
+    result = self.evaluate(result_t)
+    self.assertAlmostEqual(result, 0.93, 2)  # 2.5/2.7
+
+  @test_util.run_in_graph_and_eager_modes
   def test_invalid_result(self):
 
     class InvalidResult(metrics.Metric):
diff --git a/tensorflow/python/keras/model_subclassing_test.py b/tensorflow/python/keras/model_subclassing_test.py
index 6cbea45..71c1987 100644
--- a/tensorflow/python/keras/model_subclassing_test.py
+++ b/tensorflow/python/keras/model_subclassing_test.py
@@ -425,9 +425,10 @@
     model = SimpleTestModel(num_classes=num_classes,
                             use_dp=True,
                             use_bn=True)
-    model.compile(loss='mse',
-                  optimizer=RMSPropOptimizer(learning_rate=0.001),
-                  metrics=['acc'])
+    model.compile(
+        loss='mse',
+        optimizer=RMSPropOptimizer(learning_rate=0.001),
+        metrics=['acc', keras.metrics.CategoricalAccuracy()])
 
     x = np.ones((num_samples, input_dim))
     y = np.zeros((num_samples, num_classes))
diff --git a/tensorflow/python/keras/optimizers.py b/tensorflow/python/keras/optimizers.py
index 4f97442..f339a7e 100644
--- a/tensorflow/python/keras/optimizers.py
+++ b/tensorflow/python/keras/optimizers.py
@@ -28,7 +28,7 @@
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
-from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.training import optimizer as tf_optimizer_module
 from tensorflow.python.training import training_util
 from tensorflow.python.training.checkpointable import base as checkpointable
@@ -705,7 +705,7 @@
     return self.optimizer.compute_gradients(loss, params)
 
   def get_updates(self, loss, params):
-    if distribute_lib.has_distribution_strategy():
+    if distribution_strategy_context.has_distribution_strategy():
       self.updates = []
 
       if not params:
diff --git a/tensorflow/python/keras/preprocessing/__init__.py b/tensorflow/python/keras/preprocessing/__init__.py
index e6704ee..2f08f88 100644
--- a/tensorflow/python/keras/preprocessing/__init__.py
+++ b/tensorflow/python/keras/preprocessing/__init__.py
@@ -13,10 +13,18 @@
 # limitations under the License.
 # ==============================================================================
 """Keras data preprocessing utils."""
+# pylint: disable=g-import-not-at-top
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import keras_preprocessing
+
+from tensorflow.python.keras import backend
+from tensorflow.python.keras import utils
+
+keras_preprocessing.set_keras_submodules(backend=backend, utils=utils)
+
 from tensorflow.python.keras.preprocessing import image
 from tensorflow.python.keras.preprocessing import sequence
 from tensorflow.python.keras.preprocessing import text
diff --git a/tensorflow/python/keras/preprocessing/image.py b/tensorflow/python/keras/preprocessing/image.py
index aa425df..ba22738 100644
--- a/tensorflow/python/keras/preprocessing/image.py
+++ b/tensorflow/python/keras/preprocessing/image.py
@@ -12,1588 +12,58 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+# pylint: disable=invalid-name
 # pylint: disable=g-import-not-at-top
-"""Fairly basic set of tools for real-time data augmentation on image data.
-
-Can easily be extended to include new transformations,
-new preprocessing methods, etc...
+"""Set of tools for real-time data augmentation on image data.
 """
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from functools import partial
-import multiprocessing.pool
-import os
-import re
-import threading
+from keras_preprocessing import image
+try:
+  from scipy import linalg  # pylint: disable=unused-import
+  from scipy import ndimage  # pylint: disable=unused-import
+except ImportError:
+  pass
 
-import numpy as np
-from tensorflow.python.keras import backend as K
-from tensorflow.python.keras.utils.data_utils import Sequence
-from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util.tf_export import tf_export
 
-try:
-  from scipy import linalg
-  import scipy.ndimage as ndi
-except ImportError:
-  linalg = None
-  ndi = None
+random_rotation = image.random_rotation
+random_shift = image.random_shift
+random_shear = image.random_shear
+random_zoom = image.random_zoom
+apply_channel_shift = image.apply_channel_shift
+random_channel_shift = image.random_channel_shift
+apply_brightness_shift = image.apply_brightness_shift
+random_brightness = image.random_brightness
+apply_affine_transform = image.apply_affine_transform
+array_to_img = image.array_to_img
+img_to_array = image.img_to_array
+save_img = image.save_img
+load_img = image.load_img
+ImageDataGenerator = image.ImageDataGenerator
+Iterator = image.Iterator
+NumpyArrayIterator = image.NumpyArrayIterator
+DirectoryIterator = image.DirectoryIterator
 
-
-try:
-  from PIL import ImageEnhance
-  from PIL import Image as pil_image
-except ImportError:
-  pil_image = None
-
-if pil_image is not None:
-  _PIL_INTERPOLATION_METHODS = {
-      'nearest': pil_image.NEAREST,
-      'bilinear': pil_image.BILINEAR,
-      'bicubic': pil_image.BICUBIC,
-  }
-  # These methods were only introduced in version 3.4.0 (2016).
-  if hasattr(pil_image, 'HAMMING'):
-    _PIL_INTERPOLATION_METHODS['hamming'] = pil_image.HAMMING
-  if hasattr(pil_image, 'BOX'):
-    _PIL_INTERPOLATION_METHODS['box'] = pil_image.BOX
-  # This method is new in version 1.1.3 (2013).
-  if hasattr(pil_image, 'LANCZOS'):
-    _PIL_INTERPOLATION_METHODS['lanczos'] = pil_image.LANCZOS
-
-
-@tf_export('keras.preprocessing.image.random_rotation')
-def random_rotation(x,
-                    rg,
-                    row_axis=1,
-                    col_axis=2,
-                    channel_axis=0,
-                    fill_mode='nearest',
-                    cval=0.):
-  """Performs a random rotation of a Numpy image tensor.
-
-  Arguments:
-      x: Input tensor. Must be 3D.
-      rg: Rotation range, in degrees.
-      row_axis: Index of axis for rows in the input tensor.
-      col_axis: Index of axis for columns in the input tensor.
-      channel_axis: Index of axis for channels in the input tensor.
-      fill_mode: Points outside the boundaries of the input
-          are filled according to the given mode
-          (one of `{'constant', 'nearest', 'reflect', 'wrap'}`).
-      cval: Value used for points outside the boundaries
-          of the input if `mode='constant'`.
-
-  Returns:
-      Rotated Numpy image tensor.
-  """
-  theta = np.deg2rad(np.random.uniform(-rg, rg))
-  rotation_matrix = np.array([[np.cos(theta), -np.sin(theta), 0],
-                              [np.sin(theta), np.cos(theta), 0], [0, 0, 1]])
-
-  h, w = x.shape[row_axis], x.shape[col_axis]
-  transform_matrix = transform_matrix_offset_center(rotation_matrix, h, w)
-  x = apply_transform(x, transform_matrix, channel_axis, fill_mode, cval)
-  return x
-
-
-@tf_export('keras.preprocessing.image.random_shift')
-def random_shift(x,
-                 wrg,
-                 hrg,
-                 row_axis=1,
-                 col_axis=2,
-                 channel_axis=0,
-                 fill_mode='nearest',
-                 cval=0.):
-  """Performs a random spatial shift of a Numpy image tensor.
-
-  Arguments:
-      x: Input tensor. Must be 3D.
-      wrg: Width shift range, as a float fraction of the width.
-      hrg: Height shift range, as a float fraction of the height.
-      row_axis: Index of axis for rows in the input tensor.
-      col_axis: Index of axis for columns in the input tensor.
-      channel_axis: Index of axis for channels in the input tensor.
-      fill_mode: Points outside the boundaries of the input
-          are filled according to the given mode
-          (one of `{'constant', 'nearest', 'reflect', 'wrap'}`).
-      cval: Value used for points outside the boundaries
-          of the input if `mode='constant'`.
-
-  Returns:
-      Shifted Numpy image tensor.
-  """
-  h, w = x.shape[row_axis], x.shape[col_axis]
-  tx = np.random.uniform(-hrg, hrg) * h
-  ty = np.random.uniform(-wrg, wrg) * w
-  translation_matrix = np.array([[1, 0, tx], [0, 1, ty], [0, 0, 1]])
-
-  transform_matrix = translation_matrix  # no need to do offset
-  x = apply_transform(x, transform_matrix, channel_axis, fill_mode, cval)
-  return x
-
-
-@tf_export('keras.preprocessing.image.random_shear')
-def random_shear(x,
-                 intensity,
-                 row_axis=1,
-                 col_axis=2,
-                 channel_axis=0,
-                 fill_mode='nearest',
-                 cval=0.):
-  """Performs a random spatial shear of a Numpy image tensor.
-
-  Arguments:
-      x: Input tensor. Must be 3D.
-      intensity: Transformation intensity in degrees.
-      row_axis: Index of axis for rows in the input tensor.
-      col_axis: Index of axis for columns in the input tensor.
-      channel_axis: Index of axis for channels in the input tensor.
-      fill_mode: Points outside the boundaries of the input
-          are filled according to the given mode
-          (one of `{'constant', 'nearest', 'reflect', 'wrap'}`).
-      cval: Value used for points outside the boundaries
-          of the input if `mode='constant'`.
-
-  Returns:
-      Sheared Numpy image tensor.
-  """
-  shear = np.deg2rad(np.random.uniform(-intensity, intensity))
-  shear_matrix = np.array([[1, -np.sin(shear), 0], [0, np.cos(shear), 0],
-                           [0, 0, 1]])
-
-  h, w = x.shape[row_axis], x.shape[col_axis]
-  transform_matrix = transform_matrix_offset_center(shear_matrix, h, w)
-  x = apply_transform(x, transform_matrix, channel_axis, fill_mode, cval)
-  return x
-
-
-@tf_export('keras.preprocessing.image.random_zoom')
-def random_zoom(x,
-                zoom_range,
-                row_axis=1,
-                col_axis=2,
-                channel_axis=0,
-                fill_mode='nearest',
-                cval=0.):
-  """Performs a random spatial zoom of a Numpy image tensor.
-
-  Arguments:
-      x: Input tensor. Must be 3D.
-      zoom_range: Tuple of floats; zoom range for width and height.
-      row_axis: Index of axis for rows in the input tensor.
-      col_axis: Index of axis for columns in the input tensor.
-      channel_axis: Index of axis for channels in the input tensor.
-      fill_mode: Points outside the boundaries of the input
-          are filled according to the given mode
-          (one of `{'constant', 'nearest', 'reflect', 'wrap'}`).
-      cval: Value used for points outside the boundaries
-          of the input if `mode='constant'`.
-
-  Returns:
-      Zoomed Numpy image tensor.
-
-  Raises:
-      ValueError: if `zoom_range` isn't a tuple.
-  """
-  if len(zoom_range) != 2:
-    raise ValueError('`zoom_range` should be a tuple or list of two floats. '
-                     'Received arg: ', zoom_range)
-
-  if zoom_range[0] == 1 and zoom_range[1] == 1:
-    zx, zy = 1, 1
-  else:
-    zx, zy = np.random.uniform(zoom_range[0], zoom_range[1], 2)
-  zoom_matrix = np.array([[zx, 0, 0], [0, zy, 0], [0, 0, 1]])
-
-  h, w = x.shape[row_axis], x.shape[col_axis]
-  transform_matrix = transform_matrix_offset_center(zoom_matrix, h, w)
-  x = apply_transform(x, transform_matrix, channel_axis, fill_mode, cval)
-  return x
-
-
-@tf_export('keras.preprocessing.image.random_channel_shift')
-def random_channel_shift(x, intensity, channel_axis=0):
-  """Perform a random channel shift.
-
-  Arguments:
-      x: Input tensor. Must be 3D.
-      intensity: Transformation intensity.
-      channel_axis: Index of axis for channels in the input tensor.
-
-  Returns:
-      Numpy image tensor.
-  """
-  x = np.rollaxis(x, channel_axis, 0)
-  min_x, max_x = np.min(x), np.max(x)
-  channel_images = [
-      np.clip(x_channel + np.random.uniform(-intensity, intensity), min_x,
-              max_x) for x_channel in x
-  ]
-  x = np.stack(channel_images, axis=0)
-  x = np.rollaxis(x, 0, channel_axis + 1)
-  return x
-
-
-@tf_export('keras.preprocessing.image.random_brightness')
-def random_brightness(x, brightness_range):
-  """Performs a random adjustment of brightness of a Numpy image tensor.
-
-  Arguments:
-      x: Input tensor. Must be 3D.
-      brightness_range: Tuple of floats; range to pick a brightness value from.
-
-  Returns:
-      Brightness adjusted Numpy image tensor.
-
-  Raises:
-      ValueError: if `brightness_range` isn't a tuple.
-  """
-  if len(brightness_range) != 2:
-    raise ValueError('`brightness_range should be tuple or list of two floats. '
-                     'Received arg: ', brightness_range)
-
-  x = array_to_img(x)
-  x = ImageEnhance.Brightness(x)
-  u = np.random.uniform(brightness_range[0], brightness_range[1])
-  x = x.enhance(u)
-  x = img_to_array(x)
-  return x
-
-
-def transform_matrix_offset_center(matrix, x, y):
-  o_x = float(x) / 2 + 0.5
-  o_y = float(y) / 2 + 0.5
-  offset_matrix = np.array([[1, 0, o_x], [0, 1, o_y], [0, 0, 1]])
-  reset_matrix = np.array([[1, 0, -o_x], [0, 1, -o_y], [0, 0, 1]])
-  transform_matrix = np.dot(np.dot(offset_matrix, matrix), reset_matrix)
-  return transform_matrix
-
-
-@tf_export('keras.preprocessing.image.apply_transform')
-def apply_transform(x,
-                    transform_matrix,
-                    channel_axis=0,
-                    fill_mode='nearest',
-                    cval=0.):
-  """Apply the image transformation specified by a matrix.
-
-  Arguments:
-      x: 2D numpy array, single image.
-      transform_matrix: Numpy array specifying the geometric transformation.
-      channel_axis: Index of axis for channels in the input tensor.
-      fill_mode: Points outside the boundaries of the input
-          are filled according to the given mode
-          (one of `{'constant', 'nearest', 'reflect', 'wrap'}`).
-      cval: Value used for points outside the boundaries
-          of the input if `mode='constant'`.
-
-  Returns:
-      The transformed version of the input.
-  """
-  x = np.rollaxis(x, channel_axis, 0)
-  final_affine_matrix = transform_matrix[:2, :2]
-  final_offset = transform_matrix[:2, 2]
-  channel_images = [
-      ndi.interpolation.affine_transform(
-          x_channel,
-          final_affine_matrix,
-          final_offset,
-          order=1,
-          mode=fill_mode,
-          cval=cval) for x_channel in x
-  ]
-  x = np.stack(channel_images, axis=0)
-  x = np.rollaxis(x, 0, channel_axis + 1)
-  return x
-
-
-@tf_export('keras.preprocessing.image.flip_axis')
-def flip_axis(x, axis):
-  x = np.asarray(x).swapaxes(axis, 0)
-  x = x[::-1, ...]
-  x = x.swapaxes(0, axis)
-  return x
-
-
-@tf_export('keras.preprocessing.image.array_to_img')
-def array_to_img(x, data_format=None, scale=True):
-  """Converts a 3D Numpy array to a PIL Image instance.
-
-  Arguments:
-      x: Input Numpy array.
-      data_format: Image data format.
-      scale: Whether to rescale image values
-          to be within [0, 255].
-
-  Returns:
-      A PIL Image instance.
-
-  Raises:
-      ImportError: if PIL is not available.
-      ValueError: if invalid `x` or `data_format` is passed.
-  """
-  if pil_image is None:
-    raise ImportError('Could not import PIL.Image. '
-                      'The use of `array_to_img` requires PIL.')
-  x = np.asarray(x, dtype=K.floatx())
-  if x.ndim != 3:
-    raise ValueError('Expected image array to have rank 3 (single image). '
-                     'Got array with shape:', x.shape)
-
-  if data_format is None:
-    data_format = K.image_data_format()
-  if data_format not in {'channels_first', 'channels_last'}:
-    raise ValueError('Invalid data_format:', data_format)
-
-  # Original Numpy array x has format (height, width, channel)
-  # or (channel, height, width)
-  # but target PIL image has format (width, height, channel)
-  if data_format == 'channels_first':
-    x = x.transpose(1, 2, 0)
-  if scale:
-    x = x + max(-np.min(x), 0)  # pylint: disable=g-no-augmented-assignment
-    x_max = np.max(x)
-    if x_max != 0:
-      x /= x_max
-    x *= 255
-  if x.shape[2] == 3:
-    # RGB
-    return pil_image.fromarray(x.astype('uint8'), 'RGB')
-  elif x.shape[2] == 1:
-    # grayscale
-    return pil_image.fromarray(x[:, :, 0].astype('uint8'), 'L')
-  else:
-    raise ValueError('Unsupported channel number: ', x.shape[2])
-
-
-@tf_export('keras.preprocessing.image.img_to_array')
-def img_to_array(img, data_format=None):
-  """Converts a PIL Image instance to a Numpy array.
-
-  Arguments:
-      img: PIL Image instance.
-      data_format: Image data format.
-
-  Returns:
-      A 3D Numpy array.
-
-  Raises:
-      ValueError: if invalid `img` or `data_format` is passed.
-  """
-  if data_format is None:
-    data_format = K.image_data_format()
-  if data_format not in {'channels_first', 'channels_last'}:
-    raise ValueError('Unknown data_format: ', data_format)
-  # Numpy array x has format (height, width, channel)
-  # or (channel, height, width)
-  # but original PIL image has format (width, height, channel)
-  x = np.asarray(img, dtype=K.floatx())
-  if len(x.shape) == 3:
-    if data_format == 'channels_first':
-      x = x.transpose(2, 0, 1)
-  elif len(x.shape) == 2:
-    if data_format == 'channels_first':
-      x = x.reshape((1, x.shape[0], x.shape[1]))
-    else:
-      x = x.reshape((x.shape[0], x.shape[1], 1))
-  else:
-    raise ValueError('Unsupported image shape: ', x.shape)
-  return x
-
-
-@tf_export('keras.preprocessing.image.load_img')
-def load_img(path, grayscale=False, target_size=None, interpolation='nearest'):
-  """Loads an image into PIL format.
-
-  Arguments:
-      path: Path to image file
-      grayscale: Boolean, whether to load the image as grayscale.
-      target_size: Either `None` (default to original size)
-          or tuple of ints `(img_height, img_width)`.
-      interpolation: Interpolation method used to resample the image if the
-          target size is different from that of the loaded image.
-          Supported methods are "nearest", "bilinear", and "bicubic".
-          If PIL version 1.1.3 or newer is installed, "lanczos" is also
-          supported. If PIL version 3.4.0 or newer is installed, "box" and
-          "hamming" are also supported. By default, "nearest" is used.
-
-  Returns:
-      A PIL Image instance.
-
-  Raises:
-      ImportError: if PIL is not available.
-      ValueError: if interpolation method is not supported.
-  """
-  if pil_image is None:
-    raise ImportError('Could not import PIL.Image. '
-                      'The use of `array_to_img` requires PIL.')
-  img = pil_image.open(path)
-  if grayscale:
-    if img.mode != 'L':
-      img = img.convert('L')
-  else:
-    if img.mode != 'RGB':
-      img = img.convert('RGB')
-  if target_size is not None:
-    width_height_tuple = (target_size[1], target_size[0])
-    if img.size != width_height_tuple:
-      if interpolation not in _PIL_INTERPOLATION_METHODS:
-        raise ValueError('Invalid interpolation method {} specified. Supported '
-                         'methods are {}'.format(interpolation, ', '.join(
-                             _PIL_INTERPOLATION_METHODS.keys())))
-      resample = _PIL_INTERPOLATION_METHODS[interpolation]
-      img = img.resize(width_height_tuple, resample)
-  return img
-
-
-def list_pictures(directory, ext='jpg|jpeg|bmp|png|ppm'):
-  return [
-      os.path.join(root, f)
-      for root, _, files in os.walk(directory)
-      for f in files
-      if re.match(r'([\w]+\.(?:' + ext + '))', f)
-  ]
-
-
-@tf_export('keras.preprocessing.image.ImageDataGenerator')
-class ImageDataGenerator(object):
-  """Generates batches of tensor image data with real-time data augmentation.
-  The data will be looped over (in batches).
-
-  Arguments:
-      featurewise_center: boolean, set input mean to 0 over the dataset,
-          feature-wise.
-      samplewise_center: boolean, set each sample mean to 0.
-      featurewise_std_normalization: boolean, divide inputs by std
-          of the dataset, feature-wise.
-      samplewise_std_normalization: boolean, divide each input by its std.
-      zca_epsilon: epsilon for ZCA whitening. Default is 1e-6.
-      zca_whitening: boolean, apply ZCA whitening.
-      rotation_range: int, degree range for random rotations.
-      width_shift_range: float, 1-D array-like or int
-          float: fraction of total width, if < 1, or pixels if >= 1.
-          1-D array-like: random elements from the array.
-          int: integer number of pixels from interval
-              `(-width_shift_range, +width_shift_range)`
-          With `width_shift_range=2` possible values are integers [-1, 0, +1],
-          same as with `width_shift_range=[-1, 0, +1]`,
-          while with `width_shift_range=1.0` possible values are floats in
-          the interval [-1.0, +1.0).
-      shear_range: float, shear Intensity
-          (Shear angle in counter-clockwise direction in degrees)
-      zoom_range: float or [lower, upper], Range for random zoom.
-          If a float, `[lower, upper] = [1-zoom_range, 1+zoom_range]`.
-      channel_shift_range: float, range for random channel shifts.
-      fill_mode: One of {"constant", "nearest", "reflect" or "wrap"}.
-          Default is 'nearest'. Points outside the boundaries of the input
-          are filled according to the given mode:
-              'constant': kkkkkkkk|abcd|kkkkkkkk (cval=k)
-              'nearest':  aaaaaaaa|abcd|dddddddd
-              'reflect':  abcddcba|abcd|dcbaabcd
-              'wrap':  abcdabcd|abcd|abcdabcd
-      cval: float or int, value used for points outside the boundaries
-          when `fill_mode = "constant"`.
-      horizontal_flip: boolean, randomly flip inputs horizontally.
-      vertical_flip: boolean, randomly flip inputs vertically.
-      rescale: rescaling factor. Defaults to None. If None or 0, no rescaling
-          is applied, otherwise we multiply the data by the value provided
-          (before applying any other transformation).
-      preprocessing_function: function that will be implied on each input.
-          The function will run after the image is resized and augmented.
-          The function should take one argument:
-          one image (Numpy tensor with rank 3),
-          and should output a Numpy tensor with the same shape.
-      data_format: One of {"channels_first", "channels_last"}.
-          "channels_last" mode means that the images should have shape
-              `(samples, height, width, channels)`,
-          "channels_first" mode means that the images should have shape
-              `(samples, channels, height, width)`.
-          It defaults to the `image_data_format` value found in your
-              Keras config file at `~/.keras/keras.json`.
-          If you never set it, then it will be "channels_last".
-      validation_split: float, fraction of images reserved for validation
-          (strictly between 0 and 1).
-
-  Examples:
-      Example of using `.flow(x, y)`:
-      ```python
-      (x_train, y_train), (x_test, y_test) = cifar10.load_data()
-      y_train = np_utils.to_categorical(y_train, num_classes)
-      y_test = np_utils.to_categorical(y_test, num_classes)
-      datagen = ImageDataGenerator(
-          featurewise_center=True,
-          featurewise_std_normalization=True,
-          rotation_range=20,
-          width_shift_range=0.2,
-          height_shift_range=0.2,
-          horizontal_flip=True)
-      # compute quantities required for featurewise normalization
-      # (std, mean, and principal components if ZCA whitening is applied)
-      datagen.fit(x_train)
-      # fits the model on batches with real-time data augmentation:
-      model.fit_generator(datagen.flow(x_train, y_train, batch_size=32),
-                          steps_per_epoch=len(x_train) / 32, epochs=epochs)
-      # here's a more "manual" example
-      for e in range(epochs):
-          print('Epoch', e)
-          batches = 0
-          for x_batch, y_batch in datagen.flow(x_train, y_train, batch_size=32):
-              model.fit(x_batch, y_batch)
-              batches += 1
-              if batches >= len(x_train) / 32:
-                  # we need to break the loop by hand because
-                  # the generator loops indefinitely
-                  break
-      ```
-      Example of using `.flow_from_directory(directory)`:
-      ```python
-      train_datagen = ImageDataGenerator(
-          rescale=1./255,
-          shear_range=0.2,
-          zoom_range=0.2,
-          horizontal_flip=True)
-      test_datagen = ImageDataGenerator(rescale=1./255)
-      train_generator = train_datagen.flow_from_directory(
-          'data/train',
-          target_size=(150, 150),
-          batch_size=32,
-          class_mode='binary')
-      validation_generator = test_datagen.flow_from_directory(
-          'data/validation',
-          target_size=(150, 150),
-          batch_size=32,
-          class_mode='binary')
-      model.fit_generator(
-          train_generator,
-          steps_per_epoch=2000,
-          epochs=50,
-          validation_data=validation_generator,
-          validation_steps=800)
-      ```
-      Example of transforming images and masks together.
-      ```python
-      # we create two instances with the same arguments
-      data_gen_args = dict(featurewise_center=True,
-                           featurewise_std_normalization=True,
-                           rotation_range=90.,
-                           width_shift_range=0.1,
-                           height_shift_range=0.1,
-                           zoom_range=0.2)
-      image_datagen = ImageDataGenerator(**data_gen_args)
-      mask_datagen = ImageDataGenerator(**data_gen_args)
-      # Provide the same seed and keyword arguments to the fit and flow methods
-      seed = 1
-      image_datagen.fit(images, augment=True, seed=seed)
-      mask_datagen.fit(masks, augment=True, seed=seed)
-      image_generator = image_datagen.flow_from_directory(
-          'data/images',
-          class_mode=None,
-          seed=seed)
-      mask_generator = mask_datagen.flow_from_directory(
-          'data/masks',
-          class_mode=None,
-          seed=seed)
-      # combine generators into one which yields image and masks
-      train_generator = zip(image_generator, mask_generator)
-      model.fit_generator(
-          train_generator,
-          steps_per_epoch=2000,
-          epochs=50)
-      ```
-  """
-
-  def __init__(self,
-               featurewise_center=False,
-               samplewise_center=False,
-               featurewise_std_normalization=False,
-               samplewise_std_normalization=False,
-               zca_whitening=False,
-               zca_epsilon=1e-6,
-               rotation_range=0.,
-               width_shift_range=0.,
-               height_shift_range=0.,
-               brightness_range=None,
-               shear_range=0.,
-               zoom_range=0.,
-               channel_shift_range=0.,
-               fill_mode='nearest',
-               cval=0.,
-               horizontal_flip=False,
-               vertical_flip=False,
-               rescale=None,
-               preprocessing_function=None,
-               data_format=None,
-               validation_split=0.0):
-    if data_format is None:
-      data_format = K.image_data_format()
-    self.featurewise_center = featurewise_center
-    self.samplewise_center = samplewise_center
-    self.featurewise_std_normalization = featurewise_std_normalization
-    self.samplewise_std_normalization = samplewise_std_normalization
-    self.zca_whitening = zca_whitening
-    self.zca_epsilon = zca_epsilon
-    self.rotation_range = rotation_range
-    self.width_shift_range = width_shift_range
-    self.height_shift_range = height_shift_range
-    self.brightness_range = brightness_range
-    self.shear_range = shear_range
-    self.zoom_range = zoom_range
-    self.channel_shift_range = channel_shift_range
-    self.fill_mode = fill_mode
-    self.cval = cval
-    self.horizontal_flip = horizontal_flip
-    self.vertical_flip = vertical_flip
-    self.rescale = rescale
-    self.preprocessing_function = preprocessing_function
-
-    if data_format not in {'channels_last', 'channels_first'}:
-      raise ValueError(
-          '`data_format` should be `"channels_last"` (channel after row and '
-          'column) or `"channels_first"` (channel before row and column). '
-          'Received arg: ', data_format)
-    self.data_format = data_format
-    if data_format == 'channels_first':
-      self.channel_axis = 1
-      self.row_axis = 2
-      self.col_axis = 3
-    if data_format == 'channels_last':
-      self.channel_axis = 3
-      self.row_axis = 1
-      self.col_axis = 2
-    if validation_split and not 0 < validation_split < 1:
-      raise ValueError('`validation_split` must be strictly between 0 and 1. '
-                       'Received arg: ', validation_split)
-    self.validation_split = validation_split
-
-    self.mean = None
-    self.std = None
-    self.principal_components = None
-
-    if np.isscalar(zoom_range):
-      self.zoom_range = [1 - zoom_range, 1 + zoom_range]
-    elif len(zoom_range) == 2:
-      self.zoom_range = [zoom_range[0], zoom_range[1]]
-    else:
-      raise ValueError('`zoom_range` should be a float or '
-                       'a tuple or list of two floats. '
-                       'Received arg: ', zoom_range)
-    if zca_whitening:
-      if not featurewise_center:
-        self.featurewise_center = True
-        logging.warning('This ImageDataGenerator specifies '
-                        '`zca_whitening`, which overrides '
-                        'setting of `featurewise_center`.')
-      if featurewise_std_normalization:
-        self.featurewise_std_normalization = False
-        logging.warning('This ImageDataGenerator specifies '
-                        '`zca_whitening` '
-                        'which overrides setting of'
-                        '`featurewise_std_normalization`.')
-    if featurewise_std_normalization:
-      if not featurewise_center:
-        self.featurewise_center = True
-        logging.warning('This ImageDataGenerator specifies '
-                        '`featurewise_std_normalization`, '
-                        'which overrides setting of '
-                        '`featurewise_center`.')
-    if samplewise_std_normalization:
-      if not samplewise_center:
-        self.samplewise_center = True
-        logging.warning('This ImageDataGenerator specifies '
-                        '`samplewise_std_normalization`, '
-                        'which overrides setting of '
-                        '`samplewise_center`.')
-
-  def flow(self,
-           x,
-           y=None,
-           batch_size=32,
-           shuffle=True,
-           seed=None,
-           save_to_dir=None,
-           save_prefix='',
-           save_format='png',
-           subset=None):
-    """Generates batches of augmented/normalized data with given numpy arrays.
-
-    Arguments:
-        x: data. Should have rank 4.
-            In case of grayscale data, the channels axis should have value 1
-            and in case of RGB data, it should have value 3.
-        y: labels.
-        batch_size: int (default: 32).
-        shuffle: boolean (default: True).
-        seed: int (default: None).
-        save_to_dir: None or str (default: None).
-            This allows you to optionally specify a directory
-            to which to save the augmented pictures being generated
-            (useful for visualizing what you are doing).
-        save_prefix: str (default: `''`). Prefix to use for filenames of
-            saved pictures (only relevant if `save_to_dir` is set).
-        save_format: one of "png", "jpeg". Default: "png".
-            (only relevant if `save_to_dir` is set)
-        subset: Subset of data (`"training"` or `"validation"`) if
-            `validation_split` is set in `ImageDataGenerator`.
-
-    Returns:
-        An Iterator yielding tuples of `(x, y)` where `x` is a numpy array of
-          image data and `y` is a numpy array of corresponding labels.
-    """
-    return NumpyArrayIterator(
-        x,
-        y,
-        self,
-        batch_size=batch_size,
-        shuffle=shuffle,
-        seed=seed,
-        data_format=self.data_format,
-        save_to_dir=save_to_dir,
-        save_prefix=save_prefix,
-        save_format=save_format,
-        subset=subset)
-
-  def flow_from_directory(self,
-                          directory,
-                          target_size=(256, 256),
-                          color_mode='rgb',
-                          classes=None,
-                          class_mode='categorical',
-                          batch_size=32,
-                          shuffle=True,
-                          seed=None,
-                          save_to_dir=None,
-                          save_prefix='',
-                          save_format='png',
-                          follow_links=False,
-                          subset=None,
-                          interpolation='nearest'):
-    """Generates batches of augmented/normalized data given directory path.
-
-    Arguments:
-        directory: path to the target directory. It should contain one
-            subdirectory per class. Any PNG, JPG, BMP, PPM or TIF images
-            inside each of the subdirectories directory tree will be included
-            in the generator. See [this script]
-            (https://gist.github.com/fchollet/0830affa1f7f19fd47b06d4cf89ed44d)
-            for more details.
-        target_size: tuple of integers `(height, width)`, default: `(256,
-            256)`. The dimensions to which all images found will be resized.
-        color_mode: one of "grayscale", "rbg". Default: "rgb". Whether the
-            images will be converted to have 1 or 3 color channels.
-        classes: optional list of class subdirectories (e.g. `['dogs',
-            'cats']`). Default: None. If not provided, the list of classes
-            will be automatically inferred from the subdirectory
-            names/structure under `directory`, where each subdirectory will be
-            treated as a different class (and the order of the classes, which
-            will map to the label indices, will be alphanumeric). The
-            dictionary containing the mapping from class names to class
-            indices can be obtained via the attribute `class_indices`.
-        class_mode: one of "categorical", "binary", "sparse", "input" or
-            None. Default: "categorical". Determines the type of label arrays
-            that are returned: "categorical" will be 2D one-hot encoded
-            labels, "binary" will be 1D binary labels, "sparse" will be 1D
-            integer labels, "input" will be images identical to input images
-            (mainly used to work with autoencoders). If None, no labels are
-            returned (the generator will only yield batches of image data,
-            which is useful to use `model.predict_generator()`,
-            `model.evaluate_generator()`, etc.). Please note that in case of
-            class_mode None, the data still needs to reside in a subdirectory
-            of `directory` for it to work correctly.
-        batch_size: size of the batches of data (default: 32).
-        shuffle: whether to shuffle the data (default: True)
-        seed: optional random seed for shuffling and transformations.
-        save_to_dir: None or str (default: None). This allows you to
-            optionally specify a directory to which to save the augmented
-            pictures being generated (useful for visualizing what you are doing)
-        save_prefix: str. Prefix to use for filenames of saved pictures
-            (only relevant if `save_to_dir` is set).
-        save_format: one of "png", "jpeg" (only relevant if `save_to_dir` is
-            set). Default: "png".
-        follow_links: whether to follow symlinks inside class subdirectories
-            (default: False).
-        subset: Subset of data (`"training"` or `"validation"`) if
-          ` validation_split` is set in `ImageDataGenerator`.
-        interpolation: Interpolation method used to resample the image if
-            the target size is different from that of the loaded image.
-            Supported methods are `"nearest"`, `"bilinear"`, and `"bicubic"`.
-            If PIL version 1.1.3 or newer is installed, `"lanczos"` is also
-            supported. If PIL version 3.4.0 or newer is installed, `"box"` and
-            `"hamming"` are also supported. By default, `"nearest"` is used.
-
-    Returns:
-        A DirectoryIterator yielding tuples of `(x, y)` where `x` is a
-        numpy array containing a batch of images with shape
-        `(batch_size, *target_size, channels)` and `y` is a numpy
-        array of corresponding labels.
-    """
-    return DirectoryIterator(
-        directory,
-        self,
-        target_size=target_size,
-        color_mode=color_mode,
-        classes=classes,
-        class_mode=class_mode,
-        data_format=self.data_format,
-        batch_size=batch_size,
-        shuffle=shuffle,
-        seed=seed,
-        save_to_dir=save_to_dir,
-        save_prefix=save_prefix,
-        save_format=save_format,
-        follow_links=follow_links,
-        subset=subset,
-        interpolation=interpolation)
-
-  def standardize(self, x):
-    """Apply the normalization configuration to a batch of inputs.
-
-    Arguments:
-        x: batch of inputs to be normalized.
-
-    Returns:
-        The inputs, normalized.
-    """
-    if self.preprocessing_function:
-      x = self.preprocessing_function(x)
-    if self.rescale:
-      x *= self.rescale
-    if self.samplewise_center:
-      x -= np.mean(x, keepdims=True)
-    if self.samplewise_std_normalization:
-      x /= (np.std(x, keepdims=True) + K.epsilon())
-
-    if self.featurewise_center:
-      if self.mean is not None:
-        x -= self.mean
-      else:
-        logging.warning('This ImageDataGenerator specifies '
-                        '`featurewise_center`, but it hasn\'t '
-                        'been fit on any training data. Fit it '
-                        'first by calling `.fit(numpy_data)`.')
-    if self.featurewise_std_normalization:
-      if self.std is not None:
-        x /= (self.std + K.epsilon())
-      else:
-        logging.warning('This ImageDataGenerator specifies '
-                        '`featurewise_std_normalization`, but it hasn\'t '
-                        'been fit on any training data. Fit it '
-                        'first by calling `.fit(numpy_data)`.')
-    if self.zca_whitening:
-      if self.principal_components is not None:
-        flatx = np.reshape(x, (-1, np.prod(x.shape[-3:])))
-        whitex = np.dot(flatx, self.principal_components)
-        x = np.reshape(whitex, x.shape)
-      else:
-        logging.warning('This ImageDataGenerator specifies '
-                        '`zca_whitening`, but it hasn\'t '
-                        'been fit on any training data. Fit it '
-                        'first by calling `.fit(numpy_data)`.')
-    return x
-
-  def random_transform(self, x, seed=None):
-    """Randomly augment a single image tensor.
-
-    Arguments:
-        x: 3D tensor, single image.
-        seed: random seed.
-
-    Returns:
-        A randomly transformed version of the input (same shape).
-
-    Raises:
-        ImportError: if Scipy is not available.
-    """
-    if ndi is None:
-      raise ImportError('Scipy is required for image transformations.')
-    # x is a single image, so it doesn't have image number at index 0
-    img_row_axis = self.row_axis - 1
-    img_col_axis = self.col_axis - 1
-    img_channel_axis = self.channel_axis - 1
-
-    if seed is not None:
-      np.random.seed(seed)
-
-    # use composition of homographies
-    # to generate final transform that needs to be applied
-    if self.rotation_range:
-      theta = np.deg2rad(
-          np.random.uniform(-self.rotation_range, self.rotation_range))
-    else:
-      theta = 0
-
-    if self.height_shift_range:
-      try:  # 1-D array-like or int
-        tx = np.random.choice(self.height_shift_range)
-        tx *= np.random.choice([-1, 1])
-      except ValueError:  # floating point
-        tx = np.random.uniform(-self.height_shift_range,
-                               self.height_shift_range)
-      if np.max(self.height_shift_range) < 1:
-        tx *= x.shape[img_row_axis]
-    else:
-      tx = 0
-
-    if self.width_shift_range:
-      try:  # 1-D array-like or int
-        ty = np.random.choice(self.width_shift_range)
-        ty *= np.random.choice([-1, 1])
-      except ValueError:  # floating point
-        ty = np.random.uniform(-self.width_shift_range, self.width_shift_range)
-      if np.max(self.width_shift_range) < 1:
-        ty *= x.shape[img_col_axis]
-    else:
-      ty = 0
-
-    if self.shear_range:
-      shear = np.deg2rad(np.random.uniform(-self.shear_range, self.shear_range))
-    else:
-      shear = 0
-
-    if self.zoom_range[0] == 1 and self.zoom_range[1] == 1:
-      zx, zy = 1, 1
-    else:
-      zx, zy = np.random.uniform(self.zoom_range[0], self.zoom_range[1], 2)
-
-    transform_matrix = None
-    if theta != 0:
-      rotation_matrix = np.array([[np.cos(theta), -np.sin(theta), 0],
-                                  [np.sin(theta),
-                                   np.cos(theta), 0], [0, 0, 1]])
-      transform_matrix = rotation_matrix
-
-    if tx != 0 or ty != 0:
-      shift_matrix = np.array([[1, 0, tx], [0, 1, ty], [0, 0, 1]])
-      transform_matrix = shift_matrix if transform_matrix is None else np.dot(
-          transform_matrix, shift_matrix)
-
-    if shear != 0:
-      shear_matrix = np.array([[1, -np.sin(shear), 0], [0, np.cos(shear), 0],
-                               [0, 0, 1]])
-      transform_matrix = shear_matrix if transform_matrix is None else np.dot(
-          transform_matrix, shear_matrix)
-
-    if zx != 1 or zy != 1:
-      zoom_matrix = np.array([[zx, 0, 0], [0, zy, 0], [0, 0, 1]])
-      transform_matrix = zoom_matrix if transform_matrix is None else np.dot(
-          transform_matrix, zoom_matrix)
-
-    if transform_matrix is not None:
-      h, w = x.shape[img_row_axis], x.shape[img_col_axis]
-      transform_matrix = transform_matrix_offset_center(transform_matrix, h, w)
-      x = apply_transform(
-          x,
-          transform_matrix,
-          img_channel_axis,
-          fill_mode=self.fill_mode,
-          cval=self.cval)
-
-    if self.channel_shift_range != 0:
-      x = random_channel_shift(x, self.channel_shift_range, img_channel_axis)
-    if self.horizontal_flip:
-      if np.random.random() < 0.5:
-        x = flip_axis(x, img_col_axis)
-
-    if self.vertical_flip:
-      if np.random.random() < 0.5:
-        x = flip_axis(x, img_row_axis)
-
-    if self.brightness_range is not None:
-      x = random_brightness(x, self.brightness_range)
-
-    return x
-
-  def fit(self, x, augment=False, rounds=1, seed=None):
-    """Computes the internal data statistics based on an array of sample data.
-
-    These are statistics related to the data-dependent transformations.
-    Only required if featurewise_center or featurewise_std_normalization or
-    zca_whitening.
-
-    Arguments:
-        x: sample data. Should have rank 4.
-            In case of grayscale data, the channels axis should have value 1
-            and in case of RGB data, it should have value 3.
-        augment: Boolean (default: False). Whether to fit on randomly
-            augmented samples.
-        rounds: int (default: 1). If augment, how many augmentation passes
-            over the data to use.
-        seed: int (default: None). Random seed.
-
-    Raises:
-        ValueError: If input rank is not 4.
-        ImportError: If scipy is not imported.
-    """
-    x = np.asarray(x, dtype=K.floatx())
-    if x.ndim != 4:
-      raise ValueError('Input to `.fit()` should have rank 4. '
-                       'Got array with shape: ' + str(x.shape))
-    if x.shape[self.channel_axis] not in {1, 3, 4}:
-      logging.warning(
-          'Expected input to be images (as Numpy array) '
-          'following the data format convention "' + self.data_format + '" '
-          '(channels on axis ' + str(self.channel_axis) + '), i.e. expected '
-          'either 1, 3 or 4 channels on axis ' + str(self.channel_axis) + '. '
-          'However, it was passed an array with shape ' + str(x.shape) + ' (' +
-          str(x.shape[self.channel_axis]) + ' channels).')
-
-    if seed is not None:
-      np.random.seed(seed)
-
-    x = np.copy(x)
-    if augment:
-      ax = np.zeros(
-          tuple([rounds * x.shape[0]] + list(x.shape)[1:]), dtype=K.floatx())
-      for r in range(rounds):
-        for i in range(x.shape[0]):
-          ax[i + r * x.shape[0]] = self.random_transform(x[i])
-      x = ax
-
-    if self.featurewise_center:
-      self.mean = np.mean(x, axis=(0, self.row_axis, self.col_axis))
-      broadcast_shape = [1, 1, 1]
-      broadcast_shape[self.channel_axis - 1] = x.shape[self.channel_axis]
-      self.mean = np.reshape(self.mean, broadcast_shape)
-      x -= self.mean
-
-    if self.featurewise_std_normalization:
-      self.std = np.std(x, axis=(0, self.row_axis, self.col_axis))
-      broadcast_shape = [1, 1, 1]
-      broadcast_shape[self.channel_axis - 1] = x.shape[self.channel_axis]
-      self.std = np.reshape(self.std, broadcast_shape)
-      x /= (self.std + K.epsilon())
-
-    if self.zca_whitening:
-      if linalg is None:
-        raise ImportError('Scipy is required for zca_whitening.')
-
-      flat_x = np.reshape(x, (x.shape[0], x.shape[1] * x.shape[2] * x.shape[3]))
-      sigma = np.dot(flat_x.T, flat_x) / flat_x.shape[0]
-      u, s, _ = linalg.svd(sigma)
-      s_inv = 1. / np.sqrt(s[np.newaxis] + self.zca_epsilon)
-      self.principal_components = (u * s_inv).dot(u.T)
-
-
-@tf_export('keras.preprocessing.image.Iterator')
-class Iterator(Sequence):
-  """Base class for image data iterators.
-
-  Every `Iterator` must implement the `_get_batches_of_transformed_samples`
-  method.
-
-  Arguments:
-      n: Integer, total number of samples in the dataset to loop over.
-      batch_size: Integer, size of a batch.
-      shuffle: Boolean, whether to shuffle the data between epochs.
-      seed: Random seeding for data shuffling.
-  """
-
-  def __init__(self, n, batch_size, shuffle, seed):
-    self.n = n
-    self.batch_size = batch_size
-    self.seed = seed
-    self.shuffle = shuffle
-    self.batch_index = 0
-    self.total_batches_seen = 0
-    self.lock = threading.Lock()
-    self.index_array = None
-    self.index_generator = self._flow_index()
-
-  def _set_index_array(self):
-    self.index_array = np.arange(self.n)
-    if self.shuffle:
-      self.index_array = np.random.permutation(self.n)
-
-  def __getitem__(self, idx):
-    if idx >= len(self):
-      raise ValueError('Asked to retrieve element {idx}, '
-                       'but the Sequence '
-                       'has length {length}'.format(idx=idx, length=len(self)))
-    if self.seed is not None:
-      np.random.seed(self.seed + self.total_batches_seen)
-    self.total_batches_seen += 1
-    if self.index_array is None:
-      self._set_index_array()
-    index_array = self.index_array[self.batch_size * idx:self.batch_size * (
-        idx + 1)]
-    return self._get_batches_of_transformed_samples(index_array)
-
-  def __len__(self):
-    return (self.n + self.batch_size - 1) // self.batch_size  # round up
-
-  def on_epoch_end(self):
-    self._set_index_array()
-
-  def reset(self):
-    self.batch_index = 0
-
-  def _flow_index(self):
-    # Ensure self.batch_index is 0.
-    self.reset()
-    while 1:
-      if self.seed is not None:
-        np.random.seed(self.seed + self.total_batches_seen)
-      if self.batch_index == 0:
-        self._set_index_array()
-
-      current_index = (self.batch_index * self.batch_size) % self.n
-      if self.n > current_index + self.batch_size:
-        self.batch_index += 1
-      else:
-        self.batch_index = 0
-      self.total_batches_seen += 1
-      yield self.index_array[current_index:current_index + self.batch_size]
-
-  def __iter__(self):  # pylint: disable=non-iterator-returned
-    # Needed if we want to do something like:
-    # for x, y in data_gen.flow(...):
-    return self
-
-  def __next__(self, *args, **kwargs):
-    return self.next(*args, **kwargs)
-
-  def _get_batches_of_transformed_samples(self, index_array):
-    """Gets a batch of transformed samples.
-
-    Arguments:
-        index_array: array of sample indices to include in batch.
-
-    Returns:
-        A batch of transformed samples.
-    """
-    raise NotImplementedError
-
-
-@tf_export('keras.preprocessing.image.NumpyArrayIterator')
-class NumpyArrayIterator(Iterator):
-  """Iterator yielding data from a Numpy array.
-
-  Arguments:
-      x: Numpy array of input data.
-      y: Numpy array of targets data.
-      image_data_generator: Instance of `ImageDataGenerator`
-          to use for random transformations and normalization.
-      batch_size: Integer, size of a batch.
-      shuffle: Boolean, whether to shuffle the data between epochs.
-      seed: Random seed for data shuffling.
-      data_format: String, one of `channels_first`, `channels_last`.
-      save_to_dir: Optional directory where to save the pictures
-          being yielded, in a viewable format. This is useful
-          for visualizing the random transformations being
-          applied, for debugging purposes.
-      save_prefix: String prefix to use for saving sample
-          images (if `save_to_dir` is set).
-      save_format: Format to use for saving sample images
-          (if `save_to_dir` is set).
-      subset: Subset of data (`"training"` or `"validation"`) if
-          validation_split is set in ImageDataGenerator.
-  """
-
-  def __init__(self,
-               x,
-               y,
-               image_data_generator,
-               batch_size=32,
-               shuffle=False,
-               seed=None,
-               data_format=None,
-               save_to_dir=None,
-               save_prefix='',
-               save_format='png',
-               subset=None):
-    if y is not None and len(x) != len(y):
-      raise ValueError('`x` (images tensor) and `y` (labels) '
-                       'should have the same length. '
-                       'Found: x.shape = %s, y.shape = %s' %
-                       (np.asarray(x).shape, np.asarray(y).shape))
-    if subset is not None:
-      if subset not in {'training', 'validation'}:
-        raise ValueError('Invalid subset name:', subset,
-                         '; expected "training" or "validation".')
-      split_idx = int(len(x) * image_data_generator.validation_split)
-      if subset == 'validation':
-        x = x[:split_idx]
-        if y is not None:
-          y = y[:split_idx]
-      else:
-        x = x[split_idx:]
-        if y is not None:
-          y = y[split_idx:]
-    if data_format is None:
-      data_format = K.image_data_format()
-    self.x = np.asarray(x, dtype=K.floatx())
-    if self.x.ndim != 4:
-      raise ValueError('Input data in `NumpyArrayIterator` '
-                       'should have rank 4. You passed an array '
-                       'with shape', self.x.shape)
-    channels_axis = 3 if data_format == 'channels_last' else 1
-    if self.x.shape[channels_axis] not in {1, 3, 4}:
-      logging.warning(
-          'NumpyArrayIterator is set to use the '
-          'data format convention "' + data_format + '" '
-          '(channels on axis ' + str(channels_axis) + '), i.e. expected '
-          'either 1, 3 or 4 channels on axis ' + str(channels_axis) + '. '
-          'However, it was passed an array with shape ' + str(self.x.shape) +
-          ' (' + str(self.x.shape[channels_axis]) + ' channels).')
-    if y is not None:
-      self.y = np.asarray(y)
-    else:
-      self.y = None
-    self.image_data_generator = image_data_generator
-    self.data_format = data_format
-    self.save_to_dir = save_to_dir
-    self.save_prefix = save_prefix
-    self.save_format = save_format
-    super(NumpyArrayIterator, self).__init__(x.shape[0], batch_size, shuffle,
-                                             seed)
-
-  def _get_batches_of_transformed_samples(self, index_array):
-    batch_x = np.zeros(
-        tuple([len(index_array)] + list(self.x.shape)[1:]), dtype=K.floatx())
-    for i, j in enumerate(index_array):
-      x = self.x[j]
-      x = self.image_data_generator.random_transform(x.astype(K.floatx()))
-      x = self.image_data_generator.standardize(x)
-      batch_x[i] = x
-    if self.save_to_dir:
-      for i, j in enumerate(index_array):
-        img = array_to_img(batch_x[i], self.data_format, scale=True)
-        fname = '{prefix}_{index}_{hash}.{format}'.format(
-            prefix=self.save_prefix,
-            index=j,
-            hash=np.random.randint(1e4),
-            format=self.save_format)
-        img.save(os.path.join(self.save_to_dir, fname))
-    if self.y is None:
-      return batch_x
-    batch_y = self.y[index_array]
-    return batch_x, batch_y
-
-  def next(self):
-    """For python 2.x.
-
-    Returns:
-        The next batch.
-    """
-    # Keeps under lock only the mechanism which advances
-    # the indexing of each batch.
-    with self.lock:
-      index_array = next(self.index_generator)
-    # The transformation of images is not under thread lock
-    # so it can be done in parallel
-    return self._get_batches_of_transformed_samples(index_array)
-
-
-def _iter_valid_files(directory, white_list_formats, follow_links):
-  """Count files with extension in `white_list_formats` contained in directory.
-
-  Arguments:
-      directory: absolute path to the directory
-          containing files to be counted
-      white_list_formats: set of strings containing allowed extensions for
-          the files to be counted.
-      follow_links: boolean.
-
-  Yields:
-      tuple of (root, filename) with extension in `white_list_formats`.
-  """
-
-  def _recursive_list(subpath):
-    return sorted(
-        os.walk(subpath, followlinks=follow_links), key=lambda x: x[0])
-
-  for root, _, files in _recursive_list(directory):
-    for fname in sorted(files):
-      for extension in white_list_formats:
-        if fname.lower().endswith('.tiff'):
-          logging.warning(
-              'Using \'.tiff\' files with multiple bands will cause '
-              'distortion. Please verify your output.')
-        if fname.lower().endswith('.' + extension):
-          yield root, fname
-
-
-def _count_valid_files_in_directory(directory, white_list_formats, split,
-                                    follow_links):
-  """Count files with extension in `white_list_formats` contained in directory.
-
-  Arguments:
-      directory: absolute path to the directory
-          containing files to be counted
-      white_list_formats: set of strings containing allowed extensions for
-          the files to be counted.
-      split: tuple of floats (e.g. `(0.2, 0.6)`) to only take into
-          account a certain fraction of files in each directory.
-          E.g.: `segment=(0.6, 1.0)` would only account for last 40 percent
-          of images in each directory.
-      follow_links: boolean.
-
-  Returns:
-      the count of files with extension in `white_list_formats` contained in
-      the directory.
-  """
-  num_files = len(
-      list(_iter_valid_files(directory, white_list_formats, follow_links)))
-  if split:
-    start, stop = int(split[0] * num_files), int(split[1] * num_files)
-  else:
-    start, stop = 0, num_files
-  return stop - start
-
-
-def _list_valid_filenames_in_directory(directory, white_list_formats, split,
-                                       class_indices, follow_links):
-  """List paths of files in `subdir` with extensions in `white_list_formats`.
-
-  Arguments:
-      directory: absolute path to a directory containing the files to list.
-          The directory name is used as class label and must be a key of
-            `class_indices`.
-      white_list_formats: set of strings containing allowed extensions for
-          the files to be counted.
-      split: tuple of floats (e.g. `(0.2, 0.6)`) to only take into
-          account a certain fraction of files in each directory.
-          E.g.: `segment=(0.6, 1.0)` would only account for last 40 percent
-          of images in each directory.
-      class_indices: dictionary mapping a class name to its index.
-      follow_links: boolean.
-
-  Returns:
-      classes: a list of class indices
-      filenames: the path of valid files in `directory`, relative from
-          `directory`'s parent (e.g., if `directory` is "dataset/class1",
-          the filenames will be ["class1/file1.jpg", "class1/file2.jpg", ...]).
-  """
-  dirname = os.path.basename(directory)
-  if split:
-    num_files = len(
-        list(_iter_valid_files(directory, white_list_formats, follow_links)))
-    start, stop = int(split[0] * num_files), int(split[1] * num_files)
-    valid_files = list(
-        _iter_valid_files(directory, white_list_formats,
-                          follow_links))[start:stop]
-  else:
-    valid_files = _iter_valid_files(directory, white_list_formats, follow_links)
-
-  classes = []
-  filenames = []
-  for root, fname in valid_files:
-    classes.append(class_indices[dirname])
-    absolute_path = os.path.join(root, fname)
-    relative_path = os.path.join(dirname,
-                                 os.path.relpath(absolute_path, directory))
-    filenames.append(relative_path)
-
-  return classes, filenames
-
-
-@tf_export('keras.preprocessing.image.DirectoryIterator')
-class DirectoryIterator(Iterator):
-  """Iterator capable of reading images from a directory on disk.
-
-  Arguments:
-      directory: Path to the directory to read images from.
-          Each subdirectory in this directory will be
-          considered to contain images from one class,
-          or alternatively you could specify class subdirectories
-          via the `classes` argument.
-      image_data_generator: Instance of `ImageDataGenerator`
-          to use for random transformations and normalization.
-      target_size: tuple of integers, dimensions to resize input images to.
-      color_mode: One of `"rgb"`, `"grayscale"`. Color mode to read images.
-      classes: Optional list of strings, names of subdirectories
-          containing images from each class (e.g. `["dogs", "cats"]`).
-          It will be computed automatically if not set.
-      class_mode: Mode for yielding the targets:
-          `"binary"`: binary targets (if there are only two classes),
-          `"categorical"`: categorical targets,
-          `"sparse"`: integer targets,
-          `"input"`: targets are images identical to input images (mainly
-              used to work with autoencoders),
-          `None`: no targets get yielded (only input images are yielded).
-      batch_size: Integer, size of a batch.
-      shuffle: Boolean, whether to shuffle the data between epochs.
-      seed: Random seed for data shuffling.
-      data_format: String, one of `channels_first`, `channels_last`.
-      save_to_dir: Optional directory where to save the pictures
-          being yielded, in a viewable format. This is useful
-          for visualizing the random transformations being
-          applied, for debugging purposes.
-      save_prefix: String prefix to use for saving sample
-          images (if `save_to_dir` is set).
-      save_format: Format to use for saving sample images
-          (if `save_to_dir` is set).
-      subset: Subset of data (`"training"` or `"validation"`) if
-          validation_split is set in ImageDataGenerator.
-      interpolation: Interpolation method used to resample the image if the
-          target size is different from that of the loaded image.
-          Supported methods are "nearest", "bilinear", and "bicubic".
-          If PIL version 1.1.3 or newer is installed, "lanczos" is also
-          supported. If PIL version 3.4.0 or newer is installed, "box" and
-          "hamming" are also supported. By default, "nearest" is used.
-  """
-
-  def __init__(self,
-               directory,
-               image_data_generator,
-               target_size=(256, 256),
-               color_mode='rgb',
-               classes=None,
-               class_mode='categorical',
-               batch_size=32,
-               shuffle=True,
-               seed=None,
-               data_format=None,
-               save_to_dir=None,
-               save_prefix='',
-               save_format='png',
-               follow_links=False,
-               subset=None,
-               interpolation='nearest'):
-    if data_format is None:
-      data_format = K.image_data_format()
-    self.directory = directory
-    self.image_data_generator = image_data_generator
-    self.target_size = tuple(target_size)
-    if color_mode not in {'rgb', 'grayscale'}:
-      raise ValueError('Invalid color mode:', color_mode,
-                       '; expected "rgb" or "grayscale".')
-    self.color_mode = color_mode
-    self.data_format = data_format
-    if self.color_mode == 'rgb':
-      if self.data_format == 'channels_last':
-        self.image_shape = self.target_size + (3,)
-      else:
-        self.image_shape = (3,) + self.target_size
-    else:
-      if self.data_format == 'channels_last':
-        self.image_shape = self.target_size + (1,)
-      else:
-        self.image_shape = (1,) + self.target_size
-    self.classes = classes
-    if class_mode not in {'categorical', 'binary', 'sparse', 'input', None}:
-      raise ValueError('Invalid class_mode:', class_mode,
-                       '; expected one of "categorical", '
-                       '"binary", "sparse", "input"'
-                       ' or None.')
-    self.class_mode = class_mode
-    self.save_to_dir = save_to_dir
-    self.save_prefix = save_prefix
-    self.save_format = save_format
-    self.interpolation = interpolation
-
-    if subset is not None:
-      validation_split = self.image_data_generator.validation_split
-      if subset == 'validation':
-        split = (0, validation_split)
-      elif subset == 'training':
-        split = (validation_split, 1)
-      else:
-        raise ValueError('Invalid subset name: ', subset,
-                         '; expected "training" or "validation"')
-    else:
-      split = None
-    self.subset = subset
-
-    white_list_formats = {'png', 'jpg', 'jpeg', 'bmp', 'ppm', 'tif', 'tiff'}
-
-    # first, count the number of samples and classes
-    self.samples = 0
-
-    if not classes:
-      classes = []
-      for subdir in sorted(os.listdir(directory)):
-        if os.path.isdir(os.path.join(directory, subdir)):
-          classes.append(subdir)
-    self.num_classes = len(classes)
-    self.class_indices = dict(zip(classes, range(len(classes))))
-
-    pool = multiprocessing.pool.ThreadPool()
-    function_partial = partial(
-        _count_valid_files_in_directory,
-        white_list_formats=white_list_formats,
-        follow_links=follow_links,
-        split=split)
-    self.samples = sum(
-        pool.map(function_partial,
-                 (os.path.join(directory, subdir) for subdir in classes)))
-
-    print('Found %d images belonging to %d classes.' % (self.samples,
-                                                        self.num_classes))
-
-    # second, build an index of the images in the different class subfolders
-    results = []
-
-    self.filenames = []
-    self.classes = np.zeros((self.samples,), dtype='int32')
-    i = 0
-    for dirpath in (os.path.join(directory, subdir) for subdir in classes):
-      results.append(
-          pool.apply_async(_list_valid_filenames_in_directory,
-                           (dirpath, white_list_formats, split,
-                            self.class_indices, follow_links)))
-    for res in results:
-      classes, filenames = res.get()
-      self.classes[i:i + len(classes)] = classes
-      self.filenames += filenames
-      i += len(classes)
-
-    pool.close()
-    pool.join()
-    super(DirectoryIterator, self).__init__(self.samples, batch_size, shuffle,
-                                            seed)
-
-  def _get_batches_of_transformed_samples(self, index_array):
-    batch_x = np.zeros((len(index_array),) + self.image_shape, dtype=K.floatx())
-    grayscale = self.color_mode == 'grayscale'
-    # build batch of image data
-    for i, j in enumerate(index_array):
-      fname = self.filenames[j]
-      img = load_img(
-          os.path.join(self.directory, fname),
-          grayscale=grayscale,
-          target_size=self.target_size,
-          interpolation=self.interpolation)
-      x = img_to_array(img, data_format=self.data_format)
-      x = self.image_data_generator.random_transform(x)
-      x = self.image_data_generator.standardize(x)
-      batch_x[i] = x
-    # optionally save augmented images to disk for debugging purposes
-    if self.save_to_dir:
-      for i, j in enumerate(index_array):
-        img = array_to_img(batch_x[i], self.data_format, scale=True)
-        fname = '{prefix}_{index}_{hash}.{format}'.format(
-            prefix=self.save_prefix,
-            index=j,
-            hash=np.random.randint(1e7),
-            format=self.save_format)
-        img.save(os.path.join(self.save_to_dir, fname))
-    # build batch of labels
-    if self.class_mode == 'input':
-      batch_y = batch_x.copy()
-    elif self.class_mode == 'sparse':
-      batch_y = self.classes[index_array]
-    elif self.class_mode == 'binary':
-      batch_y = self.classes[index_array].astype(K.floatx())
-    elif self.class_mode == 'categorical':
-      batch_y = np.zeros((len(batch_x), self.num_classes), dtype=K.floatx())
-      for i, label in enumerate(self.classes[index_array]):
-        batch_y[i, label] = 1.
-    else:
-      return batch_x
-    return batch_x, batch_y
-
-  def next(self):
-    """For python 2.x.
-
-    Returns:
-        The next batch.
-    """
-    with self.lock:
-      index_array = next(self.index_generator)
-    # The transformation of images is not under thread lock
-    # so it can be done in parallel
-    return self._get_batches_of_transformed_samples(index_array)
+tf_export('keras.preprocessing.image.random_rotation')(random_rotation)
+tf_export('keras.preprocessing.image.random_shift')(random_shift)
+tf_export('keras.preprocessing.image.random_shear')(random_shear)
+tf_export('keras.preprocessing.image.random_zoom')(random_zoom)
+tf_export('keras.preprocessing.image.apply_channel_shift')(apply_channel_shift)
+tf_export(
+    'keras.preprocessing.image.random_channel_shift')(random_channel_shift)
+tf_export(
+    'keras.preprocessing.image.apply_brightness_shift')(apply_brightness_shift)
+tf_export('keras.preprocessing.image.random_brightness')(random_brightness)
+tf_export(
+    'keras.preprocessing.image.apply_affine_transform')(apply_affine_transform)
+tf_export('keras.preprocessing.image.array_to_img')(array_to_img)
+tf_export('keras.preprocessing.image.img_to_array')(img_to_array)
+tf_export('keras.preprocessing.image.save_img')(save_img)
+tf_export('keras.preprocessing.image.load_img')(load_img)
+tf_export('keras.preprocessing.image.ImageDataGenerator')(ImageDataGenerator)
+tf_export('keras.preprocessing.image.Iterator')(Iterator)
+tf_export('keras.preprocessing.image.NumpyArrayIterator')(NumpyArrayIterator)
+tf_export('keras.preprocessing.image.DirectoryIterator')(DirectoryIterator)
diff --git a/tensorflow/python/keras/preprocessing/image_test.py b/tensorflow/python/keras/preprocessing/image_test.py
index 275808a..362cbc1 100644
--- a/tensorflow/python/keras/preprocessing/image_test.py
+++ b/tensorflow/python/keras/preprocessing/image_test.py
@@ -161,9 +161,6 @@
 
     generator = keras.preprocessing.image.ImageDataGenerator(
         zoom_range=(2, 2))
-    with self.assertRaises(ValueError):
-      generator = keras.preprocessing.image.ImageDataGenerator(
-          zoom_range=(2, 2, 2))
 
   def test_image_data_generator_fit(self):
     generator = keras.preprocessing.image.ImageDataGenerator(
diff --git a/tensorflow/python/keras/preprocessing/sequence.py b/tensorflow/python/keras/preprocessing/sequence.py
index e0924f8..116d310 100644
--- a/tensorflow/python/keras/preprocessing/sequence.py
+++ b/tensorflow/python/keras/preprocessing/sequence.py
@@ -14,383 +14,25 @@
 # ==============================================================================
 """Utilities for preprocessing sequence data.
 """
+# pylint: disable=invalid-name
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import random
+from keras_preprocessing import sequence
 
-import numpy as np
-from six.moves import range  # pylint: disable=redefined-builtin
-
-from tensorflow.python.keras.utils.data_utils import Sequence
 from tensorflow.python.util.tf_export import tf_export
 
+pad_sequences = sequence.pad_sequences
+make_sampling_table = sequence.make_sampling_table
+skipgrams = sequence.skipgrams
+# TODO(fchollet): consider making `_remove_long_seq` public.
+_remove_long_seq = sequence._remove_long_seq  # pylint: disable=protected-access
+TimeseriesGenerator = sequence.TimeseriesGenerator
 
-@tf_export('keras.preprocessing.sequence.pad_sequences')
-def pad_sequences(sequences,
-                  maxlen=None,
-                  dtype='int32',
-                  padding='pre',
-                  truncating='pre',
-                  value=0.):
-  """Pads sequences to the same length.
-
-  This function transforms a list of
-  `num_samples` sequences (lists of integers)
-  into a 2D Numpy array of shape `(num_samples, num_timesteps)`.
-  `num_timesteps` is either the `maxlen` argument if provided,
-  or the length of the longest sequence otherwise.
-
-  Sequences that are shorter than `num_timesteps`
-  are padded with `value` at the end.
-
-  Sequences longer than `num_timesteps` are truncated
-  so that they fit the desired length.
-  The position where padding or truncation happens is determined by
-  the arguments `padding` and `truncating`, respectively.
-
-  Pre-padding is the default.
-
-  Arguments:
-      sequences: List of lists, where each element is a sequence.
-      maxlen: Int, maximum length of all sequences.
-      dtype: Type of the output sequences.
-      padding: String, 'pre' or 'post':
-          pad either before or after each sequence.
-      truncating: String, 'pre' or 'post':
-          remove values from sequences larger than
-          `maxlen`, either at the beginning or at the end of the sequences.
-      value: Float, padding value.
-
-  Returns:
-      x: Numpy array with shape `(len(sequences), maxlen)`
-
-  Raises:
-      ValueError: In case of invalid values for `truncating` or `padding`,
-          or in case of invalid shape for a `sequences` entry.
-  """
-  if not hasattr(sequences, '__len__'):
-    raise ValueError('`sequences` must be iterable.')
-  lengths = []
-  for x in sequences:
-    if not hasattr(x, '__len__'):
-      raise ValueError('`sequences` must be a list of iterables. '
-                       'Found non-iterable: ' + str(x))
-    lengths.append(len(x))
-
-  num_samples = len(sequences)
-  if maxlen is None:
-    maxlen = np.max(lengths)
-
-  # take the sample shape from the first non empty sequence
-  # checking for consistency in the main loop below.
-  sample_shape = tuple()
-  for s in sequences:
-    if len(s) > 0:  # pylint: disable=g-explicit-length-test
-      sample_shape = np.asarray(s).shape[1:]
-      break
-
-  x = (np.ones((num_samples, maxlen) + sample_shape) * value).astype(dtype)
-  for idx, s in enumerate(sequences):
-    if not len(s):  # pylint: disable=g-explicit-length-test
-      continue  # empty list/array was found
-    if truncating == 'pre':
-      trunc = s[-maxlen:]  # pylint: disable=invalid-unary-operand-type
-    elif truncating == 'post':
-      trunc = s[:maxlen]
-    else:
-      raise ValueError('Truncating type "%s" not understood' % truncating)
-
-    # check `trunc` has expected shape
-    trunc = np.asarray(trunc, dtype=dtype)
-    if trunc.shape[1:] != sample_shape:
-      raise ValueError('Shape of sample %s of sequence at position %s '
-                       'is different from expected shape %s' %
-                       (trunc.shape[1:], idx, sample_shape))
-
-    if padding == 'post':
-      x[idx, :len(trunc)] = trunc
-    elif padding == 'pre':
-      x[idx, -len(trunc):] = trunc
-    else:
-      raise ValueError('Padding type "%s" not understood' % padding)
-  return x
-
-
-@tf_export('keras.preprocessing.sequence.make_sampling_table')
-def make_sampling_table(size, sampling_factor=1e-5):
-  """Generates a word rank-based probabilistic sampling table.
-
-  Used for generating the `sampling_table` argument for `skipgrams`.
-  `sampling_table[i]` is the probability of sampling
-  the word i-th most common word in a dataset
-  (more common words should be sampled less frequently, for balance).
-
-  The sampling probabilities are generated according
-  to the sampling distribution used in word2vec:
-
-  `p(word) = min(1, sqrt(word_frequency / sampling_factor) / (word_frequency /
-  sampling_factor))`
-
-  We assume that the word frequencies follow Zipf's law (s=1) to derive
-  a numerical approximation of frequency(rank):
-
-  `frequency(rank) ~ 1/(rank * (log(rank) + gamma) + 1/2 - 1/(12*rank))`
-  where `gamma` is the Euler-Mascheroni constant.
-
-  Arguments:
-      size: Int, number of possible words to sample.
-      sampling_factor: The sampling factor in the word2vec formula.
-
-  Returns:
-      A 1D Numpy array of length `size` where the ith entry
-      is the probability that a word of rank i should be sampled.
-  """
-  gamma = 0.577
-  rank = np.arange(size)
-  rank[0] = 1
-  inv_fq = rank * (np.log(rank) + gamma) + 0.5 - 1. / (12. * rank)
-  f = sampling_factor * inv_fq
-
-  return np.minimum(1., f / np.sqrt(f))
-
-
-@tf_export('keras.preprocessing.sequence.skipgrams')
-def skipgrams(sequence,
-              vocabulary_size,
-              window_size=4,
-              negative_samples=1.,
-              shuffle=True,
-              categorical=False,
-              sampling_table=None,
-              seed=None):
-  """Generates skipgram word pairs.
-
-  This function transforms a sequence of word indexes (list of integers)
-  into tuples of words of the form:
-
-  - (word, word in the same window), with label 1 (positive samples).
-  - (word, random word from the vocabulary), with label 0 (negative samples).
-
-  Read more about Skipgram in this gnomic paper by Mikolov et al.:
-  [Efficient Estimation of Word Representations in
-  Vector Space](http://arxiv.org/pdf/1301.3781v3.pdf)
-
-  Arguments:
-      sequence: A word sequence (sentence), encoded as a list
-          of word indices (integers). If using a `sampling_table`,
-          word indices are expected to match the rank
-          of the words in a reference dataset (e.g. 10 would encode
-          the 10-th most frequently occurring token).
-          Note that index 0 is expected to be a non-word and will be skipped.
-      vocabulary_size: Int, maximum possible word index + 1
-      window_size: Int, size of sampling windows (technically half-window).
-          The window of a word `w_i` will be
-          `[i - window_size, i + window_size+1]`.
-      negative_samples: Float >= 0. 0 for no negative (i.e. random) samples.
-          1 for same number as positive samples.
-      shuffle: Whether to shuffle the word couples before returning them.
-      categorical: bool. if False, labels will be
-          integers (eg. `[0, 1, 1 .. ]`),
-          if `True`, labels will be categorical, e.g.
-          `[[1,0],[0,1],[0,1] .. ]`.
-      sampling_table: 1D array of size `vocabulary_size` where the entry i
-          encodes the probability to sample a word of rank i.
-      seed: Random seed.
-
-  Returns:
-      couples, labels: where `couples` are int pairs and
-          `labels` are either 0 or 1.
-
-  # Note
-      By convention, index 0 in the vocabulary is
-      a non-word and will be skipped.
-  """
-  couples = []
-  labels = []
-  for i, wi in enumerate(sequence):
-    if not wi:
-      continue
-    if sampling_table is not None:
-      if sampling_table[wi] < random.random():
-        continue
-
-    window_start = max(0, i - window_size)
-    window_end = min(len(sequence), i + window_size + 1)
-    for j in range(window_start, window_end):
-      if j != i:
-        wj = sequence[j]
-        if not wj:
-          continue
-        couples.append([wi, wj])
-        if categorical:
-          labels.append([0, 1])
-        else:
-          labels.append(1)
-
-  if negative_samples > 0:
-    num_negative_samples = int(len(labels) * negative_samples)
-    words = [c[0] for c in couples]
-    random.shuffle(words)
-
-    couples += [[words[i % len(words)],
-                 random.randint(1, vocabulary_size - 1)]
-                for i in range(num_negative_samples)]
-    if categorical:
-      labels += [[1, 0]] * num_negative_samples
-    else:
-      labels += [0] * num_negative_samples
-
-  if shuffle:
-    if seed is None:
-      seed = random.randint(0, 10e6)
-    random.seed(seed)
-    random.shuffle(couples)
-    random.seed(seed)
-    random.shuffle(labels)
-
-  return couples, labels
-
-
-def _remove_long_seq(maxlen, seq, label):
-  """Removes sequences that exceed the maximum length.
-
-  Arguments:
-      maxlen: Int, maximum length of the output sequences.
-      seq: List of lists, where each sublist is a sequence.
-      label: List where each element is an integer.
-
-  Returns:
-      new_seq, new_label: shortened lists for `seq` and `label`.
-  """
-  new_seq, new_label = [], []
-  for x, y in zip(seq, label):
-    if len(x) < maxlen:
-      new_seq.append(x)
-      new_label.append(y)
-  return new_seq, new_label
-
-
-@tf_export('keras.preprocessing.sequence.TimeseriesGenerator')
-class TimeseriesGenerator(Sequence):
-  """Utility class for generating batches of temporal data.
-
-  This class takes in a sequence of data-points gathered at
-  equal intervals, along with time series parameters such as
-  stride, length of history, etc., to produce batches for
-  training/validation.
-
-  Arguments:
-      data: Indexable generator (such as list or Numpy array)
-          containing consecutive data points (timesteps).
-          The data should be at 2D, and axis 0 is expected
-          to be the time dimension.
-      targets: Targets corresponding to timesteps in `data`.
-          It should have same length as `data`.
-      length: Length of the output sequences (in number of timesteps).
-      sampling_rate: Period between successive individual timesteps
-          within sequences. For rate `r`, timesteps
-          `data[i]`, `data[i-r]`, ... `data[i - length]`
-          are used for create a sample sequence.
-      stride: Period between successive output sequences.
-          For stride `s`, consecutive output samples would
-          be centered around `data[i]`, `data[i+s]`, `data[i+2*s]`, etc.
-      start_index, end_index: Data points earlier than `start_index`
-          or later than `end_index` will not be used in the output sequences.
-          This is useful to reserve part of the data for test or validation.
-      shuffle: Whether to shuffle output samples,
-          or instead draw them in chronological order.
-      reverse: Boolean: if `true`, timesteps in each output sample will be
-          in reverse chronological order.
-      batch_size: Number of timeseries samples in each batch
-          (except maybe the last one).
-
-  Returns:
-      A [Sequence](/utils/#sequence) instance.
-
-  Examples:
-
-  ```python
-  from keras.preprocessing.sequence import TimeseriesGenerator
-  import numpy as np
-
-  data = np.array([[i] for i in range(50)])
-  targets = np.array([[i] for i in range(50)])
-
-  data_gen = TimeseriesGenerator(data, targets,
-                                 length=10, sampling_rate=2,
-                                 batch_size=2)
-  assert len(data_gen) == 20
-
-  batch_0 = data_gen[0]
-  x, y = batch_0
-  assert np.array_equal(x,
-                        np.array([[[0], [2], [4], [6], [8]],
-                                  [[1], [3], [5], [7], [9]]]))
-  assert np.array_equal(y,
-                        np.array([[10], [11]]))
-  ```
-  """
-
-  def __init__(self,
-               data,
-               targets,
-               length,
-               sampling_rate=1,
-               stride=1,
-               start_index=0,
-               end_index=None,
-               shuffle=False,
-               reverse=False,
-               batch_size=128):
-    self.data = data
-    self.targets = targets
-    self.length = length
-    self.sampling_rate = sampling_rate
-    self.stride = stride
-    self.start_index = start_index + length
-    if end_index is None:
-      end_index = len(data) - 1
-    self.end_index = end_index
-    self.shuffle = shuffle
-    self.reverse = reverse
-    self.batch_size = batch_size
-
-    if self.start_index > self.end_index:
-      raise ValueError('`start_index+length=%i > end_index=%i` '
-                       'is disallowed, as no part of the sequence '
-                       'would be left to be used as current step.' %
-                       (self.start_index, self.end_index))
-
-  def __len__(self):
-    length = int(
-        np.ceil((self.end_index - self.start_index + 1) /
-                (self.batch_size * self.stride)))
-    return length if length >= 0 else 0
-
-  def _empty_batch(self, num_rows):
-    samples_shape = [num_rows, self.length // self.sampling_rate]
-    samples_shape.extend(self.data.shape[1:])
-    targets_shape = [num_rows]
-    targets_shape.extend(self.targets.shape[1:])
-    return np.empty(samples_shape), np.empty(targets_shape)
-
-  def __getitem__(self, index):
-    if self.shuffle:
-      rows = np.random.randint(
-          self.start_index, self.end_index + 1, size=self.batch_size)
-    else:
-      i = self.start_index + self.batch_size * self.stride * index
-      rows = np.arange(
-          i, min(i + self.batch_size * self.stride, self.end_index + 1),
-          self.stride)
-
-    samples, targets = self._empty_batch(len(rows))
-    for j in range(len(rows)):
-      indices = range(rows[j] - self.length, rows[j], self.sampling_rate)
-      samples[j] = self.data[indices]
-      targets[j] = self.targets[rows[j]]
-    if self.reverse:
-      return samples[:, ::-1, ...], targets
-    return samples, targets
+tf_export('keras.preprocessing.sequence.pad_sequences')(pad_sequences)
+tf_export(
+    'keras.preprocessing.sequence.make_sampling_table')(make_sampling_table)
+tf_export('keras.preprocessing.sequence.skipgrams')(skipgrams)
+tf_export(
+    'keras.preprocessing.sequence.TimeseriesGenerator')(TimeseriesGenerator)
diff --git a/tensorflow/python/keras/preprocessing/text.py b/tensorflow/python/keras/preprocessing/text.py
index f3b57de..57e5d00 100644
--- a/tensorflow/python/keras/preprocessing/text.py
+++ b/tensorflow/python/keras/preprocessing/text.py
@@ -14,383 +14,22 @@
 # ==============================================================================
 """Utilities for text input preprocessing.
 """
+# pylint: disable=invalid-name
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from collections import OrderedDict
-from hashlib import md5
-import string
-import sys
+from keras_preprocessing import text
 
-import numpy as np
-from six.moves import range  # pylint: disable=redefined-builtin
-from six.moves import zip  # pylint: disable=redefined-builtin
-
-from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util.tf_export import tf_export
 
+text_to_word_sequence = text.text_to_word_sequence
+one_hot = text.one_hot
+hashing_trick = text.hashing_trick
+Tokenizer = text.Tokenizer
 
-if sys.version_info < (3,):
-  maketrans = string.maketrans
-else:
-  maketrans = str.maketrans
-
-
-@tf_export('keras.preprocessing.text.text_to_word_sequence')
-def text_to_word_sequence(text,
-                          filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
-                          lower=True,
-                          split=' '):
-  r"""Converts a text to a sequence of words (or tokens).
-
-  Arguments:
-      text: Input text (string).
-      filters: list (or concatenation) of characters to filter out, such as
-          punctuation. Default: '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
-          includes basic punctuation, tabs, and newlines.
-      lower: boolean, whether to convert the input to lowercase.
-      split: string, separator for word splitting.
-
-  Returns:
-      A list of words (or tokens).
-  """
-  if lower:
-    text = text.lower()
-
-  if sys.version_info < (3,):
-    if isinstance(text, unicode):
-      translate_map = dict((ord(c), unicode(split)) for c in filters)
-      text = text.translate(translate_map)
-    elif len(split) == 1:
-      translate_map = maketrans(filters, split * len(filters))
-      text = text.translate(translate_map)
-    else:
-      for c in filters:
-        text = text.replace(c, split)
-  else:
-    translate_dict = dict((c, split) for c in filters)
-    translate_map = maketrans(translate_dict)
-    text = text.translate(translate_map)
-
-  seq = text.split(split)
-  return [i for i in seq if i]
-
-
-@tf_export('keras.preprocessing.text.one_hot')
-def one_hot(text,
-            n,
-            filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
-            lower=True,
-            split=' '):
-  r"""One-hot encodes a text into a list of word indexes of size n.
-
-  This is a wrapper to the `hashing_trick` function using `hash` as the
-  hashing function; unicity of word to index mapping non-guaranteed.
-
-  Arguments:
-      text: Input text (string).
-      n: int, size of vocabulary.
-      filters: list (or concatenation) of characters to filter out, such as
-          punctuation. Default: '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
-          includes basic punctuation, tabs, and newlines.
-      lower: boolean, whether to set the text to lowercase.
-      split: string, separator for word splitting.
-
-  Returns:
-      List of integers in [1, n].
-      Each integer encodes a word (unicity non-guaranteed).
-  """
-  return hashing_trick(
-      text, n, hash_function=hash, filters=filters, lower=lower, split=split)
-
-
-@tf_export('keras.preprocessing.text.hashing_trick')
-def hashing_trick(text,
-                  n,
-                  hash_function=None,
-                  filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
-                  lower=True,
-                  split=' '):
-  r"""Converts a text to a sequence of indexes in a fixed-size hashing space.
-
-  Arguments:
-      text: Input text (string).
-      n: Dimension of the hashing space.
-      hash_function: defaults to python `hash` function, can be 'md5' or
-          any function that takes in input a string and returns a int.
-          Note that 'hash' is not a stable hashing function, so
-          it is not consistent across different runs, while 'md5'
-          is a stable hashing function.
-      filters: list (or concatenation) of characters to filter out, such as
-          punctuation. Default: '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
-          includes basic punctuation, tabs, and newlines.
-      lower: boolean, whether to set the text to lowercase.
-      split: string, separator for word splitting.
-
-  Returns:
-      A list of integer word indices (unicity non-guaranteed).
-
-  `0` is a reserved index that won't be assigned to any word.
-
-  Two or more words may be assigned to the same index, due to possible
-  collisions by the hashing function.
-  The
-  probability
-  of a collision is in relation to the dimension of the hashing space and
-  the number of distinct objects.
-  """
-  if hash_function is None:
-    hash_function = hash
-  elif hash_function == 'md5':
-    hash_function = lambda w: int(md5(w.encode()).hexdigest(), 16)
-
-  seq = text_to_word_sequence(text, filters=filters, lower=lower, split=split)
-  return [(hash_function(w) % (n - 1) + 1) for w in seq]
-
-
-@tf_export('keras.preprocessing.text.Tokenizer')
-class Tokenizer(object):
-  """Text tokenization utility class.
-
-  This class allows to vectorize a text corpus, by turning each
-  text into either a sequence of integers (each integer being the index
-  of a token in a dictionary) or into a vector where the coefficient
-  for each token could be binary, based on word count, based on tf-idf...
-
-  Arguments:
-      num_words: the maximum number of words to keep, based
-          on word frequency. Only the most common `num_words` words will
-          be kept.
-      filters: a string where each element is a character that will be
-          filtered from the texts. The default is all punctuation, plus
-          tabs and line breaks, minus the `'` character.
-      lower: boolean. Whether to convert the texts to lowercase.
-      split: string, separator for word splitting.
-      char_level: if True, every character will be treated as a token.
-      oov_token: if given, it will be added to word_index and used to
-          replace out-of-vocabulary words during text_to_sequence calls
-
-  By default, all punctuation is removed, turning the texts into
-  space-separated sequences of words
-  (words maybe include the `'` character). These sequences are then
-  split into lists of tokens. They will then be indexed or vectorized.
-
-  `0` is a reserved index that won't be assigned to any word.
-  """
-
-  def __init__(self,
-               num_words=None,
-               filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
-               lower=True,
-               split=' ',
-               char_level=False,
-               oov_token=None,
-               **kwargs):
-    # Legacy support
-    if 'nb_words' in kwargs:
-      logging.warning('The `nb_words` argument in `Tokenizer` '
-                      'has been renamed `num_words`.')
-      num_words = kwargs.pop('nb_words')
-    if kwargs:
-      raise TypeError('Unrecognized keyword arguments: ' + str(kwargs))
-
-    self.word_counts = OrderedDict()
-    self.word_docs = {}
-    self.filters = filters
-    self.split = split
-    self.lower = lower
-    self.num_words = num_words
-    self.document_count = 0
-    self.char_level = char_level
-    self.oov_token = oov_token
-    self.index_docs = {}
-
-  def fit_on_texts(self, texts):
-    """Updates internal vocabulary based on a list of texts.
-
-    In the case where texts contains lists, we assume each entry of the lists
-    to be a token.
-
-    Required before using `texts_to_sequences` or `texts_to_matrix`.
-
-    Arguments:
-        texts: can be a list of strings,
-            a generator of strings (for memory-efficiency),
-            or a list of list of strings.
-    """
-    for text in texts:
-      self.document_count += 1
-      if self.char_level or isinstance(text, list):
-        seq = text
-      else:
-        seq = text_to_word_sequence(text, self.filters, self.lower, self.split)
-      for w in seq:
-        if w in self.word_counts:
-          self.word_counts[w] += 1
-        else:
-          self.word_counts[w] = 1
-      for w in set(seq):
-        if w in self.word_docs:
-          self.word_docs[w] += 1
-        else:
-          self.word_docs[w] = 1
-
-    wcounts = list(self.word_counts.items())
-    wcounts.sort(key=lambda x: x[1], reverse=True)
-    sorted_voc = [wc[0] for wc in wcounts]
-    # note that index 0 is reserved, never assigned to an existing word
-    self.word_index = dict(
-        list(zip(sorted_voc, list(range(1,
-                                        len(sorted_voc) + 1)))))
-
-    if self.oov_token is not None:
-      i = self.word_index.get(self.oov_token)
-      if i is None:
-        self.word_index[self.oov_token] = len(self.word_index) + 1
-
-    for w, c in list(self.word_docs.items()):
-      self.index_docs[self.word_index[w]] = c
-
-  def fit_on_sequences(self, sequences):
-    """Updates internal vocabulary based on a list of sequences.
-
-    Required before using `sequences_to_matrix`
-    (if `fit_on_texts` was never called).
-
-    Arguments:
-        sequences: A list of sequence.
-            A "sequence" is a list of integer word indices.
-    """
-    self.document_count += len(sequences)
-    for seq in sequences:
-      seq = set(seq)
-      for i in seq:
-        if i not in self.index_docs:
-          self.index_docs[i] = 1
-        else:
-          self.index_docs[i] += 1
-
-  def texts_to_sequences(self, texts):
-    """Transforms each text in texts in a sequence of integers.
-
-    Only top "num_words" most frequent words will be taken into account.
-    Only words known by the tokenizer will be taken into account.
-
-    Arguments:
-        texts: A list of texts (strings).
-
-    Returns:
-        A list of sequences.
-    """
-    res = []
-    for vect in self.texts_to_sequences_generator(texts):
-      res.append(vect)
-    return res
-
-  def texts_to_sequences_generator(self, texts):
-    """Transforms each text in `texts` in a sequence of integers.
-
-    Each item in texts can also be a list, in which case we assume each item of
-    that list
-    to be a token.
-
-    Only top "num_words" most frequent words will be taken into account.
-    Only words known by the tokenizer will be taken into account.
-
-    Arguments:
-        texts: A list of texts (strings).
-
-    Yields:
-        Yields individual sequences.
-    """
-    num_words = self.num_words
-    for text in texts:
-      if self.char_level or isinstance(text, list):
-        seq = text
-      else:
-        seq = text_to_word_sequence(text, self.filters, self.lower, self.split)
-      vect = []
-      for w in seq:
-        i = self.word_index.get(w)
-        if i is not None:
-          if num_words and i >= num_words:
-            continue
-          else:
-            vect.append(i)
-        elif self.oov_token is not None:
-          i = self.word_index.get(self.oov_token)
-          if i is not None:
-            vect.append(i)
-      yield vect
-
-  def texts_to_matrix(self, texts, mode='binary'):
-    """Convert a list of texts to a Numpy matrix.
-
-    Arguments:
-        texts: list of strings.
-        mode: one of "binary", "count", "tfidf", "freq".
-
-    Returns:
-        A Numpy matrix.
-    """
-    sequences = self.texts_to_sequences(texts)
-    return self.sequences_to_matrix(sequences, mode=mode)
-
-  def sequences_to_matrix(self, sequences, mode='binary'):
-    """Converts a list of sequences into a Numpy matrix.
-
-    Arguments:
-        sequences: list of sequences
-            (a sequence is a list of integer word indices).
-        mode: one of "binary", "count", "tfidf", "freq"
-
-    Returns:
-        A Numpy matrix.
-
-    Raises:
-        ValueError: In case of invalid `mode` argument,
-            or if the Tokenizer requires to be fit to sample data.
-    """
-    if not self.num_words:
-      if self.word_index:
-        num_words = len(self.word_index) + 1
-      else:
-        raise ValueError('Specify a dimension (num_words argument), '
-                         'or fit on some text data first.')
-    else:
-      num_words = self.num_words
-
-    if mode == 'tfidf' and not self.document_count:
-      raise ValueError('Fit the Tokenizer on some data '
-                       'before using tfidf mode.')
-
-    x = np.zeros((len(sequences), num_words))
-    for i, seq in enumerate(sequences):
-      if not seq:
-        continue
-      counts = {}
-      for j in seq:
-        if j >= num_words:
-          continue
-        if j not in counts:
-          counts[j] = 1.
-        else:
-          counts[j] += 1
-      for j, c in list(counts.items()):
-        if mode == 'count':
-          x[i][j] = c
-        elif mode == 'freq':
-          x[i][j] = c / len(seq)
-        elif mode == 'binary':
-          x[i][j] = 1
-        elif mode == 'tfidf':
-          # Use weighting scheme 2 in
-          # https://en.wikipedia.org/wiki/Tf%E2%80%93idf
-          tf = 1 + np.log(c)
-          idf = np.log(1 + self.document_count /
-                       (1 + self.index_docs.get(j, 0)))
-          x[i][j] = tf * idf
-        else:
-          raise ValueError('Unknown vectorization mode:', mode)
-    return x
+tf_export(
+    'keras.preprocessing.text.text_to_word_sequence')(text_to_word_sequence)
+tf_export('keras.preprocessing.text.one_hot')(one_hot)
+tf_export('keras.preprocessing.text.hashing_trick')(hashing_trick)
+tf_export('keras.preprocessing.text.Tokenizer')(Tokenizer)
diff --git a/tensorflow/python/keras/utils/__init__.py b/tensorflow/python/keras/utils/__init__.py
index 69337b6..c442b31 100644
--- a/tensorflow/python/keras/utils/__init__.py
+++ b/tensorflow/python/keras/utils/__init__.py
@@ -31,6 +31,7 @@
 from tensorflow.python.keras.utils.generic_utils import serialize_keras_object
 from tensorflow.python.keras.utils.io_utils import HDF5Matrix
 from tensorflow.python.keras.utils.layer_utils import convert_all_kernels_in_model
+from tensorflow.python.keras.utils.layer_utils import get_source_inputs
 from tensorflow.python.keras.utils.multi_gpu_utils import multi_gpu_model
 from tensorflow.python.keras.utils.np_utils import normalize
 from tensorflow.python.keras.utils.np_utils import to_categorical
diff --git a/tensorflow/python/keras/utils/conv_utils.py b/tensorflow/python/keras/utils/conv_utils.py
index 5419e7a..3a176c3 100644
--- a/tensorflow/python/keras/utils/conv_utils.py
+++ b/tensorflow/python/keras/utils/conv_utils.py
@@ -18,6 +18,7 @@
 from __future__ import division
 from __future__ import print_function
 
+import itertools
 import numpy as np
 from six.moves import range  # pylint: disable=redefined-builtin
 
@@ -199,3 +200,168 @@
   no_flip = (slice(None, None), slice(None, None))
   slices[-2:] = no_flip
   return np.copy(kernel[slices])
+
+
+def conv_kernel_mask(input_shape, kernel_shape, strides, padding):
+  """Compute a mask representing the connectivity of a convolution operation.
+
+  Assume a convolution with given parameters is applied to an input having N
+  spatial dimensions with `input_shape = (d_in1, ..., d_inN)` to produce an
+  output with shape `(d_out1, ..., d_outN)`. This method returns a boolean array
+  of shape `(d_in1, ..., d_inN, d_out1, ..., d_outN)` with `True` entries
+  indicating pairs of input and output locations that are connected by a weight.
+
+  Example:
+    ```python
+        >>> input_shape = (4,)
+        >>> kernel_shape = (2,)
+        >>> strides = (1,)
+        >>> padding = "valid"
+        >>> conv_kernel_mask(input_shape, kernel_shape, strides, padding)
+        array([[ True, False, False],
+               [ True,  True, False],
+               [False,  True,  True],
+               [False, False,  True]], dtype=bool)
+    ```
+    where rows and columns correspond to inputs and outputs respectively.
+
+
+  Args:
+    input_shape: tuple of size N: `(d_in1, ..., d_inN)`,
+                 spatial shape of the input.
+    kernel_shape: tuple of size N, spatial shape of the convolutional kernel
+                  / receptive field.
+    strides: tuple of size N, strides along each spatial dimension.
+    padding: type of padding, string `"same"` or `"valid"`.
+
+  Returns:
+    A boolean 2N-D `np.ndarray` of shape
+    `(d_in1, ..., d_inN, d_out1, ..., d_outN)`, where `(d_out1, ..., d_outN)`
+    is the spatial shape of the output. `True` entries in the mask represent
+    pairs of input-output locations that are connected by a weight.
+
+  Raises:
+    ValueError: if `input_shape`, `kernel_shape` and `strides` don't have the
+        same number of dimensions.
+    NotImplementedError: if `padding` is not in {`"same"`, `"valid"`}.
+  """
+  if padding not in {'same', 'valid'}:
+    raise NotImplementedError('Padding type %s not supported. '
+                              'Only "valid" and "same" '
+                              'are implemented.' % padding)
+
+  in_dims = len(input_shape)
+  if isinstance(kernel_shape, int):
+    kernel_shape = (kernel_shape,) * in_dims
+  if isinstance(strides, int):
+    strides = (strides,) * in_dims
+
+  kernel_dims = len(kernel_shape)
+  stride_dims = len(strides)
+  if kernel_dims != in_dims or stride_dims != in_dims:
+    raise ValueError('Number of strides, input and kernel dimensions must all '
+                     'match. Received: %d, %d, %d.' %
+                     (stride_dims, in_dims, kernel_dims))
+
+  output_shape = conv_output_shape(input_shape, kernel_shape, strides, padding)
+
+  mask_shape = input_shape + output_shape
+  mask = np.zeros(mask_shape, np.bool)
+
+  output_axes_ticks = [range(dim) for dim in output_shape]
+  for output_position in itertools.product(*output_axes_ticks):
+    input_axes_ticks = conv_connected_inputs(input_shape,
+                                             kernel_shape,
+                                             output_position,
+                                             strides,
+                                             padding)
+    for input_position in itertools.product(*input_axes_ticks):
+      mask[input_position + output_position] = True
+
+  return mask
+
+
+def conv_connected_inputs(input_shape,
+                          kernel_shape,
+                          output_position,
+                          strides,
+                          padding):
+  """Return locations of the input connected to an output position.
+
+  Assume a convolution with given parameters is applied to an input having N
+  spatial dimensions with `input_shape = (d_in1, ..., d_inN)`. This method
+  returns N ranges specifying the input region that was convolved with the
+  kernel to produce the output at position
+  `output_position = (p_out1, ..., p_outN)`.
+
+  Example:
+    ```python
+        >>> input_shape = (4, 4)
+        >>> kernel_shape = (2, 1)
+        >>> output_position = (1, 1)
+        >>> strides = (1, 1)
+        >>> padding = "valid"
+        >>> conv_connected_inputs(input_shape, kernel_shape, output_position,
+        >>>                       strides, padding)
+        [xrange(1, 3), xrange(1, 2)]
+    ```
+  Args:
+    input_shape: tuple of size N: `(d_in1, ..., d_inN)`,
+                 spatial shape of the input.
+    kernel_shape: tuple of size N, spatial shape of the convolutional kernel
+                  / receptive field.
+    output_position: tuple of size N: `(p_out1, ..., p_outN)`,
+                     a single position in the output of the convolution.
+    strides: tuple of size N, strides along each spatial dimension.
+    padding: type of padding, string `"same"` or `"valid"`.
+
+  Returns:
+    N ranges `[[p_in_left1, ..., p_in_right1], ...,
+              [p_in_leftN, ..., p_in_rightN]]` specifying the region in the
+    input connected to output_position.
+  """
+  ranges = []
+
+  ndims = len(input_shape)
+  for d in range(ndims):
+    left_shift = int(kernel_shape[d] / 2)
+    right_shift = kernel_shape[d] - left_shift
+
+    center = output_position[d] * strides[d]
+
+    if padding == 'valid':
+      center += left_shift
+
+    start = max(0, center - left_shift)
+    end = min(input_shape[d], center + right_shift)
+
+    ranges.append(range(start, end))
+
+  return ranges
+
+
+def conv_output_shape(input_shape, kernel_shape, strides, padding):
+  """Return the output shape of an N-D convolution.
+
+  Forces dimensions where input is empty (size 0) to remain empty.
+
+  Args:
+    input_shape: tuple of size N: `(d_in1, ..., d_inN)`,
+                 spatial shape of the input.
+    kernel_shape: tuple of size N, spatial shape of the convolutional kernel
+                  / receptive field.
+    strides: tuple of size N, strides along each spatial dimension.
+    padding: type of padding, string `"same"` or `"valid"`.
+
+  Returns:
+    tuple of size N: `(d_out1, ..., d_outN)`, spatial shape of the output.
+  """
+  dims = range(len(kernel_shape))
+  output_shape = [conv_output_length(input_shape[d],
+                                     kernel_shape[d],
+                                     padding,
+                                     strides[d])
+                  for d in dims]
+  output_shape = tuple([0 if input_shape[d] == 0 else output_shape[d]
+                        for d in dims])
+  return output_shape
diff --git a/tensorflow/python/keras/utils/conv_utils_test.py b/tensorflow/python/keras/utils/conv_utils_test.py
new file mode 100644
index 0000000..eb2a360
--- /dev/null
+++ b/tensorflow/python/keras/utils/conv_utils_test.py
@@ -0,0 +1,232 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for conv_utils."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import itertools
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.keras.utils import conv_utils
+from tensorflow.python.platform import test
+
+
+def _get_const_output_shape(input_shape, dim):
+  return tuple([min(d, dim) for d in input_shape])
+
+
+input_shapes = [
+    (0,),
+    (0, 0),
+    (1,),
+    (2,),
+    (3,),
+    (1, 0),
+    (0, 3),
+    (1, 1),
+    (1, 2),
+    (3, 1),
+    (2, 2),
+    (3, 3),
+    (1, 0, 1),
+    (5, 2, 3),
+    (3, 5, 6, 7, 0),
+    (3, 2, 2, 4, 4),
+    (1, 2, 3, 4, 7, 2),
+]
+
+
+@parameterized.parameters(input_shapes)
+class TestConvUtils(test.TestCase, parameterized.TestCase):
+
+  def test_conv_kernel_mask_fc(self, *input_shape):
+    padding = 'valid'
+    kernel_shape = input_shape
+    ndims = len(input_shape)
+    strides = (1,) * ndims
+    output_shape = _get_const_output_shape(input_shape, dim=1)
+    mask = np.ones(input_shape + output_shape, np.bool)
+    self.assertAllEqual(
+        mask,
+        conv_utils.conv_kernel_mask(
+            input_shape,
+            kernel_shape,
+            strides,
+            padding
+        )
+    )
+
+  def test_conv_kernel_mask_diag(self, *input_shape):
+    ndims = len(input_shape)
+    kernel_shape = (1,) * ndims
+    strides = (1,) * ndims
+
+    for padding in ['valid', 'same']:
+      mask = np.identity(int(np.prod(input_shape)), np.bool)
+      mask = np.reshape(mask, input_shape * 2)
+      self.assertAllEqual(
+          mask,
+          conv_utils.conv_kernel_mask(
+              input_shape,
+              kernel_shape,
+              strides,
+              padding
+          )
+      )
+
+  def test_conv_kernel_mask_full_stride(self, *input_shape):
+    padding = 'valid'
+    ndims = len(input_shape)
+    kernel_shape = (1,) * ndims
+    strides = tuple([max(d, 1) for d in input_shape])
+    output_shape = _get_const_output_shape(input_shape, dim=1)
+
+    mask = np.zeros(input_shape + output_shape, np.bool)
+    if all(d > 0 for d in mask.shape):
+      mask[(0,) * len(output_shape)] = True
+
+    self.assertAllEqual(
+        mask,
+        conv_utils.conv_kernel_mask(
+            input_shape,
+            kernel_shape,
+            strides,
+            padding
+        )
+    )
+
+  def test_conv_kernel_mask_almost_full_stride(self, *input_shape):
+    padding = 'valid'
+    ndims = len(input_shape)
+    kernel_shape = (1,) * ndims
+    strides = tuple([max(d - 1, 1) for d in input_shape])
+    output_shape = _get_const_output_shape(input_shape, dim=2)
+
+    mask = np.zeros(input_shape + output_shape, np.bool)
+    if all(d > 0 for d in mask.shape):
+      for in_position in itertools.product(*[[0, d - 1] for d in input_shape]):
+        out_position = tuple([min(p, 1) for p in in_position])
+        mask[in_position + out_position] = True
+
+    self.assertAllEqual(
+        mask,
+        conv_utils.conv_kernel_mask(
+            input_shape,
+            kernel_shape,
+            strides,
+            padding
+        )
+    )
+
+  def test_conv_kernel_mask_rect_kernel(self, *input_shape):
+    padding = 'valid'
+    ndims = len(input_shape)
+    strides = (1,) * ndims
+
+    for d in range(ndims):
+      kernel_shape = [1] * ndims
+      kernel_shape[d] = input_shape[d]
+
+      output_shape = list(input_shape)
+      output_shape[d] = min(1, input_shape[d])
+
+      mask = np.identity(int(np.prod(input_shape)), np.bool)
+      mask = np.reshape(mask, input_shape * 2)
+
+      for p in itertools.product(*[range(input_shape[dim])
+                                   for dim in range(ndims)]):
+        p = list(p)
+        p[d] = slice(None)
+        mask[p * 2] = True
+
+      mask = np.take(mask, range(0, min(1, input_shape[d])), ndims + d)
+
+      self.assertAllEqual(
+          mask,
+          conv_utils.conv_kernel_mask(
+              input_shape,
+              kernel_shape,
+              strides,
+              padding
+          )
+      )
+
+  def test_conv_kernel_mask_wrong_padding(self, *input_shape):
+    ndims = len(input_shape)
+    kernel_shape = (1,) * ndims
+    strides = (1,) * ndims
+
+    conv_utils.conv_kernel_mask(
+        input_shape,
+        kernel_shape,
+        strides,
+        'valid'
+    )
+
+    conv_utils.conv_kernel_mask(
+        input_shape,
+        kernel_shape,
+        strides,
+        'same'
+    )
+
+    self.assertRaises(NotImplementedError,
+                      conv_utils.conv_kernel_mask,
+                      input_shape, kernel_shape, strides, 'full')
+
+  def test_conv_kernel_mask_wrong_dims(self, *input_shape):
+    kernel_shape = 1
+    strides = 1
+
+    conv_utils.conv_kernel_mask(
+        input_shape,
+        kernel_shape,
+        strides,
+        'valid'
+    )
+
+    ndims = len(input_shape)
+
+    kernel_shape = (2,) * (ndims + 1)
+    self.assertRaises(ValueError,
+                      conv_utils.conv_kernel_mask,
+                      input_shape, kernel_shape, strides, 'same')
+
+    strides = (1,) * ndims
+    self.assertRaises(ValueError,
+                      conv_utils.conv_kernel_mask,
+                      input_shape, kernel_shape, strides, 'valid')
+
+    kernel_shape = (1,) * ndims
+    strides = (2,) * (ndims - 1)
+    self.assertRaises(ValueError,
+                      conv_utils.conv_kernel_mask,
+                      input_shape, kernel_shape, strides, 'valid')
+
+    strides = (2,) * ndims
+    conv_utils.conv_kernel_mask(
+        input_shape,
+        kernel_shape,
+        strides,
+        'valid'
+    )
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 2451dc7..c84ed9d 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -73,6 +73,17 @@
 )
 
 tf_py_test(
+    name = "batch_gather_op_test",
+    srcs = ["batch_gather_op_test.py"],
+    additional_deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+    ],
+)
+
+tf_py_test(
     name = "bcast_ops_test",
     size = "small",
     srcs = ["bcast_ops_test.py"],
@@ -949,6 +960,17 @@
 )
 
 tf_py_test(
+    name = "string_length_op_test",
+    size = "small",
+    srcs = ["string_length_op_test.py"],
+    additional_deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:string_ops",
+    ],
+)
+
+tf_py_test(
     name = "string_strip_op_test",
     size = "small",
     srcs = ["string_strip_op_test.py"],
@@ -2181,7 +2203,6 @@
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:parsing_ops",
     ],
-    tags = ["no_windows"],
 )
 
 cuda_py_test(
diff --git a/tensorflow/python/kernel_tests/array_ops_test.py b/tensorflow/python/kernel_tests/array_ops_test.py
index 4056757..81442d1 100644
--- a/tensorflow/python/kernel_tests/array_ops_test.py
+++ b/tensorflow/python/kernel_tests/array_ops_test.py
@@ -245,6 +245,7 @@
         array_ops.boolean_mask(tensor, mask).eval()
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class OperatorShapeTest(test_util.TensorFlowTestCase):
 
   def testExpandScalar(self):
@@ -262,7 +263,8 @@
     matrix_squeezed = array_ops.squeeze(matrix, [0])
     self.assertEqual(matrix_squeezed.get_shape(), (3))
 
-    with self.assertRaises(ValueError):
+    with self.assertRaisesRegexp(
+        Exception, "Can not squeeze dim.1., expected a dimension of 1, got 3"):
       matrix_squeezed = array_ops.squeeze(matrix, [1])
 
   def testSqueezeScalarDim(self):
@@ -270,6 +272,11 @@
     matrix_squeezed = array_ops.squeeze(matrix, 0)
     self.assertEqual(matrix_squeezed.get_shape(), (3))
 
+  def testExpandDimsWithNonScalarDim(self):
+    with self.assertRaisesRegexp(Exception,
+                                 "must be a tensor with a single value"):
+      array_ops.expand_dims(1, axis=[0, 1])
+
 
 class ReverseV2Test(test_util.TensorFlowTestCase):
 
diff --git a/tensorflow/python/kernel_tests/as_string_op_test.py b/tensorflow/python/kernel_tests/as_string_op_test.py
index 94ed8eb..51aa17b 100644
--- a/tensorflow/python/kernel_tests/as_string_op_test.py
+++ b/tensorflow/python/kernel_tests/as_string_op_test.py
@@ -160,7 +160,7 @@
     complex_inputs_ = [(x + (x + 1) * 1j) for x in float_inputs_]
 
     with self.test_session():
-      for dtype in (dtypes.complex64,):
+      for dtype in (dtypes.complex64, dtypes.complex128):
         input_ = array_ops.placeholder(dtype)
 
         def clean_nans(s_l):
diff --git a/tensorflow/python/kernel_tests/batch_gather_op_test.py b/tensorflow/python/kernel_tests/batch_gather_op_test.py
new file mode 100644
index 0000000..8e7ae89
--- /dev/null
+++ b/tensorflow/python/kernel_tests/batch_gather_op_test.py
@@ -0,0 +1,116 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tensorflow.ops.tf.gather."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+_TEST_TYPES = (dtypes.int64, dtypes.float32,
+               dtypes.complex64, dtypes.complex128)
+
+
+class GatherTest(test.TestCase):
+
+  def _buildParams(self, data, dtype):
+    data = data.astype(dtype.as_numpy_dtype)
+    # For complex types, add an index-dependent imaginary component so we can
+    # tell we got the right value.
+    if dtype.is_complex:
+      return data + 10j * data
+    return data
+
+  def testSimpleGather(self):
+    data = np.array([0, 1, 2, 3, 7, 5, 8, 9, 10, 11, 15, 13])
+    indices = [3, 4]
+    with self.test_session(use_gpu=True):
+      for dtype in _TEST_TYPES:
+        params_np = self._buildParams(data, dtype)
+        params = constant_op.constant(params_np)
+        indices_tf = constant_op.constant(indices)
+        gather_t = array_ops.batch_gather(params, indices_tf)
+        expected_result = np.array([3, 7])
+        np_val = self._buildParams(expected_result, dtype)
+        gather_val = gather_t.eval()
+        self.assertAllEqual(np_val, gather_val)
+        self.assertEqual(np_val.shape, gather_t.get_shape())
+
+  def test2DArray(self):
+    data = np.array([[0, 1, 2, 3, 7, 5], [8, 9, 10, 11, 15, 13]])
+    indices = [[3], [4]]
+    with self.test_session(use_gpu=True):
+      for dtype in _TEST_TYPES:
+        params_np = self._buildParams(data, dtype)
+        params = constant_op.constant(params_np)
+        indices_tf = constant_op.constant(indices)
+        gather_t = array_ops.batch_gather(params, indices_tf)
+        expected_result = np.array([[3], [15]])
+        np_val = self._buildParams(expected_result, dtype)
+        gather_val = gather_t.eval()
+        self.assertAllEqual(np_val, gather_val)
+        self.assertEqual(np_val.shape, gather_t.get_shape())
+
+  def testHigherRank(self):
+    data = np.array([[[0, 1, 2], [3, 7, 5]], [[8, 9, 10], [11, 15, 13]]])
+    indices = [[[2, 0], [1, 2]], [[2, 0], [0, 1]]]
+    with self.test_session(use_gpu=True):
+      for dtype in _TEST_TYPES:
+        params_np = self._buildParams(data, dtype)
+        params = constant_op.constant(params_np)
+        indices_tf = constant_op.constant(indices)
+        gather_t = array_ops.batch_gather(params, indices_tf)
+        gather_val = gather_t.eval()
+        expected_result = np.array([[[2, 0], [7, 5]], [[10, 8], [11, 15]]])
+        np_val = self._buildParams(expected_result, dtype)
+        self.assertAllEqual(np_val, gather_val)
+        self.assertEqual(np_val.shape, gather_t.get_shape())
+
+  def testString(self):
+    params = np.array([[b"asdf", b"zxcv"], [b"qwer", b"uiop"]])
+    with self.test_session():
+      indices_tf = constant_op.constant([1])
+      self.assertAllEqual([[b"qwer", b"uiop"]],
+                          array_ops.batch_gather(params, indices_tf).eval())
+
+  def testUnknownIndices(self):
+    params = constant_op.constant([[0, 1, 2]])
+    indices = array_ops.placeholder(dtypes.int32, shape=[None, None])
+    gather_t = array_ops.batch_gather(params, indices)
+    self.assertEqual([1, None], gather_t.get_shape().as_list())
+
+  def testBadIndicesCPU(self):
+    with self.test_session(use_gpu=False):
+      params = [[0, 1, 2], [3, 4, 5]]
+      with self.assertRaisesOpError(r"indices\[0\] = 7 is not in \[0, 2\)"):
+        array_ops.batch_gather(params, [7]).eval()
+
+  def testEmptySlices(self):
+    with self.test_session(use_gpu=True):
+      for dtype in _TEST_TYPES:
+        for itype in np.int32, np.int64:
+          params = np.zeros((7, 0, 0), dtype=dtype.as_numpy_dtype)
+          indices = np.array([3, 4], dtype=itype)
+          gather = array_ops.batch_gather(params, indices)
+          self.assertAllEqual(gather.eval(), np.zeros((2, 0, 0)))
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/clip_ops_test.py b/tensorflow/python/kernel_tests/clip_ops_test.py
index fb52d10..400d38b 100644
--- a/tensorflow/python/kernel_tests/clip_ops_test.py
+++ b/tensorflow/python/kernel_tests/clip_ops_test.py
@@ -22,6 +22,7 @@
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
@@ -369,6 +370,21 @@
     self.assertAllClose(np_ans_0, tf_ans_1)
     self.assertAllClose(np_ans_1, tf_ans_2)
 
+  def testClipByGlobalNormInf(self):
+    with self.test_session(use_gpu=True):
+      x0 = constant_op.constant([-2.0, 0.0, np.inf, 4.0, 0.0, 0.0],
+                                shape=[2, 3])
+      x1 = constant_op.constant([1.0, -2.0])
+      clip_norm = 6.0
+
+      ans, norm = clip_ops.clip_by_global_norm([x0, x1], clip_norm)
+      with self.assertRaisesRegexp(errors.InvalidArgumentError, "global norm"):
+        norm.eval()
+      with self.assertRaisesRegexp(errors.InvalidArgumentError, "global norm"):
+        ans[0].eval()
+      with self.assertRaisesRegexp(errors.InvalidArgumentError, "global norm"):
+        ans[1].eval()
+
   def testClipByAverageNormClipped(self):
     # Norm clipping when average clip_norm < 0.83333333
     with self.test_session(use_gpu=True):
diff --git a/tensorflow/python/kernel_tests/cond_v2_test.py b/tensorflow/python/kernel_tests/cond_v2_test.py
index 97ce245..b991013 100644
--- a/tensorflow/python/kernel_tests/cond_v2_test.py
+++ b/tensorflow/python/kernel_tests/cond_v2_test.py
@@ -78,6 +78,20 @@
     self._testCond(true_fn, false_fn, [x, y])
     self._testCond(true_fn, false_fn, [y])
 
+  def testMultipleOutputs(self):
+    x = constant_op.constant(1.0, name="x")
+    y = constant_op.constant(3.0, name="y")
+
+    def true_fn():
+      return x * y, y
+
+    def false_fn():
+      return x, y * 3.0
+
+    self._testCond(true_fn, false_fn, [x])
+    self._testCond(true_fn, false_fn, [x, y])
+    self._testCond(true_fn, false_fn, [y])
+
   def testBasic2(self):
     x = constant_op.constant(1.0, name="x")
     y = constant_op.constant(2.0, name="y")
@@ -104,8 +118,8 @@
 
       out = cond_v2.cond_v2(pred, true_fn, false_fn)
 
-      self.assertEqual(sess.run(out, {pred: True}), [1.0])
-      self.assertEqual(sess.run(out, {pred: False}), [2.0])
+      self.assertEqual(sess.run(out, {pred: True}), (1.0,))
+      self.assertEqual(sess.run(out, {pred: False}), (2.0,))
 
   def _createCond(self, name):
     pred = constant_op.constant(True, name="pred")
@@ -243,6 +257,32 @@
     run_test(True)
     run_test(False)
 
+  def testNestedCondBothBranches(self):
+
+    def run_test(pred_value):
+
+      def build_graph():
+        pred = array_ops.placeholder(dtypes.bool, name="pred")
+        x = constant_op.constant(1.0, name="x")
+        y = constant_op.constant(2.0, name="y")
+
+        def true_fn():
+          return _cond(pred, lambda: x + y, lambda: x * x, name=None)
+
+        def false_fn():
+          return _cond(pred, lambda: x - y, lambda: y * y, name=None)
+
+        return x, y, pred, true_fn, false_fn
+
+      with ops.Graph().as_default():
+        x, y, pred, true_fn, false_fn = build_graph()
+        self._testCond(true_fn, false_fn, [x, y], {pred: pred_value})
+        self._testCond(true_fn, false_fn, [x], {pred: pred_value})
+        self._testCond(true_fn, false_fn, [y], {pred: pred_value})
+
+    run_test(True)
+    run_test(False)
+
   def testDoubleNestedCond(self):
 
     def run_test(pred1_value, pred2_value):
diff --git a/tensorflow/python/kernel_tests/confusion_matrix_test.py b/tensorflow/python/kernel_tests/confusion_matrix_test.py
index ae68753..93f5323 100644
--- a/tensorflow/python/kernel_tests/confusion_matrix_test.py
+++ b/tensorflow/python/kernel_tests/confusion_matrix_test.py
@@ -448,7 +448,7 @@
       }
       with self.assertRaisesRegexp(
           errors_impl.InvalidArgumentError,
-          "Tried to explicitly squeeze dimension 2"):
+          "Can not squeeze dim\[2\]"):
         dynamic_labels.eval(feed_dict=feed_dict)
       self.assertAllEqual(
           prediction_values, dynamic_predictions.eval(feed_dict=feed_dict))
@@ -475,7 +475,7 @@
           label_values, dynamic_labels.eval(feed_dict=feed_dict))
       with self.assertRaisesRegexp(
           errors_impl.InvalidArgumentError,
-          "Tried to explicitly squeeze dimension 2"):
+          "Can not squeeze dim\[2\]"):
         dynamic_predictions.eval(feed_dict=feed_dict)
 
 
diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
index b567b71..1a29d08 100644
--- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
+++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
@@ -647,7 +647,8 @@
     # feeding into the fill is dominated by a Switch.
     zero = graph.get_operation_by_name("gradients/zeros/Const")
     self.assertEqual(len(zero.control_inputs), 1)
-    self.assertEqual(zero.control_inputs[0].type, "Switch")
+    self.assertEqual(zero.control_inputs[0].type, "Identity")
+    self.assertEqual(zero.control_inputs[0].inputs[0].op.type, "Switch")
 
   def testCondGrad_2(self):
     with self.test_session():
diff --git a/tensorflow/python/kernel_tests/functional_ops_test.py b/tensorflow/python/kernel_tests/functional_ops_test.py
index 24800d2..5db2e98 100644
--- a/tensorflow/python/kernel_tests/functional_ops_test.py
+++ b/tensorflow/python/kernel_tests/functional_ops_test.py
@@ -978,6 +978,8 @@
       self.assertAllEqual(sess.run(bvals), [17., 16.])
 
 
+# TODO(akshayka): Replace `function.Defun` with tf.contrib.eager.defun` in the
+# below test cases.
 class PartitionedCallTest(test.TestCase):
 
   def testBasicSingleDevice(self):
@@ -1053,7 +1055,7 @@
     self.assertEqual(output, 6.)
 
   def testShardsRunOnRequestedDevices(self):
-    config = config_pb2.ConfigProto(device_count={"CPU": 3})
+    config = config_pb2.ConfigProto(device_count={"CPU": 4})
 
     @function.Defun()
     def Body():
@@ -1073,13 +1075,30 @@
       with ops.device("/cpu:2"):
         s3 = iterator_ops.Iterator.from_structure(
             (dtypes.float32,)).string_handle()
-      return s1, s2, s3
+      with ops.device(""):
+        # TODO(akshayka): This is unfortunate and brittle. It prevents
+        # `Iterator.from_structure` from assigning the iterator op to 'cpu:0'.
+        #  Remove this hack once we have a way of obtaining metadata about
+        #  function execution.
+        s4 = iterator_ops.Iterator.from_structure(
+            (dtypes.float32,)).string_handle()
+      return s1, s2, s3, s4
 
-    with self.test_session(config=config):
-      outputs = functional_ops.partitioned_call(args=[], f=Body)
-      self.assertTrue(compat.as_bytes("CPU:0") in outputs[0].eval())
-      self.assertTrue(compat.as_bytes("CPU:1") in outputs[1].eval())
-      self.assertTrue(compat.as_bytes("CPU:2") in outputs[2].eval())
+    with self.test_session(config=config, use_gpu=True) as sess:
+      with ops.device("/cpu:3"):
+        outputs = sess.run(functional_ops.partitioned_call(args=[], f=Body))
+    self.assertIn(compat.as_bytes("CPU:0"), outputs[0])
+    self.assertIn(compat.as_bytes("CPU:1"), outputs[1])
+    self.assertIn(compat.as_bytes("CPU:2"), outputs[2])
+    self.assertIn(compat.as_bytes("CPU:3"), outputs[3])
+
+    with self.test_session(config=config, use_gpu=True):
+      with ops.device("/cpu:0"):
+        outputs = sess.run(functional_ops.partitioned_call(args=[], f=Body))
+    self.assertIn(compat.as_bytes("CPU:0"), outputs[0])
+    self.assertIn(compat.as_bytes("CPU:1"), outputs[1])
+    self.assertIn(compat.as_bytes("CPU:2"), outputs[2])
+    self.assertIn(compat.as_bytes("CPU:0"), outputs[3])
 
   def testAssignAddResourceVariable(self):
 
diff --git a/tensorflow/python/kernel_tests/list_ops_test.py b/tensorflow/python/kernel_tests/list_ops_test.py
index bf82e08..3193222 100644
--- a/tensorflow/python/kernel_tests/list_ops_test.py
+++ b/tensorflow/python/kernel_tests/list_ops_test.py
@@ -421,6 +421,31 @@
                                  "Invalid data type at index 0"):
       self.evaluate(list_ops.tensor_list_push_back_batch(l_batch, [3, 4]))
 
+  @test_util.run_in_graph_and_eager_modes
+  def testZerosLike(self):
+    for dtype in (dtypes.uint8, dtypes.uint16, dtypes.int8, dtypes.int16,
+                  dtypes.int32, dtypes.int64, dtypes.float16, dtypes.float32,
+                  dtypes.float64, dtypes.complex64, dtypes.complex128,
+                  dtypes.bool):
+      l_empty = list_ops.empty_tensor_list(
+          element_dtype=dtype, element_shape=scalar_shape())
+      l_empty_zeros = array_ops.zeros_like(l_empty)
+      t_empty_zeros = list_ops.tensor_list_stack(
+          l_empty_zeros, element_dtype=dtype)
+
+      l_full = list_ops.tensor_list_push_back(l_empty,
+                                              math_ops.cast(0, dtype=dtype))
+      l_full = list_ops.tensor_list_push_back(l_full,
+                                              math_ops.cast(1, dtype=dtype))
+      l_full_zeros = array_ops.zeros_like(l_full)
+      t_full_zeros = list_ops.tensor_list_stack(
+          l_full_zeros, element_dtype=dtype)
+
+      self.assertAllEqual(self.evaluate(t_empty_zeros), [])
+      self.assertAllEqual(
+          self.evaluate(t_full_zeros), np.zeros(
+              (2,), dtype=dtype.as_numpy_dtype))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/partitioned_variables_test.py b/tensorflow/python/kernel_tests/partitioned_variables_test.py
index f5c6255..ba9359d 100644
--- a/tensorflow/python/kernel_tests/partitioned_variables_test.py
+++ b/tensorflow/python/kernel_tests/partitioned_variables_test.py
@@ -25,12 +25,15 @@
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import partitioned_variables
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
+from tensorflow.python.training import gradient_descent
 
 
 class PartitionerCreatorsTest(test.TestCase):
@@ -543,32 +546,6 @@
         partitioned_variables.create_partitioned_variables(
             [10, 43], [1, 50], rnd.initialized_value())
 
-  def testControlDepsNone(self):
-    with self.test_session() as session:
-      c = constant_op.constant(1.0)
-      with ops.control_dependencies([c]):
-        # d get the control dependency.
-        d = constant_op.constant(2.0)
-        # Partitioned variables do not.
-        var_x = variable_scope.get_variable(
-            "x",
-            shape=[2],
-            initializer=init_ops.ones_initializer(),
-            partitioner=partitioned_variables.variable_axis_size_partitioner(4))
-
-        ops_before_read = session.graph.get_operations()
-        var_x.as_tensor()  # Caches the ops for subsequent reads.
-        reading_ops = [
-            op for op in session.graph.get_operations()
-            if op not in ops_before_read
-        ]
-
-      self.assertEqual([c.op], d.op.control_inputs)
-      # Tests that no control dependencies are added to reading a partitioned
-      # variable which is similar to reading a variable.
-      for op in reading_ops:
-        self.assertEqual([], op.control_inputs)
-
   def testConcat(self):
     with self.test_session() as session:
       var_x = variable_scope.get_variable(
@@ -594,6 +571,57 @@
       variables.global_variables_initializer().run()
       self.assertAllClose(value.eval(), var_x.as_tensor().eval())
 
+  def testVariableCreationInALoop(self):
+    """Tests the variable created inside a loop can be used outside the loop."""
+    with self.test_session():
+      with variable_scope.variable_scope("ascope") as scope:
+        def Body(i, _):
+          var_x = variable_scope.get_variable(
+              "x",
+              shape=[2],
+              initializer=init_ops.ones_initializer(),
+              partitioner=partitioned_variables.variable_axis_size_partitioner(
+                  4))
+          return (i + 1, var_x.as_tensor())
+
+        cond = lambda i, _: i < 2
+        _, x = control_flow_ops.while_loop(
+            cond, Body, (0, constant_op.constant([7, 8], dtypes.float32)))
+        variables.global_variables_initializer().run()
+        self.assertAllClose([1.0, 1.0], x.eval())
+
+        scope.reuse_variables()
+        var_x = variable_scope.get_variable(
+            "x",
+            shape=[2],
+            initializer=init_ops.ones_initializer(),
+            partitioner=partitioned_variables.variable_axis_size_partitioner(4))
+
+        self.assertAllClose([1.0, 1.0], var_x.as_tensor().eval())
+
+  def testReadInWhileLoop(self):
+    """Tests the value is current (not cached) when read within a loop."""
+    with self.test_session():
+      var_x = variable_scope.get_variable(
+          "x",
+          shape=[2],
+          initializer=init_ops.ones_initializer(),
+          partitioner=partitioned_variables.variable_axis_size_partitioner(4))
+
+      def Body(i, _):
+        # Use a SGD step to update the variable's value.
+        loss = math_ops.reduce_sum(var_x)
+        optimizer = gradient_descent.GradientDescentOptimizer(1.0)
+        minimize = optimizer.minimize(loss * 0.7)
+        with ops.control_dependencies([minimize]):
+          return (i + 1, var_x.as_tensor())
+
+      cond = lambda i, _: i < 2
+      _, x = control_flow_ops.while_loop(
+          cond, Body, (0, constant_op.constant([7, 8], dtypes.float32)))
+      variables.global_variables_initializer().run()
+      self.assertAllClose([-0.4, -0.4], x.eval())
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/resource_variable_ops_test.py b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
index c739cd2..b1ef46f 100644
--- a/tensorflow/python/kernel_tests/resource_variable_ops_test.py
+++ b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
@@ -835,6 +835,12 @@
       state_ops.scatter_add(v, [1], [3])
       self.assertAllEqual([1.0, 5.0], v.numpy())
 
+  def testScatterSubStateOps(self):
+    with context.eager_mode():
+      v = resource_variable_ops.ResourceVariable([1.0, 2.0], name="sub")
+      state_ops.scatter_sub(v, [1], [3])
+      self.assertAllEqual([1.0, -1.0], v.numpy())
+
   def testScatterNdAddStateOps(self):
     with context.eager_mode():
       v = resource_variable_ops.ResourceVariable(
diff --git a/tensorflow/python/kernel_tests/rnn_test.py b/tensorflow/python/kernel_tests/rnn_test.py
index 2405f65..c72ada1 100644
--- a/tensorflow/python/kernel_tests/rnn_test.py
+++ b/tensorflow/python/kernel_tests/rnn_test.py
@@ -27,6 +27,7 @@
 from six.moves import xrange  # pylint: disable=redefined-builtin
 from tensorflow.contrib import rnn as contrib_rnn
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.python import keras
 from tensorflow.python.client import session
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
@@ -299,6 +300,43 @@
       save.restore(sess, save_path)
       self.assertAllEqual([10.] * 4, self.evaluate(lstm_cell._bias))
 
+  def testRNNCellSerialization(self):
+    for cell in [
+        rnn_cell_impl.LSTMCell(32, use_peepholes=True, cell_clip=True),
+        rnn_cell_impl.BasicLSTMCell(32, dtype=dtypes.float32),
+        rnn_cell_impl.BasicRNNCell(32, activation="relu", dtype=dtypes.float32),
+        rnn_cell_impl.GRUCell(
+            32, kernel_initializer="ones", dtype=dtypes.float32)
+    ]:
+      with self.test_session():
+        x = keras.Input((None, 5))
+        layer = keras.layers.RNN(cell)
+        y = layer(x)
+        model = keras.models.Model(x, y)
+        model.compile(optimizer="rmsprop", loss="mse")
+
+        # Test basic case serialization.
+        x_np = np.random.random((6, 5, 5))
+        y_np = model.predict(x_np)
+        weights = model.get_weights()
+        config = layer.get_config()
+        # The custom_objects is important here since rnn_cell_impl is
+        # not visible as a Keras layer, and also has a name conflict with
+        # keras.LSTMCell and GRUCell.
+        layer = keras.layers.RNN.from_config(
+            config,
+            custom_objects={
+                "BasicRNNCell": rnn_cell_impl.BasicRNNCell,
+                "GRUCell": rnn_cell_impl.GRUCell,
+                "LSTMCell": rnn_cell_impl.LSTMCell,
+                "BasicLSTMCell": rnn_cell_impl.BasicLSTMCell
+            })
+        y = layer(x)
+        model = keras.models.Model(x, y)
+        model.set_weights(weights)
+        y_np_2 = model.predict(x_np)
+        self.assertAllClose(y_np, y_np_2, atol=1e-4)
+
 ######### Benchmarking RNN code
 
 
diff --git a/tensorflow/python/kernel_tests/split_op_test.py b/tensorflow/python/kernel_tests/split_op_test.py
index 419cd5e..3f9b029 100644
--- a/tensorflow/python/kernel_tests/split_op_test.py
+++ b/tensorflow/python/kernel_tests/split_op_test.py
@@ -174,6 +174,26 @@
     for dtype in _TEST_DTYPES:
       self._testHugeNumberOfTensorsVariable(dtype)
 
+  @test_util.run_in_graph_and_eager_modes
+  def testDegenerateVariable(self):
+    inp = np.random.rand(4, 4).astype("f")
+    with test_util.device(use_gpu=True):
+      result = self.evaluate(array_ops.split(inp, [-1, 4], 0))
+      self.assertAllEqual(result[0], inp[0:0, :])
+      self.assertAllEqual(result[1], inp[0:4, :])
+
+      result = self.evaluate(array_ops.split(inp, [4, -1], 0))
+      self.assertAllEqual(result[0], inp[0:4, :])
+      self.assertAllEqual(result[1], inp[4:4, :])
+
+      result = self.evaluate(array_ops.split(inp, [-1, 4], 1))
+      self.assertAllEqual(result[0], inp[:, 0:0])
+      self.assertAllEqual(result[1], inp[:, 0:4])
+
+      result = self.evaluate(array_ops.split(inp, [4, -1], 1))
+      self.assertAllEqual(result[0], inp[:, 0:4])
+      self.assertAllEqual(result[1], inp[:, 4:4])
+
   def _testGradientsSimpleVariable(self, dtype):
     inp = self._makeData((4, 4), dtype)
     with test_util.device(use_gpu=True):
@@ -336,6 +356,16 @@
     for s in splits:
       self.assertEqual(None, s.get_shape().ndims)
 
+  def testVariableShapeFunction(self):
+    # size_splits too big
+    with self.assertRaises(ValueError):
+      array_ops.split([0, 1], [3, -1], axis=0)
+
+    # Correct inference of variable dimension
+    s0, s1 = array_ops.split([0, 1, 2], [2, -1], axis=0)
+    assert s0.shape.as_list() == [2]
+    assert s1.shape.as_list() == [1]
+
   def testNonexistentDimTensor(self):
     x = array_ops.placeholder(dtypes.int32)
     values = np.zeros([5, 30])
diff --git a/tensorflow/python/kernel_tests/string_length_op_test.py b/tensorflow/python/kernel_tests/string_length_op_test.py
new file mode 100644
index 0000000..075a320
--- /dev/null
+++ b/tensorflow/python/kernel_tests/string_length_op_test.py
@@ -0,0 +1,37 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for string_length_op."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops import string_ops
+from tensorflow.python.platform import test
+
+
+class StringLengthOpTest(test.TestCase):
+
+  def testStringLength(self):
+    strings = [[["1", "12"], ["123", "1234"], ["12345", "123456"]]]
+
+    with self.test_session() as sess:
+      lengths = string_ops.string_length(strings)
+      values = sess.run(lengths)
+      self.assertAllEqual(values, [[[1, 2], [3, 4], [5, 6]]])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/where_op_test.py b/tensorflow/python/kernel_tests/where_op_test.py
index 17575da..29fb002 100644
--- a/tensorflow/python/kernel_tests/where_op_test.py
+++ b/tensorflow/python/kernel_tests/where_op_test.py
@@ -135,6 +135,15 @@
       tf_val = array_ops.where(constant_op.constant(x) > 0, x * x, -x).eval()
     self.assertAllEqual(tf_val, np_val)
 
+  def testBatchSelect(self):
+    x = np.array([[-2, 3, -1] * 64, [1, -3, -3] * 64] * 8192)  # [16384, 192]
+    c_mat = np.array([[False] * 192, [True] * 192] * 8192)  # [16384, 192]
+    c_vec = np.array([False, True] * 8192)  # [16384]
+    np_val = np.where(c_mat, x * x, -x)
+    with self.test_session(use_gpu=True):
+      tf_val = array_ops.where(c_vec, x * x, -x).eval()
+    self.assertAllEqual(tf_val, np_val)
+
 
 class WhereBenchmark(test.Benchmark):
 
@@ -163,5 +172,32 @@
                 "Throughput: %0.03g GB/s" % (name, r["wall_time"], throughput))
           sys.stdout.flush()
 
+  def benchmarkBatchSelect(self):
+    for (m, n, use_gpu) in itertools.product([1000, 10000, 100000],
+                                             [10, 100, 1000], [False, True]):
+      name = "m_%d_n_%d_use_gpu_%s" % (m, n, use_gpu)
+      device = "/%s:0" % ("gpu" if use_gpu else "cpu")
+      with ops.Graph().as_default():
+        with ops.device(device):
+          x_gen = random_ops.random_uniform([m, n], dtype=dtypes.float32)
+          y_gen = random_ops.random_uniform([m, n], dtype=dtypes.float32)
+          c_gen = random_ops.random_uniform([m], dtype=dtypes.float32) <= 0.5
+          x = resource_variable_ops.ResourceVariable(x_gen)
+          y = resource_variable_ops.ResourceVariable(y_gen)
+          c = resource_variable_ops.ResourceVariable(c_gen)
+          op = array_ops.where(c, x, y)
+        with session.Session() as sess:
+          x.initializer.run()
+          y.initializer.run()
+          c.initializer.run()
+          r = self.run_op_benchmark(sess, op, min_iters=100, name=name)
+          # approximate size of output: m*n*2 floats for each axis.
+          gb_processed = m * n * 8 / 1.0e9
+          throughput = gb_processed / r["wall_time"]
+          print("Benchmark: %s \t wall_time: %0.03g s \t "
+                "Throughput: %0.03g GB/s" % (name, r["wall_time"], throughput))
+          sys.stdout.flush()
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/layers/base.py b/tensorflow/python/layers/base.py
index cf13b52..ab08865 100644
--- a/tensorflow/python/layers/base.py
+++ b/tensorflow/python/layers/base.py
@@ -183,13 +183,13 @@
       use_resource: Whether to use `ResourceVariable`.
       synchronization: Indicates when a distributed a variable will be
         aggregated. Accepted values are constants defined in the class
-        @{tf.VariableSynchronization}. By default the synchronization is set to
+        `tf.VariableSynchronization`. By default the synchronization is set to
         `AUTO` and the current `DistributionStrategy` chooses
         when to synchronize. If `synchronization` is set to `ON_READ`,
         `trainable` must not be set to `True`.
       aggregation: Indicates how a distributed variable will be aggregated.
         Accepted values are constants defined in the class
-        @{tf.VariableAggregation}.
+        `tf.VariableAggregation`.
       partitioner: (optional) partitioner instance (callable).  If
         provided, when the requested variable is created it will be split
         into multiple partitions according to `partitioner`.  In this case,
diff --git a/tensorflow/python/layers/core.py b/tensorflow/python/layers/core.py
index 261281a..9879e50 100644
--- a/tensorflow/python/layers/core.py
+++ b/tensorflow/python/layers/core.py
@@ -127,8 +127,8 @@
   """Functional interface for the densely-connected layer.
 
   This layer implements the operation:
-  `outputs = activation(inputs.kernel + bias)`
-  Where `activation` is the activation function passed as the `activation`
+  `outputs = activation(inputs * kernel + bias)`
+  where `activation` is the activation function passed as the `activation`
   argument (if not `None`), `kernel` is a weights matrix created by the layer,
   and `bias` is a bias vector created by the layer
   (only if `use_bias` is `True`).
@@ -203,7 +203,7 @@
       to be the same for all timesteps, you can use
       `noise_shape=[batch_size, 1, features]`.
     seed: A Python integer. Used to create random seeds. See
-      @{tf.set_random_seed}.
+      `tf.set_random_seed`.
       for behavior.
     name: The name of the layer (string).
   """
@@ -248,7 +248,7 @@
       to be the same for all timesteps, you can use
       `noise_shape=[batch_size, 1, features]`.
     seed: A Python integer. Used to create random seeds. See
-      @{tf.set_random_seed}
+      `tf.set_random_seed`
       for behavior.
     training: Either a Python boolean, or a TensorFlow boolean scalar tensor
       (e.g. a placeholder). Whether to return the output in training mode
diff --git a/tensorflow/python/lib/core/py_func.cc b/tensorflow/python/lib/core/py_func.cc
index 7c10713..fc02d6d 100644
--- a/tensorflow/python/lib/core/py_func.cc
+++ b/tensorflow/python/lib/core/py_func.cc
@@ -507,6 +507,17 @@
       call.ins.push_back(ctx->input(i));
     }
 
+    // NOTE(mrry): There is a potential time-of-check-to-time-of-use race here.
+    // because it is possible that `Py_Finalize()` could be called in another
+    // thread between this check and the  call to `PyGILState_Ensure()`, which
+    // will abort the process if `Py_Finalize()` has been called. A more robust
+    // solution would be welcome, but it is not obvious how to make this work
+    // using the current Python C API.
+    OP_REQUIRES(ctx, Py_IsInitialized(),
+                errors::FailedPrecondition(
+                    "Python interpreter state is not initialized. "
+                    "The process may be terminated."));
+
     PyGILState_STATE py_threadstate;
     py_threadstate = PyGILState_Ensure();
     bool log_on_error;
diff --git a/tensorflow/python/lib/core/py_util.cc b/tensorflow/python/lib/core/py_util.cc
index 2ee898e..739cab4 100644
--- a/tensorflow/python/lib/core/py_util.cc
+++ b/tensorflow/python/lib/core/py_util.cc
@@ -18,6 +18,8 @@
 // Place `<locale>` before <Python.h> to avoid build failure in macOS.
 #include <locale>
 
+// The empty line above is on purpose as otherwise clang-format will
+// automatically move <Python.h> before <locale>.
 #include <Python.h>
 
 #include "tensorflow/core/lib/core/errors.h"
diff --git a/tensorflow/python/lib/io/py_record_writer.cc b/tensorflow/python/lib/io/py_record_writer.cc
index 3c64813..e4e5268 100644
--- a/tensorflow/python/lib/io/py_record_writer.cc
+++ b/tensorflow/python/lib/io/py_record_writer.cc
@@ -52,10 +52,17 @@
   file_.reset();
 }
 
-bool PyRecordWriter::WriteRecord(tensorflow::StringPiece record) {
-  if (writer_ == nullptr) return false;
+void PyRecordWriter::WriteRecord(tensorflow::StringPiece record,
+                                 TF_Status* out_status) {
+  if (writer_ == nullptr) {
+    TF_SetStatus(out_status, TF_FAILED_PRECONDITION,
+                 "Writer not initialized or previously closed");
+    return;
+  }
   Status s = writer_->WriteRecord(record);
-  return s.ok();
+  if (!s.ok()) {
+    Set_TF_Status_from_Status(out_status, s);
+  }
 }
 
 void PyRecordWriter::Flush(TF_Status* out_status) {
diff --git a/tensorflow/python/lib/io/py_record_writer.h b/tensorflow/python/lib/io/py_record_writer.h
index 9d66c03..61a4960 100644
--- a/tensorflow/python/lib/io/py_record_writer.h
+++ b/tensorflow/python/lib/io/py_record_writer.h
@@ -43,7 +43,7 @@
                              TF_Status* out_status);
   ~PyRecordWriter();
 
-  bool WriteRecord(tensorflow::StringPiece record);
+  void WriteRecord(tensorflow::StringPiece record, TF_Status* out_status);
   void Flush(TF_Status* out_status);
   void Close(TF_Status* out_status);
 
diff --git a/tensorflow/python/lib/io/python_io.py b/tensorflow/python/lib/io/python_io.py
index aec12ab..404423c 100644
--- a/tensorflow/python/lib/io/python_io.py
+++ b/tensorflow/python/lib/io/python_io.py
@@ -15,7 +15,7 @@
 
 """Python functions for directly manipulating TFRecord-formatted files.
 
-See the @{$python/python_io} guide.
+See the [Python IO](https://tensorflow.org/api_guides/python/python_io) guide.
 """
 
 from __future__ import absolute_import
diff --git a/tensorflow/python/lib/io/tf_record.py b/tensorflow/python/lib/io/tf_record.py
index 941d6cd..2b3e986 100644
--- a/tensorflow/python/lib/io/tf_record.py
+++ b/tensorflow/python/lib/io/tf_record.py
@@ -125,8 +125,8 @@
     Args:
       record: str
     """
-    # TODO(sethtroisi): Failures are currently swallowed, change that.
-    self._writer.WriteRecord(record)
+    with errors.raise_exception_on_not_ok_status() as status:
+      self._writer.WriteRecord(record, status)
 
   def flush(self):
     """Flush the file."""
diff --git a/tensorflow/python/lib/io/tf_record_test.py b/tensorflow/python/lib/io/tf_record_test.py
index 4743c03..b853b64 100644
--- a/tensorflow/python/lib/io/tf_record_test.py
+++ b/tensorflow/python/lib/io/tf_record_test.py
@@ -358,12 +358,12 @@
     with self.assertRaises(errors_impl.FailedPreconditionError):
       self._writer.flush()
 
-  def testWriteAfterClose(self):
+  def testWriteAfterCloseIsError(self):
     self._writer.write(self._Record(0))
     self._writer.close()
 
-    # TODO(sethtroisi): No way to know this failed, changed that.
-    self._writer.write(self._Record(1))
+    with self.assertRaises(errors_impl.FailedPreconditionError):
+      self._writer.write(self._Record(1))
 
 
 class TFRecordWriterCloseAndFlushGzipTests(TFRecordWriterCloseAndFlushTests):
diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index ec6488e..1e23fff 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -15,7 +15,7 @@
 # Tests for this file live in python/kernel_tests/array_ops_test.py
 """Support for manipulating tensors.
 
-See the @{$python/array_ops} guide.
+See the [Array Ops](https://tensorflow.org/api_guides/python/array_ops) guide.
 """
 
 from __future__ import absolute_import
@@ -538,7 +538,7 @@
   words, `begin[i]` is the offset into the 'i'th dimension of `input` that you
   want to slice from.
 
-  Note that @{tf.Tensor.__getitem__} is typically a more pythonic way to
+  Note that `tf.Tensor.__getitem__` is typically a more pythonic way to
   perform slices, as it allows you to write `foo[3:7, :-2]` instead of
   `tf.slice(foo, [3, 0], [4, foo.get_shape()[1]-2])`.
 
@@ -594,7 +594,7 @@
 
   **Instead of calling this op directly most users will want to use the
   NumPy-style slicing syntax (e.g. `tensor[..., 3:4:-1, tf.newaxis, 3]`), which
-  is supported via @{tf.Tensor.__getitem__} and @{tf.Variable.__getitem__}.**
+  is supported via `tf.Tensor.__getitem__` and `tf.Variable.__getitem__`.**
   The interface of this op is a low-level encoding of the slicing syntax.
 
   Roughly speaking, this op extracts a slice of size `(end-begin)/stride`
@@ -723,7 +723,7 @@
   """Creates a slice helper object given a variable.
 
   This allows creating a sub-tensor from part of the current contents
-  of a variable. See @{tf.Tensor.__getitem__} for detailed examples
+  of a variable. See `tf.Tensor.__getitem__` for detailed examples
   of slicing.
 
   This function in addition also allows assignment to a sliced range.
@@ -2662,6 +2662,76 @@
 gather.__doc__ = gen_array_ops.gather_v2.__doc__
 
 
+@tf_export("batch_gather")
+def batch_gather(params, indices, name=None):
+  """Gather slices from `params` according to `indices` with leading batch dims.
+
+  This operation assumes that the leading dimensions of `indices` are dense,
+  and gathers on the axis corresponding to the last dimension of `indices`.
+  More concretely it computes:
+
+  result[i1, ..., in] = params[i1, ..., in-1, indices[i1, ..., in]]
+
+  Therefore `params` should be a Tensor of shape [A1, ..., AN, B1, ..., BM],
+  `indices` should be a Tensor of shape [A1, ..., AN-1, C] and `result` will be
+  a Tensor of size `[A1, ..., AN-1, C, B1, ..., BM]`.
+
+  In the case in which `indices` is a 1D tensor, this operation is equivalent to
+  `tf.gather`.
+
+  See also `tf.gather` and `tf.gather_nd`.
+
+  Args:
+    params: A Tensor. The tensor from which to gather values.
+    indices: A Tensor. Must be one of the following types: int32, int64. Index
+        tensor. Must be in range `[0, params.shape[axis])`, where `axis` is the
+        last dimension of `indices` itself.
+    name: A name for the operation (optional).
+
+  Returns:
+    A Tensor. Has the same type as `params`.
+
+  Raises:
+    ValueError: if `indices` has an unknown shape.
+  """
+
+  with ops.name_scope(name):
+    indices = ops.convert_to_tensor(indices, name="indices")
+    params = ops.convert_to_tensor(params, name="params")
+    indices_shape = shape(indices)
+    params_shape = shape(params)
+    ndims = indices.shape.ndims
+    if ndims is None:
+      raise ValueError("batch_gather does not allow indices with unknown "
+                       "shape.")
+    batch_indices = indices
+    accum_dim_value = 1
+    for dim in range(ndims-1, 0, -1):
+      dim_value = params_shape[dim-1]
+      accum_dim_value *= params_shape[dim]
+      dim_indices = gen_math_ops._range(0, dim_value, 1)
+      dim_indices *= accum_dim_value
+      dim_shape = stack([1] * (dim - 1) + [dim_value] + [1] * (ndims - dim),
+                        axis=0)
+      batch_indices += reshape(dim_indices, dim_shape)
+
+    flat_indices = reshape(batch_indices, [-1])
+    outer_shape = params_shape[ndims:]
+    flat_inner_shape = gen_math_ops.prod(
+        params_shape[:ndims], [0], False)
+
+    flat_params = reshape(
+        params, concat([[flat_inner_shape], outer_shape], axis=0))
+    flat_result = gather(flat_params, flat_indices)
+    result = reshape(flat_result, concat([indices_shape, outer_shape], axis=0))
+    final_shape = indices.get_shape()[:ndims-1].merge_with(
+        params.get_shape()[:ndims -1])
+    final_shape = final_shape.concatenate(indices.get_shape()[ndims-1])
+    final_shape = final_shape.concatenate(params.get_shape()[ndims:])
+    result.set_shape(final_shape)
+    return result
+
+
 # Define quantize_v2 here in order to make name the second-to-last attribute,
 # because round_mode was added later.
 @tf_export("quantize_v2")
diff --git a/tensorflow/python/ops/check_ops.py b/tensorflow/python/ops/check_ops.py
index 375a5ec..c5a0f29 100644
--- a/tensorflow/python/ops/check_ops.py
+++ b/tensorflow/python/ops/check_ops.py
@@ -15,7 +15,8 @@
 # pylint: disable=g-short-docstring-punctuation
 """Asserts and Boolean Checks.
 
-See the @{$python/check_ops} guide.
+See the [Asserts and
+checks](https://tensorflow.org/api_guides/python/check_ops) guide.
 """
 
 from __future__ import absolute_import
diff --git a/tensorflow/python/ops/clip_ops.py b/tensorflow/python/ops/clip_ops.py
index 75c459a..78b395a 100644
--- a/tensorflow/python/ops/clip_ops.py
+++ b/tensorflow/python/ops/clip_ops.py
@@ -29,6 +29,7 @@
 from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import numerics
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -42,6 +43,9 @@
   Any values less than `clip_value_min` are set to `clip_value_min`. Any values
   greater than `clip_value_max` are set to `clip_value_max`.
 
+  Note: `clip_value_min` needs to be less than or equal to `clip_value_max` for
+  correct results.
+
   Args:
     t: A `Tensor`.
     clip_value_min: A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
@@ -54,7 +58,7 @@
     A clipped `Tensor`.
 
   Raises:
-    ValueError: if the clip tensors would trigger array broadcasting
+    ValueError: If the clip tensors would trigger array broadcasting
       that would make the returned tensor larger than the input.
   """
   with ops.name_scope(name, "clip_by_value",
@@ -243,6 +247,7 @@
 
   Raises:
     TypeError: If `t_list` is not a sequence.
+    InvalidArgumentError: If the global norm is not finite.
   """
   if (not isinstance(t_list, collections.Sequence)
       or isinstance(t_list, six.string_types)):
@@ -250,6 +255,8 @@
   t_list = list(t_list)
   if use_norm is None:
     use_norm = global_norm(t_list, name)
+  use_norm = numerics.verify_tensor_all_finite(use_norm,
+                                               "Found Inf or NaN global norm.")
 
   with ops.name_scope(name, "clip_by_global_norm",
                       t_list + [clip_norm]) as name:
diff --git a/tensorflow/python/ops/cond_v2_impl.py b/tensorflow/python/ops/cond_v2_impl.py
index 44c5c05..b3dacff 100644
--- a/tensorflow/python/ops/cond_v2_impl.py
+++ b/tensorflow/python/ops/cond_v2_impl.py
@@ -65,20 +65,27 @@
     caller_colocation_stack = ops.get_default_graph()._colocation_stack
     caller_container = ops.get_default_graph()._container
     caller_collection_ref = ops.get_default_graph()._collections
+
+    with ops.name_scope(None):
+      # Find the outer most graph for uniquing function names.
+      # TODO(jpienaar): Make this work in eager mode.
+      graph = ops.get_default_graph()
+      while isinstance(graph, _function._FuncGraph):
+        graph = graph._outer_graph
+
+      true_name = graph.unique_name(("%strue" % scope).replace("/", "_"))
+      false_name = graph.unique_name(("%sfalse" % scope).replace("/", "_"))
     # pylint: enable=protected-access
-
-    func_name_prefix = scope.replace("/", "_")
-
     true_graph = _function.func_graph_from_py_func(
         true_fn, [], [],
-        name="%strue" % func_name_prefix,
+        name=true_name,
         device=caller_device,
         colocation_stack=caller_colocation_stack,
         collections_ref=caller_collection_ref,
         container=caller_container)
     false_graph = _function.func_graph_from_py_func(
         false_fn, [], [],
-        name="%sfalse" % func_name_prefix,
+        name=false_name,
         device=caller_device,
         colocation_stack=caller_colocation_stack,
         collections_ref=caller_collection_ref,
@@ -132,7 +139,7 @@
                       attr_value_pb2.AttrValue(b=True))
       # pylint: enable=protected-access
 
-    return tensors[:num_cond_outputs]
+    return tuple(tensors[:num_cond_outputs])
 
 
 @ops.RegisterGradient("If")
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index c7061b3..d1095c8 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -14,7 +14,8 @@
 # ==============================================================================
 """Control Flow Operations.
 
-See the @{$python/control_flow_ops} guide.
+See the [Control
+Flow](https://tensorflow.org/api_guides/python/control_flow_ops) guide.
 """
 # pylint: disable=g-bad-name
 from __future__ import absolute_import
@@ -1449,14 +1450,17 @@
       pred = op_ctxt.pred
       branch = op_ctxt.branch
       switch_val = switch(op.inputs[0], pred)[1 - branch]
+      # An op is created along the branch taken, as control dependencies are on
+      # the whole op and not on the tensor output.
+      pivot = array_ops.identity(switch_val)
       if val.dtype == dtypes.resource:
-        with ops.control_dependencies([switch_val]):
+        with ops.control_dependencies([pivot]):
           return array_ops.zeros(
               gen_resource_variable_ops.variable_shape(switch_val))
       zeros_shape = array_ops.shape_internal(switch_val, optimize=False)
       # Ensure ops created within array_ops.zeros are dominated by switch in
       # cond context.
-      with ops.control_dependencies([switch_val]):
+      with ops.control_dependencies([pivot]):
         return array_ops.zeros(zeros_shape, dtype=val.dtype)
     else:
       return array_ops.zeros_like(val, optimize=False)
@@ -2065,21 +2069,25 @@
 
     # Build the graph for the true branch in a new context.
     context_t = CondContext(pred, pivot_1, branch=1)
-    context_t.Enter()
-    orig_res_t, res_t = context_t.BuildCondBranch(true_fn)
-    if orig_res_t is None:
-      raise ValueError("true_fn must have a return value.")
-    context_t.ExitResult(res_t)
-    context_t.Exit()
+    try:
+      context_t.Enter()
+      orig_res_t, res_t = context_t.BuildCondBranch(true_fn)
+      if orig_res_t is None:
+        raise ValueError("true_fn must have a return value.")
+      context_t.ExitResult(res_t)
+    finally:
+      context_t.Exit()
 
     # Build the graph for the false branch in a new context.
     context_f = CondContext(pred, pivot_2, branch=0)
-    context_f.Enter()
-    orig_res_f, res_f = context_f.BuildCondBranch(false_fn)
-    if orig_res_f is None:
-      raise ValueError("false_fn must have a return value.")
-    context_f.ExitResult(res_f)
-    context_f.Exit()
+    try:
+      context_f.Enter()
+      orig_res_f, res_f = context_f.BuildCondBranch(false_fn)
+      if orig_res_f is None:
+        raise ValueError("false_fn must have a return value.")
+      context_f.ExitResult(res_f)
+    finally:
+      context_f.Exit()
 
     if not strict:
       orig_res_t = _UnpackIfSingleton(orig_res_t)
@@ -3069,7 +3077,7 @@
   `loop_vars` is the same in every iteration. The `shape_invariants` argument
   allows the caller to specify a less specific shape invariant for each loop
   variable, which is needed if the shape varies between iterations. The
-  @{tf.Tensor.set_shape}
+  `tf.Tensor.set_shape`
   function may also be used in the `body` function to indicate that
   the output loop variable has a particular shape. The shape invariant for
   SparseTensor and IndexedSlices are treated specially as follows:
@@ -3320,7 +3328,7 @@
   no guarantee that `output_tensor` will be evaluated after any `dependencies`
   have run.
 
-  See also @{tf.tuple$tuple} and @{tf.group$group}.
+  See also `tf.tuple` and `tf.group`.
 
   Args:
     dependencies: Iterable of operations to run before this op finishes.
@@ -3365,8 +3373,8 @@
   When this op finishes, all ops in `inputs` have finished. This op has no
   output.
 
-  See also @{tf.tuple$tuple} and
-  @{tf.control_dependencies$control_dependencies}.
+  See also `tf.tuple` and
+  `tf.control_dependencies`.
 
   Args:
     *inputs: Zero or more tensors to group.
@@ -3435,8 +3443,8 @@
   returned by `tuple` are only available after all the parallel computations
   are done.
 
-  See also @{tf.group$group} and
-  @{tf.control_dependencies$control_dependencies}.
+  See also `tf.group` and
+  `tf.control_dependencies`.
 
   Args:
     tensors: A list of `Tensor`s or `IndexedSlices`, some entries can be `None`.
diff --git a/tensorflow/python/ops/custom_gradient.py b/tensorflow/python/ops/custom_gradient.py
index 9f77a6c..871f236 100644
--- a/tensorflow/python/ops/custom_gradient.py
+++ b/tensorflow/python/ops/custom_gradient.py
@@ -73,7 +73,7 @@
   With this definition, the gradient at x=100 will be correctly evaluated as
   1.0.
 
-  See also @{tf.RegisterGradient} which registers a gradient function for a
+  See also `tf.RegisterGradient` which registers a gradient function for a
   primitive TensorFlow operation. `tf.custom_gradient` on the other hand allows
   for fine grained control over the gradient computation of a sequence of
   operations.
@@ -100,7 +100,7 @@
 
   Returns:
     A function `h(x)` which returns the same value as `f(x)[0]` and whose
-    gradient (as calculated by @{tf.gradients}) is determined by `f(x)[1]`.
+    gradient (as calculated by `tf.gradients`) is determined by `f(x)[1]`.
   """
 
   def decorated(*args, **kwargs):
diff --git a/tensorflow/python/ops/data_flow_ops.py b/tensorflow/python/ops/data_flow_ops.py
index abf597c..7af2ca5 100644
--- a/tensorflow/python/ops/data_flow_ops.py
+++ b/tensorflow/python/ops/data_flow_ops.py
@@ -126,8 +126,8 @@
   handle single elements, versions that support enqueuing and
   dequeuing a batch of elements at once.
 
-  See @{tf.FIFOQueue} and
-  @{tf.RandomShuffleQueue} for concrete
+  See `tf.FIFOQueue` and
+  `tf.RandomShuffleQueue` for concrete
   implementations of this class, and instructions on how to create
   them.
   """
@@ -309,12 +309,12 @@
     until the element has been enqueued.
 
     At runtime, this operation may raise an error if the queue is
-    @{tf.QueueBase.close} before or during its execution. If the
+    `tf.QueueBase.close` before or during its execution. If the
     queue is closed before this operation runs,
     `tf.errors.CancelledError` will be raised. If this operation is
     blocked, and either (i) the queue is closed by a close operation
     with `cancel_pending_enqueues=True`, or (ii) the session is
-    @{tf.Session.close},
+    `tf.Session.close`,
     `tf.errors.CancelledError` will be raised.
 
     Args:
@@ -352,12 +352,12 @@
     until all of the elements have been enqueued.
 
     At runtime, this operation may raise an error if the queue is
-    @{tf.QueueBase.close} before or during its execution. If the
+    `tf.QueueBase.close` before or during its execution. If the
     queue is closed before this operation runs,
     `tf.errors.CancelledError` will be raised. If this operation is
     blocked, and either (i) the queue is closed by a close operation
     with `cancel_pending_enqueues=True`, or (ii) the session is
-    @{tf.Session.close},
+    `tf.Session.close`,
     `tf.errors.CancelledError` will be raised.
 
     Args:
@@ -413,11 +413,11 @@
     until there is an element to dequeue.
 
     At runtime, this operation may raise an error if the queue is
-    @{tf.QueueBase.close} before or during its execution. If the
+    `tf.QueueBase.close` before or during its execution. If the
     queue is closed, the queue is empty, and there are no pending
     enqueue operations that can fulfill this request,
     `tf.errors.OutOfRangeError` will be raised. If the session is
-    @{tf.Session.close},
+    `tf.Session.close`,
     `tf.errors.CancelledError` will be raised.
 
     Args:
@@ -455,11 +455,11 @@
     `OutOfRange` exception is raised.
 
     At runtime, this operation may raise an error if the queue is
-    @{tf.QueueBase.close} before or during its execution. If the
+    `tf.QueueBase.close` before or during its execution. If the
     queue is closed, the queue contains fewer than `n` elements, and
     there are no pending enqueue operations that can fulfill this
     request, `tf.errors.OutOfRangeError` will be raised. If the
-    session is @{tf.Session.close},
+    session is `tf.Session.close`,
     `tf.errors.CancelledError` will be raised.
 
     Args:
@@ -500,7 +500,7 @@
 
     If the queue is closed and there are more than `0` but fewer than
     `n` elements remaining, then instead of raising a
-    `tf.errors.OutOfRangeError` like @{tf.QueueBase.dequeue_many},
+    `tf.errors.OutOfRangeError` like `tf.QueueBase.dequeue_many`,
     less than `n` elements are returned immediately.  If the queue is
     closed and there are `0` elements left in the queue, then a
     `tf.errors.OutOfRangeError` is raised just like in `dequeue_many`.
@@ -608,7 +608,7 @@
 class RandomShuffleQueue(QueueBase):
   """A queue implementation that dequeues elements in a random order.
 
-  See @{tf.QueueBase} for a description of the methods on
+  See `tf.QueueBase` for a description of the methods on
   this class.
   """
 
@@ -657,7 +657,7 @@
         with the same length as `dtypes`, or `None`.  If specified the dequeue
         methods return a dictionary with the names as keys.
       seed: A Python integer. Used to create a random seed. See
-        @{tf.set_random_seed}
+        `tf.set_random_seed`
         for behavior.
       shared_name: (Optional.) If non-empty, this queue will be shared under
         the given name across multiple sessions.
@@ -693,7 +693,7 @@
 class FIFOQueue(QueueBase):
   """A queue implementation that dequeues elements in first-in first-out order.
 
-  See @{tf.QueueBase} for a description of the methods on
+  See `tf.QueueBase` for a description of the methods on
   this class.
   """
 
@@ -753,7 +753,7 @@
   A `PaddingFIFOQueue` may contain components with dynamic shape, while also
   supporting `dequeue_many`.  See the constructor for more details.
 
-  See @{tf.QueueBase} for a description of the methods on
+  See `tf.QueueBase` for a description of the methods on
   this class.
   """
 
@@ -824,7 +824,7 @@
 class PriorityQueue(QueueBase):
   """A queue implementation that dequeues elements in prioritized order.
 
-  See @{tf.QueueBase} for a description of the methods on
+  See `tf.QueueBase` for a description of the methods on
   this class.
   """
 
diff --git a/tensorflow/python/ops/distributions/distribution.py b/tensorflow/python/ops/distributions/distribution.py
index c03ef96..ddf9442 100644
--- a/tensorflow/python/ops/distributions/distribution.py
+++ b/tensorflow/python/ops/distributions/distribution.py
@@ -526,8 +526,8 @@
     # Remove "self", "__class__", or other special variables. These can appear
     # if the subclass used:
     # `parameters = dict(locals())`.
-    return dict((k, v) for k, v in self._parameters.items()
-                if not k.startswith("__") and k != "self")
+    return {k: v for k, v in self._parameters.items()
+            if not k.startswith("__") and k != "self"}
 
   @property
   def reparameterization_type(self):
diff --git a/tensorflow/python/ops/embedding_ops.py b/tensorflow/python/ops/embedding_ops.py
index 27c2fa7..7b9e7de 100644
--- a/tensorflow/python/ops/embedding_ops.py
+++ b/tensorflow/python/ops/embedding_ops.py
@@ -253,7 +253,7 @@
 
   This function is used to perform parallel lookups on the list of
   tensors in `params`.  It is a generalization of
-  @{tf.gather}, where `params` is
+  `tf.gather`, where `params` is
   interpreted as a partitioning of a large embedding tensor.  `params` may be
   a `PartitionedVariable` as returned by using `tf.get_variable()` with a
   partitioner.
diff --git a/tensorflow/python/ops/functional_ops.py b/tensorflow/python/ops/functional_ops.py
index 4ecc746..a6be826 100644
--- a/tensorflow/python/ops/functional_ops.py
+++ b/tensorflow/python/ops/functional_ops.py
@@ -15,7 +15,8 @@
 
 """Functional operations.
 
-See the @{$python/functional_ops} guide.
+See the [Higher Order
+Functions](https://tensorflow.org/api_guides/python/functional_ops) guide.
 """
 
 from __future__ import absolute_import
diff --git a/tensorflow/python/ops/gradients_impl.py b/tensorflow/python/ops/gradients_impl.py
index b64a66b..a68f6802 100644
--- a/tensorflow/python/ops/gradients_impl.py
+++ b/tensorflow/python/ops/gradients_impl.py
@@ -653,9 +653,6 @@
 
     # Initialize the pending count for ops in the connected subgraph from ys
     # to the xs.
-    if len(ys) > 1:
-      ys = [array_ops.identity(y) if _Consumers(y, func_graphs) else y
-            for y in ys]
     to_ops = [t.op for t in ys]
     from_ops = [t.op for t in xs]
     stop_gradient_ops = [t.op for t in stop_gradients]
diff --git a/tensorflow/python/ops/histogram_ops.py b/tensorflow/python/ops/histogram_ops.py
index e86a8e5..7291e05 100644
--- a/tensorflow/python/ops/histogram_ops.py
+++ b/tensorflow/python/ops/histogram_ops.py
@@ -14,8 +14,6 @@
 # ==============================================================================
 # pylint: disable=g-short-docstring-punctuation
 """Histograms.
-
-Please see @{$python/histogram_ops} guide.
 """
 
 from __future__ import absolute_import
diff --git a/tensorflow/python/ops/image_ops.py b/tensorflow/python/ops/image_ops.py
index 343531a..3de46e7 100644
--- a/tensorflow/python/ops/image_ops.py
+++ b/tensorflow/python/ops/image_ops.py
@@ -16,7 +16,7 @@
 # pylint: disable=g-short-docstring-punctuation
 """Image processing and decoding ops.
 
-See the @{$python/image} guide.
+See the [Images](https://tensorflow.org/api_guides/python/image) guide.
 """
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py
index 855a4d0..1235694 100644
--- a/tensorflow/python/ops/image_ops_impl.py
+++ b/tensorflow/python/ops/image_ops_impl.py
@@ -265,7 +265,7 @@
     image: 4-D Tensor of shape `[batch, height, width, channels]` or
            3-D Tensor of shape `[height, width, channels]`.
     seed: A Python integer. Used to create a random seed. See
-      @{tf.set_random_seed}
+      `tf.set_random_seed`
       for behavior.
 
   Returns:
@@ -287,7 +287,7 @@
     image: 4-D Tensor of shape `[batch, height, width, channels]` or
            3-D Tensor of shape `[height, width, channels]`.
     seed: A Python integer. Used to create a random seed. See
-      @{tf.set_random_seed}
+      `tf.set_random_seed`
       for behavior.
 
   Returns:
@@ -307,7 +307,7 @@
       flip_index: The dimension along which to flip the image.
                   Vertical: 0, Horizontal: 1
       seed: A Python integer. Used to create a random seed. See
-        @{tf.set_random_seed}
+        `tf.set_random_seed`
         for behavior.
       scope_name: Name of the scope in which the ops are added.
 
@@ -948,7 +948,7 @@
 
   Resized images will be distorted if their original aspect ratio is not
   the same as `size`.  To avoid distortions see
-  @{tf.image.resize_image_with_pad}.
+  `tf.image.resize_image_with_pad`.
 
   `method` can be one of:
 
@@ -1167,7 +1167,7 @@
     _ImageDimensions(padded, rank=4)
 
     if not is_batch:
-      padded = array_ops.squeeze(padded, squeeze_dims=[0])
+      padded = array_ops.squeeze(padded, axis=[0])
 
     return padded
 
@@ -1227,7 +1227,7 @@
     image: An image.
     max_delta: float, must be non-negative.
     seed: A Python integer. Used to create a random seed. See
-      @{tf.set_random_seed}
+      `tf.set_random_seed`
       for behavior.
 
   Returns:
@@ -1255,7 +1255,7 @@
     lower: float.  Lower bound for the random contrast factor.
     upper: float.  Upper bound for the random contrast factor.
     seed: A Python integer. Used to create a random seed. See
-      @{tf.set_random_seed}
+      `tf.set_random_seed`
       for behavior.
 
   Returns:
diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py
index cf97618..2c61bb2 100644
--- a/tensorflow/python/ops/image_ops_test.py
+++ b/tensorflow/python/ops/image_ops_test.py
@@ -1410,6 +1410,14 @@
       y_tf = self._adjustContrastTf(x_np, contrast_factor)
       self.assertAllClose(y_tf, y_np, rtol=1e-5, atol=1e-5)
 
+  def testContrastFactorShape(self):
+    x_shape = [1, 2, 2, 3]
+    x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1]
+    x_np = np.array(x_data, dtype=np.uint8).reshape(x_shape)
+    with self.assertRaisesRegexp(
+        ValueError, 'Shape must be rank 0 but is rank 1'):
+      image_ops.adjust_contrast(x_np, [2.0])
+
 
 class AdjustBrightnessTest(test_util.TensorFlowTestCase):
 
@@ -1956,7 +1964,7 @@
           "all dims of 'image.shape' must be > 0",
           use_tensor_inputs_options=[False])
 
-      # The orignal error message does not contain back slashes. However, they
+      # The original error message does not contain back slashes. However, they
       # are added by either the assert op or the runtime. If this behavior
       # changes in the future, the match string will also needs to be changed.
       self._assertRaises(
@@ -2985,7 +2993,7 @@
           "all dims of 'image.shape' must be > 0",
           use_tensor_inputs_options=[False])
 
-      # The orignal error message does not contain back slashes. However, they
+      # The original error message does not contain back slashes. However, they
       # are added by either the assert op or the runtime. If this behavior
       # changes in the future, the match string will also needs to be changed.
       self._assertRaises(
@@ -3201,7 +3209,8 @@
   def testExisting(self):
     # Read some real PNGs, converting to different channel numbers
     prefix = "tensorflow/core/lib/png/testdata/"
-    inputs = (1, "lena_gray.png"), (4, "lena_rgba.png")
+    inputs = ((1, "lena_gray.png"), (4, "lena_rgba.png"),
+              (3, "lena_palette.png"), (4, "lena_palette_trns.png"))
     for channels_in, filename in inputs:
       for channels in 0, 1, 3, 4:
         with self.test_session(use_gpu=True) as sess:
@@ -3649,6 +3658,41 @@
       image_ops.non_max_suppression(boxes, scores, 3, [[0.5]])
 
 
+class NonMaxSuppressionPaddedTest(test_util.TensorFlowTestCase):
+
+  def testSelectFromThreeClusters(self):
+    boxes_np = [[0, 0, 1, 1], [0, 0.1, 1, 1.1], [0, -0.1, 1, 0.9],
+                [0, 10, 1, 11], [0, 10.1, 1, 11.1], [0, 100, 1, 101]]
+    scores_np = [0.9, 0.75, 0.6, 0.95, 0.5, 0.3]
+    max_output_size_np = 5
+    iou_threshold_np = 0.5
+    boxes = constant_op.constant(boxes_np)
+    scores = constant_op.constant(scores_np)
+    max_output_size = constant_op.constant(max_output_size_np)
+    iou_threshold = constant_op.constant(iou_threshold_np)
+    selected_indices_padded, num_valid_padded = \
+        image_ops.non_max_suppression_padded(
+            boxes,
+            scores,
+            max_output_size,
+            iou_threshold,
+            pad_to_max_output_size=True)
+    selected_indices, num_valid = image_ops.non_max_suppression_padded(
+        boxes,
+        scores,
+        max_output_size,
+        iou_threshold,
+        pad_to_max_output_size=False)
+    # The output shape of the padded operation must be fully defined.
+    self.assertEqual(selected_indices_padded.shape.is_fully_defined(), True)
+    self.assertEqual(selected_indices.shape.is_fully_defined(), False)
+    with self.test_session():
+      self.assertAllClose(selected_indices_padded.eval(), [3, 0, 5, 0, 0])
+      self.assertEqual(num_valid_padded.eval(), 3)
+      self.assertAllClose(selected_indices.eval(), [3, 0, 5])
+      self.assertEqual(num_valid.eval(), 3)
+
+
 class VerifyCompatibleImageShapesTest(test_util.TensorFlowTestCase):
   """Tests utility function used by ssim() and psnr()."""
 
diff --git a/tensorflow/python/ops/init_ops.py b/tensorflow/python/ops/init_ops.py
index c315722..4d75ee3 100644
--- a/tensorflow/python/ops/init_ops.py
+++ b/tensorflow/python/ops/init_ops.py
@@ -238,7 +238,7 @@
     maxval: A python scalar or a scalar tensor. Upper bound of the range
       of random values to generate.  Defaults to 1 for float types.
     seed: A Python integer. Used to create random seeds. See
-      @{tf.set_random_seed}
+      `tf.set_random_seed`
       for behavior.
     dtype: The data type.
   """
@@ -276,7 +276,7 @@
     stddev: a python scalar or a scalar tensor. Standard deviation of the
       random values to generate.
     seed: A Python integer. Used to create random seeds. See
-      @{tf.set_random_seed}
+      `tf.set_random_seed`
       for behavior.
     dtype: The data type. Only floating point types are supported.
   """
@@ -319,7 +319,7 @@
     stddev: a python scalar or a scalar tensor. Standard deviation of the
       random values to generate.
     seed: A Python integer. Used to create random seeds. See
-      @{tf.set_random_seed}
+      `tf.set_random_seed`
       for behavior.
     dtype: The data type. Only floating point types are supported.
   """
@@ -369,7 +369,7 @@
   Args:
     factor: Float.  A multiplicative factor by which the values will be scaled.
     seed: A Python integer. Used to create random seeds. See
-      @{tf.set_random_seed}
+      `tf.set_random_seed`
       for behavior.
     dtype: The data type. Only floating point types are supported.
   """
@@ -427,7 +427,7 @@
     mode: One of "fan_in", "fan_out", "fan_avg".
     distribution: Random distribution to use. One of "normal", "uniform".
     seed: A Python integer. Used to create random seeds. See
-      @{tf.set_random_seed}
+      `tf.set_random_seed`
       for behavior.
     dtype: The data type. Only floating point types are supported.
 
@@ -517,7 +517,7 @@
   Args:
     gain: multiplicative factor to apply to the orthogonal matrix
     seed: A Python integer. Used to create random seeds. See
-      @{tf.set_random_seed}
+      `tf.set_random_seed`
       for behavior.
     dtype: The data type.
   """
@@ -572,7 +572,7 @@
       The 2-norm of an input is multiplied by a factor of 'sqrt(gain)' after
       applying this convolution.
     seed: A Python integer. Used to create random seeds. See
-      @{tf.set_random_seed} for behavior.
+      `tf.set_random_seed` for behavior.
     dtype: The data type.
   """
 
@@ -628,7 +628,7 @@
       The 2-norm of an input is multiplied by a factor of 'sqrt(gain)' after
       applying this convolution.
     seed: A Python integer. Used to create random seeds. See
-      @{tf.set_random_seed} for behavior.
+      `tf.set_random_seed` for behavior.
     dtype: The data type.
   """
 
@@ -693,7 +693,7 @@
       This has the effect of scaling the output 2-norm by a factor of
       `sqrt(gain)`.
     seed: A Python integer. Used to create random seeds. See
-      @{tf.set_random_seed} for behavior.
+      `tf.set_random_seed` for behavior.
     dtype: The data type.
   """
 
@@ -829,7 +829,7 @@
       The 2-norm of an input is multiplied by a factor of 'sqrt(gain)' after
       applying this convolution.
     seed: A Python integer. Used to create random seeds. See
-      @{tf.set_random_seed}
+      `tf.set_random_seed`
       for behavior.
     dtype: The data type.
   """
@@ -946,7 +946,7 @@
       The 2-norm of an input is multiplied by a factor of 'sqrt(gain)' after
       applying this convolution.
     seed: A Python integer. Used to create random seeds. See
-      @{tf.set_random_seed} for behavior.
+      `tf.set_random_seed` for behavior.
     dtype: The data type.
   """
 
@@ -1150,7 +1150,7 @@
 
   Args:
     seed: A Python integer. Used to create random seeds. See
-      @{tf.set_random_seed}
+      `tf.set_random_seed`
       for behavior.
     dtype: The data type. Only floating point types are supported.
 
@@ -1175,7 +1175,7 @@
 
   Args:
     seed: A Python integer. Used to create random seeds. See
-      @{tf.set_random_seed}
+      `tf.set_random_seed`
       for behavior.
     dtype: The data type. Only floating point types are supported.
 
diff --git a/tensorflow/python/ops/io_ops.py b/tensorflow/python/ops/io_ops.py
index b5274ef..fbc1350 100644
--- a/tensorflow/python/ops/io_ops.py
+++ b/tensorflow/python/ops/io_ops.py
@@ -16,7 +16,8 @@
 # pylint: disable=line-too-long
 """Inputs and Readers.
 
-See the @{$python/io_ops} guide.
+See the [Inputs and
+Readers](https://tensorflow.org/api_guides/python/io_ops) guide.
 """
 
 from __future__ import absolute_import
diff --git a/tensorflow/python/ops/losses/losses_impl.py b/tensorflow/python/ops/losses/losses_impl.py
index 66633c8..8065397 100644
--- a/tensorflow/python/ops/losses/losses_impl.py
+++ b/tensorflow/python/ops/losses/losses_impl.py
@@ -190,10 +190,10 @@
     When calculating the gradient of a weighted loss contributions from
     both `losses` and `weights` are considered. If your `weights` depend
     on some model parameters but you do not want this to affect the loss
-    gradient, you need to apply @{tf.stop_gradient} to `weights` before
+    gradient, you need to apply `tf.stop_gradient` to `weights` before
     passing them to `compute_weighted_loss`.
 
-  @compatbility(eager)
+  @compatibility(eager)
   The `loss_collection` argument is ignored when executing eagerly. Consider
   holding on to the return value or collecting losses via a `tf.keras.Model`.
   @end_compatibility
@@ -266,7 +266,7 @@
       `labels` or if the shape of `weights` is invalid or if `labels`
       or `predictions` is None.
 
-  @compatbility(eager)
+  @compatibility(eager)
   The `loss_collection` argument is ignored when executing eagerly. Consider
   holding on to the return value or collecting losses via a `tf.keras.Model`.
   @end_compatibility
@@ -317,7 +317,7 @@
     ValueError: If `predictions` shape doesn't match `labels` shape, or
       `axis`, `labels`, `predictions` or `weights` is `None`.
 
-  @compatbility(eager)
+  @compatibility(eager)
   The `loss_collection` argument is ignored when executing eagerly. Consider
   holding on to the return value or collecting losses via a `tf.keras.Model`.
   @end_compatibility
@@ -369,7 +369,7 @@
     ValueError: If the shapes of `logits` and `labels` don't match or
       if `labels` or `logits` is None.
 
-  @compatbility(eager)
+  @compatibility(eager)
   The `loss_collection` argument is ignored when executing eagerly. Consider
   holding on to the return value or collecting losses via a `tf.keras.Model`.
   @end_compatibility
@@ -437,7 +437,7 @@
       if the shape of `weights` is invalid.  Also if `labels` or
      `predictions` is None.
 
-  @compatbility(eager)
+  @compatibility(eager)
   The `loss_collection` argument is ignored when executing eagerly. Consider
   holding on to the return value or collecting losses via a `tf.keras.Model`.
   @end_compatibility
@@ -503,7 +503,7 @@
       if the shape of `weights` is invalid.  Also if `labels` or `predictions`
       is None.
 
-  @compatbility(eager)
+  @compatibility(eager)
   The `loss_collection` argument is ignored when executing eagerly. Consider
   holding on to the return value or collecting losses via a `tf.keras.Model`.
   @end_compatibility
@@ -571,7 +571,7 @@
       if the shape of `weights` is invalid.  Also if `labels` or `predictions`
       is None.
 
-  @compatbility(eager)
+  @compatibility(eager)
   The `loss_collection` argument is ignored when executing eagerly. Consider
   holding on to the return value or collecting losses via a `tf.keras.Model`.
   @end_compatibility
@@ -654,7 +654,7 @@
       if the shape of `weights` is invalid.  Also if `labels` or `predictions`
       is None.
 
-  @compatbility(eager)
+  @compatibility(eager)
   The `loss_collection` argument is ignored when executing eagerly. Consider
   holding on to the return value or collecting losses via a `tf.keras.Model`.
   @end_compatibility
@@ -711,7 +711,7 @@
       `multi_class_labels` or if the shape of `weights` is invalid, or if
       `weights` is None.  Also if `multi_class_labels` or `logits` is None.
 
-  @compatbility(eager)
+  @compatibility(eager)
   The `loss_collection` argument is ignored when executing eagerly. Consider
   holding on to the return value or collecting losses via a `tf.keras.Model`.
   @end_compatibility
@@ -777,7 +777,7 @@
       or if the shape of `weights` is invalid or if `weights` is None.  Also if
       `onehot_labels` or `logits` is None.
 
-  @compatbility(eager)
+  @compatibility(eager)
   The `loss_collection` argument is ignored when executing eagerly. Consider
   holding on to the return value or collecting losses via a `tf.keras.Model`.
   @end_compatibility
@@ -894,7 +894,7 @@
     ValueError: If the shapes of `logits`, `labels`, and `weights` are
       incompatible, or if any of them are None.
 
-  @compatbility(eager)
+  @compatibility(eager)
   The `loss_collection` argument is ignored when executing eagerly. Consider
   holding on to the return value or collecting losses via a `tf.keras.Model`.
   @end_compatibility
diff --git a/tensorflow/python/ops/math_grad.py b/tensorflow/python/ops/math_grad.py
index f0c6bd5..2a7a2fd 100644
--- a/tensorflow/python/ops/math_grad.py
+++ b/tensorflow/python/ops/math_grad.py
@@ -972,6 +972,24 @@
                   grad * math_ops.realdiv(math_ops.realdiv(-x, y), y), ry), sy))
 
 
+@ops.RegisterGradient("UnsafeDiv")
+def _UnsafeDivGrad(op, grad):
+  """UnsafeDiv op gradient."""
+  x = op.inputs[0]
+  y = op.inputs[1]
+  sx = array_ops.shape(x)
+  sy = array_ops.shape(y)
+  rx, ry = gen_array_ops.broadcast_gradient_args(sx, sy)
+  x = math_ops.conj(x)
+  y = math_ops.conj(y)
+  return (array_ops.reshape(
+      math_ops.reduce_sum(math_ops.unsafe_div(grad, y), rx), sx),
+          array_ops.reshape(
+              math_ops.reduce_sum(
+                  grad * math_ops.unsafe_div(math_ops.unsafe_div(-x, y), y),
+                  ry), sy))
+
+
 @ops.RegisterGradient("Pow")
 def _PowGrad(op, grad):
   """Returns grad * (y*x^(y-1), z*log(x))."""
diff --git a/tensorflow/python/ops/math_grad_test.py b/tensorflow/python/ops/math_grad_test.py
index fa47b8f..f9bb60e 100644
--- a/tensorflow/python/ops/math_grad_test.py
+++ b/tensorflow/python/ops/math_grad_test.py
@@ -25,6 +25,7 @@
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
+from tensorflow.python.ops import gradients
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
@@ -230,5 +231,27 @@
       self.assertLess(error, 1e-4)
 
 
+class UnsafeDivGradientTest(test.TestCase):
+
+  def testBasicGradient(self):
+    inputs = constant_op.constant(np.arange(-3, 3), dtype=dtypes.float32)
+    outputs = math_ops.unsafe_div(inputs, 1 + math_ops.abs(inputs))
+    with self.test_session():
+      error = gradient_checker.compute_gradient_error(
+          inputs,
+          inputs.get_shape().as_list(), outputs,
+          outputs.get_shape().as_list())
+      self.assertLess(error, 1e-4)
+
+  def testGradientWithDenominatorIsZero(self):
+    x = constant_op.constant(np.arange(-3, 3), dtype=dtypes.float32)
+    y = array_ops.zeros_like(x, dtype=dtypes.float32)
+    outputs = math_ops.unsafe_div(x, y)
+    with self.test_session():
+      dx, dy = gradients.gradients(outputs, [x, y])
+      self.assertAllClose(dx.eval(), np.zeros(x.shape.as_list()))
+      self.assertAllClose(dy.eval(), np.zeros(y.shape.as_list()))
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index fbe6b62..4033d5f 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -14,7 +14,7 @@
 # ==============================================================================
 """Basic arithmetic operators.
 
-See the @{$python/math_ops} guide.
+See the [Basic arithmetic operators](https://tensorflow.org/api_guides/python/math_ops) guide.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -1038,6 +1038,31 @@
   return _div_python2(x, y, name)
 
 
+def unsafe_div(x, y, name=None):
+  """Computes an unsafe divide which returns 0 if `y` is zero.
+
+  Note that the function uses Python 3 division operator semantics.
+
+  Args:
+    x: A `Tensor`. Must be one of the following types:
+       `float32`, `float64`, `int16`, `int32`, `int64`.
+    y: A `Tensor` whose dtype is compatible with `x`.
+    name: A name for the operation (optional).
+  Returns:
+    The element-wise value of `x` divided by `y`.
+  """
+
+  with ops.name_scope(name, "unsafe_div", [x, y]) as name:
+    x = ops.convert_to_tensor(x, name="x")
+    y = ops.convert_to_tensor(y, name="y", dtype=x.dtype.base_dtype)
+    x_dtype = x.dtype.base_dtype
+    y_dtype = y.dtype.base_dtype
+    if x_dtype != y_dtype:
+      raise TypeError(
+          "x and y must have the same dtype, got %r != %r" % (x_dtype, y_dtype))
+    return gen_math_ops.unsafe_div(x, y, name=name)
+
+
 # TODO(aselle): This should be removed
 mod = gen_math_ops.floor_mod
 
@@ -2105,7 +2130,8 @@
   """Adds all input tensors element-wise.
 
   Args:
-    inputs: A list of `Tensor` objects, each with same shape and type.
+    inputs: A list of `Tensor` or `IndexedSlices` objects, each with same shape
+      and type.
     name: A name for the operation (optional).
 
   Returns:
@@ -2116,17 +2142,21 @@
     cannot be inferred.
   """
   if not inputs or not isinstance(inputs, (list, tuple)):
-    raise ValueError("inputs must be a list of at least one Tensor with the "
-                     "same dtype and shape")
+    raise ValueError("inputs must be a list of at least one "
+                     "Tensor/IndexedSlices with the same dtype and shape")
   inputs = ops.convert_n_to_tensor_or_indexed_slices(inputs)
-  if not all(isinstance(x, ops.Tensor) for x in inputs):
-    raise ValueError("inputs must be a list of at least one Tensor with the "
-                     "same dtype and shape")
+  if not all(isinstance(x, (ops.Tensor, ops.IndexedSlices)) for x in inputs):
+    raise ValueError("inputs must be a list of at least one "
+                     "Tensor/IndexedSlices with the same dtype and shape")
 
   if len(inputs) == 1:
+    if isinstance(inputs[0], ops.IndexedSlices):
+      values = inputs[0].values
+    else:
+      values = inputs[0]
     if name:
-      return array_ops.identity(inputs[0], name=name)
-    return inputs[0]
+      return array_ops.identity(values, name=name)
+    return values
   return gen_math_ops.add_n(inputs, name=name)
 
 
@@ -2534,8 +2564,9 @@
 def unsorted_segment_mean(data, segment_ids, num_segments, name=None):
   r""" Computes the mean along segments of a tensor.
 
-  Read @{$math_ops#segmentation$the section on segmentation} for an explanation
-  of segments.
+  Read [the section on
+  segmentation](https://tensorflow.org/api_guides/python/math_ops#segmentation)
+  for an explanation of segments.
 
   This operator is similar to the unsorted segment sum operator found
   [here](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
@@ -2566,8 +2597,9 @@
 def unsorted_segment_sqrt_n(data, segment_ids, num_segments, name=None):
   r"""Computes the sum along segments of a tensor divided by the sqrt(N).
 
-  Read @{$math_ops#segmentation$the section on segmentation} for an explanation
-  of segments.
+  Read [the section on
+  segmentation](https://tensorflow.org/api_guides/python/math_ops#segmentation)
+  for an explanation of segments.
 
   This operator is similar to the unsorted segment sum operator found
   [here](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
@@ -2602,8 +2634,9 @@
                        num_segments=None):
   r"""Computes the sum along sparse segments of a tensor.
 
-  Read @{$math_ops#Segmentation$the section on segmentation} for an explanation
-  of segments.
+  Read [the section on
+  segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+  for an explanation of segments.
 
   Like `SegmentSum`, but `segment_ids` can have rank less than `data`'s first
   dimension, selecting a subset of dimension 0, specified by `indices`.
@@ -2677,8 +2710,9 @@
                         num_segments=None):
   r"""Computes the mean along sparse segments of a tensor.
 
-  Read @{$math_ops#Segmentation$the section on segmentation} for an explanation
-  of segments.
+  Read [the section on
+  segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+  for an explanation of segments.
 
   Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
   dimension, selecting a subset of dimension 0, specified by `indices`.
diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py
index 6b709e5..5fe7bbc 100644
--- a/tensorflow/python/ops/math_ops_test.py
+++ b/tensorflow/python/ops/math_ops_test.py
@@ -473,5 +473,19 @@
       self.assertAllEqual(tf_result, expanded_nums)
 
 
+class UnsafeDivTest(test_util.TensorFlowTestCase):
+
+  def testBasic(self):
+    nums = np.arange(-10, 10, .25).reshape(80, 1)
+    divs = np.arange(-3, 3, .25).reshape(1, 24)
+
+    np_result = np.true_divide(nums, divs)
+    np_result[:, divs[0] == 0] = 0
+
+    with self.test_session():
+      tf_result = math_ops.unsafe_div(nums, divs).eval()
+      self.assertAllEqual(tf_result, np_result)
+
+
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/ops/metrics_impl.py b/tensorflow/python/ops/metrics_impl.py
index 3aedeb6..763877c 100644
--- a/tensorflow/python/ops/metrics_impl.py
+++ b/tensorflow/python/ops/metrics_impl.py
@@ -34,7 +34,7 @@
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import weights_broadcast_ops
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.util.deprecation import deprecated
 from tensorflow.python.util.tf_export import tf_export
 
@@ -57,7 +57,8 @@
       Furthermore, the final answer should be computed once instead of
       in every replica/tower. Both of these are accomplished by
       running the computation of the final result value inside
-      `tf.contrib.distribute.get_tower_context().merge_call(fn)`.
+      `tf.contrib.distribution_strategy_context.get_tower_context(
+      ).merge_call(fn)`.
       Inside the `merge_call()`, ops are only added to the graph once
       and access to a tower-local variable in a computation returns
       the sum across all replicas/towers.
@@ -300,6 +301,40 @@
   return total_cm, update_op
 
 
+def _aggregate_across_towers(metrics_collections, metric_value_fn, *args):
+  """Aggregate metric value across towers."""
+  def fn(distribution, *a):
+    """Call `metric_value_fn` in the correct control flow context."""
+    if hasattr(distribution, '_outer_control_flow_context'):
+      # If there was an outer context captured before this method was called,
+      # then we enter that context to create the metric value op. If the
+      # captured context is `None`, ops.control_dependencies(None) gives the
+      # desired behavior. Else we use `Enter` and `Exit` to enter and exit the
+      # captured context.
+      # This special handling is needed because sometimes the metric is created
+      # inside a while_loop (and perhaps a TPU rewrite context). But we don't
+      # want the value op to be evaluated every step or on the TPU. So we
+      # create it outside so that it can be evaluated at the end on the host,
+      # once the update ops have been evaluated.
+
+      # pylint: disable=protected-access
+      if distribution._outer_control_flow_context is None:
+        with ops.control_dependencies(None):
+          metric_value = metric_value_fn(distribution, *a)
+      else:
+        distribution._outer_control_flow_context.Enter()
+        metric_value = metric_value_fn(distribution, *a)
+        distribution._outer_control_flow_context.Exit()
+        # pylint: enable=protected-access
+    else:
+      metric_value = metric_value_fn(distribution, *a)
+    if metrics_collections:
+      ops.add_to_collections(metrics_collections, metric_value)
+    return metric_value
+
+  return distribution_strategy_context.get_tower_context().merge_call(fn, *args)
+
+
 @tf_export('metrics.mean')
 def mean(values,
          weights=None,
@@ -367,14 +402,10 @@
     with ops.control_dependencies([values]):
       update_count_op = state_ops.assign_add(count, num_values)
 
-    def aggregate_across_towers(_, t, c):
-      mean_t = _safe_div(t, c, 'value')
-      if metrics_collections:
-        ops.add_to_collections(metrics_collections, mean_t)
-      return mean_t
+    compute_mean = lambda _, t, c: _safe_div(t, c, 'value')
 
-    mean_t = distribute_lib.get_tower_context().merge_call(
-        aggregate_across_towers, total, count)
+    mean_t = _aggregate_across_towers(
+        metrics_collections, compute_mean, total, count)
     update_op = _safe_div(update_total_op, update_count_op, 'update_op')
 
     if updates_collections:
@@ -611,14 +642,8 @@
 
 
 def _aggregate_variable(v, collections):
-
-  def f(distribution, value):
-    value = distribution.read_var(value)
-    if collections:
-      ops.add_to_collections(collections, value)
-    return value
-
-  return distribute_lib.get_tower_context().merge_call(f, v)
+  f = lambda distribution, value: distribution.read_var(value)
+  return _aggregate_across_towers(collections, f, v)
 
 
 @tf_export('metrics.auc')
@@ -806,15 +831,12 @@
         raise ValueError('Invalid summation_method: %s' % summation_method)
 
     # sum up the areas of all the trapeziums
-    def aggregate_auc(_, values):
-      auc_value = compute_auc(values['tp'], values['fn'], values['tn'],
-                              values['fp'], 'value')
-      if metrics_collections:
-        ops.add_to_collections(metrics_collections, auc_value)
-      return auc_value
+    def compute_auc_value(_, values):
+      return compute_auc(values['tp'], values['fn'], values['tn'], values['fp'],
+                         'value')
 
-    auc_value = distribute_lib.get_tower_context().merge_call(
-        aggregate_auc, values)
+    auc_value = _aggregate_across_towers(
+        metrics_collections, compute_auc_value, values)
     update_op = compute_auc(update_ops['tp'], update_ops['fn'],
                             update_ops['tn'], update_ops['fp'], 'update_op')
 
@@ -1045,16 +1067,14 @@
     update_total_op = state_ops.scatter_add(total, labels, ones)
     update_count_op = state_ops.scatter_add(count, labels, is_correct)
 
-    def aggregate_mean_accuracy(_, count, total):
+    def compute_mean_accuracy(_, count, total):
       per_class_accuracy = _safe_div(count, total, None)
       mean_accuracy_v = math_ops.reduce_mean(
           per_class_accuracy, name='mean_accuracy')
-      if metrics_collections:
-        ops.add_to_collections(metrics_collections, mean_accuracy_v)
       return mean_accuracy_v
 
-    mean_accuracy_v = distribute_lib.get_tower_context().merge_call(
-        aggregate_mean_accuracy, count, total)
+    mean_accuracy_v = _aggregate_across_towers(
+        metrics_collections, compute_mean_accuracy, count, total)
 
     update_op = _safe_div(update_count_op, update_total_op, name='update_op')
     if updates_collections:
@@ -1127,7 +1147,7 @@
     total_cm, update_op = _streaming_confusion_matrix(labels, predictions,
                                                       num_classes, weights)
 
-    def compute_mean_iou(total_cm, name):
+    def compute_mean_iou(_, total_cm):
       """Compute the mean intersection-over-union via the confusion matrix."""
       sum_over_row = math_ops.to_float(math_ops.reduce_sum(total_cm, 0))
       sum_over_col = math_ops.to_float(math_ops.reduce_sum(total_cm, 1))
@@ -1151,17 +1171,12 @@
       # If the number of valid entries is 0 (no classes) we return 0.
       result = array_ops.where(
           math_ops.greater(num_valid_entries, 0),
-          math_ops.reduce_sum(iou, name=name) / num_valid_entries, 0)
+          math_ops.reduce_sum(iou, name='mean_iou') / num_valid_entries, 0)
       return result
 
-    def mean_iou_across_towers(_, v):
-      mean_iou_v = compute_mean_iou(v, 'mean_iou')
-      if metrics_collections:
-        ops.add_to_collections(metrics_collections, mean_iou_v)
-      return mean_iou_v
-
-    mean_iou_v = distribute_lib.get_tower_context().merge_call(
-        mean_iou_across_towers, total_cm)
+    # TODO(priyag): Use outside_compilation if in TPU context.
+    mean_iou_v = _aggregate_across_towers(
+        metrics_collections, compute_mean_iou, total_cm)
 
     if updates_collections:
       ops.add_to_collections(updates_collections, update_op)
@@ -1370,14 +1385,10 @@
     with ops.control_dependencies([values]):
       update_count_op = state_ops.assign_add(count, num_values)
 
-    def aggregate_across_towers(_, t, c):
-      mean_t = _safe_div(t, c, 'value')
-      if metrics_collections:
-        ops.add_to_collections(metrics_collections, mean_t)
-      return mean_t
+    compute_mean = lambda _, t, c: _safe_div(t, c, 'value')
 
-    mean_t = distribute_lib.get_tower_context().merge_call(
-        aggregate_across_towers, total, count)
+    mean_t = _aggregate_across_towers(
+        metrics_collections, compute_mean, total, count)
 
     update_op = _safe_div(update_total_op, update_count_op, 'update_op')
     if updates_collections:
@@ -2003,13 +2014,10 @@
           math_ops.greater(tp + fp, 0), math_ops.div(tp, tp + fp), 0, name)
 
     def once_across_towers(_, true_p, false_p):
-      p = compute_precision(true_p, false_p, 'value')
-      if metrics_collections:
-        ops.add_to_collections(metrics_collections, p)
-      return p
+      return compute_precision(true_p, false_p, 'value')
 
-    p = distribute_lib.get_tower_context().merge_call(
-        once_across_towers, true_p, false_p)
+    p = _aggregate_across_towers(metrics_collections, once_across_towers,
+                                 true_p, false_p)
 
     update_op = compute_precision(true_positives_update_op,
                                   false_positives_update_op, 'update_op')
@@ -2087,13 +2095,10 @@
       return math_ops.div(tp, epsilon + tp + fp, name='precision_' + name)
 
     def precision_across_towers(_, values):
-      prec = compute_precision(values['tp'], values['fp'], 'value')
-      if metrics_collections:
-        ops.add_to_collections(metrics_collections, prec)
-      return prec
+      return compute_precision(values['tp'], values['fp'], 'value')
 
-    prec = distribute_lib.get_tower_context().merge_call(
-        precision_across_towers, values)
+    prec = _aggregate_across_towers(
+        metrics_collections, precision_across_towers, values)
 
     update_op = compute_precision(update_ops['tp'], update_ops['fp'],
                                   'update_op')
@@ -2183,13 +2188,10 @@
           math_ops.div(true_p, true_p + false_n), 0, name)
 
     def once_across_towers(_, true_p, false_n):
-      rec = compute_recall(true_p, false_n, 'value')
-      if metrics_collections:
-        ops.add_to_collections(metrics_collections, rec)
-      return rec
+      return compute_recall(true_p, false_n, 'value')
 
-    rec = distribute_lib.get_tower_context().merge_call(
-        once_across_towers, true_p, false_n)
+    rec = _aggregate_across_towers(
+        metrics_collections, once_across_towers, true_p, false_n)
 
     update_op = compute_recall(true_positives_update_op,
                                false_negatives_update_op, 'update_op')
@@ -2621,14 +2623,11 @@
         class_id=class_id,
         weights=weights)
 
-    def aggregate_across_towers(_, tp, fn):
-      metric = math_ops.div(tp, math_ops.add(tp, fn), name=scope)
-      if metrics_collections:
-        ops.add_to_collections(metrics_collections, metric)
-      return metric
+    def compute_recall(_, tp, fn):
+      return math_ops.div(tp, math_ops.add(tp, fn), name=scope)
 
-    metric = distribute_lib.get_tower_context().merge_call(
-        aggregate_across_towers, tp, fn)
+    metric = _aggregate_across_towers(
+        metrics_collections, compute_recall, tp, fn)
 
     update = math_ops.div(
         tp_update, math_ops.add(tp_update, fn_update), name='update')
@@ -2703,13 +2702,10 @@
       return math_ops.div(tp, epsilon + tp + fn, name='recall_' + name)
 
     def recall_across_towers(_, values):
-      rec = compute_recall(values['tp'], values['fn'], 'value')
-      if metrics_collections:
-        ops.add_to_collections(metrics_collections, rec)
-      return rec
+      return compute_recall(values['tp'], values['fn'], 'value')
 
-    rec = distribute_lib.get_tower_context().merge_call(
-        recall_across_towers, values)
+    rec = _aggregate_across_towers(
+        metrics_collections, recall_across_towers, values)
 
     update_op = compute_recall(update_ops['tp'], update_ops['fn'], 'update_op')
     if updates_collections:
@@ -2777,14 +2773,9 @@
   mse, update_mse_op = mean_squared_error(labels, predictions, weights, None,
                                           None, name or
                                           'root_mean_squared_error')
-  def once_across_towers(_, mse):
-    rmse = math_ops.sqrt(mse)
-    if metrics_collections:
-      ops.add_to_collections(metrics_collections, rmse)
-    return rmse
 
-  rmse = distribute_lib.get_tower_context().merge_call(
-      once_across_towers, mse)
+  once_across_towers = lambda _, mse: math_ops.sqrt(mse)
+  rmse = _aggregate_across_towers(metrics_collections, once_across_towers, mse)
 
   update_rmse_op = math_ops.sqrt(update_mse_op)
   if updates_collections:
@@ -2879,15 +2870,12 @@
       return math_ops.div(tp[tf_index], tp[tf_index] + fn[tf_index] + kepsilon,
                           name)
 
-    def aggregate_across_towers(_, values):
-      sensitivity = compute_sensitivity_at_specificity(
+    def sensitivity_across_towers(_, values):
+      return compute_sensitivity_at_specificity(
           values['tp'], values['tn'], values['fp'], values['fn'], 'value')
-      if metrics_collections:
-        ops.add_to_collections(metrics_collections, sensitivity)
-      return sensitivity
 
-    sensitivity = distribute_lib.get_tower_context().merge_call(
-        aggregate_across_towers, values)
+    sensitivity = _aggregate_across_towers(
+        metrics_collections, sensitivity_across_towers, values)
 
     update_op = compute_sensitivity_at_specificity(
         update_ops['tp'], update_ops['tn'], update_ops['fp'], update_ops['fn'],
@@ -3156,14 +3144,11 @@
       total_update = state_ops.assign_add(total_var, batch_total, name='update')
 
     # Divide total by max to get mean, for both vars and the update ops.
-    def aggregate_across_towers(_, total_var, max_var):
-      mean_average_precision = _safe_scalar_div(total_var, max_var, name='mean')
-      if metrics_collections:
-        ops.add_to_collections(metrics_collections, mean_average_precision)
-      return mean_average_precision
+    def precision_across_towers(_, total_var, max_var):
+      return _safe_scalar_div(total_var, max_var, name='mean')
 
-    mean_average_precision = distribute_lib.get_tower_context().merge_call(
-        aggregate_across_towers, total_var, max_var)
+    mean_average_precision = _aggregate_across_towers(
+        metrics_collections, precision_across_towers, total_var, max_var)
 
     update = _safe_scalar_div(total_update, max_update, name=scope)
     if updates_collections:
@@ -3442,14 +3427,11 @@
         class_id=class_id,
         weights=weights)
 
-    def aggregate_across_towers(_, tp, fp):
-      metric = math_ops.div(tp, math_ops.add(tp, fp), name=scope)
-      if metrics_collections:
-        ops.add_to_collections(metrics_collections, metric)
-      return metric
+    def precision_across_towers(_, tp, fp):
+      return math_ops.div(tp, math_ops.add(tp, fp), name=scope)
 
-    metric = distribute_lib.get_tower_context().merge_call(
-        aggregate_across_towers, tp, fp)
+    metric = _aggregate_across_towers(
+        metrics_collections, precision_across_towers, tp, fp)
 
     update = math_ops.div(
         tp_update, math_ops.add(tp_update, fp_update), name='update')
@@ -3680,15 +3662,12 @@
       return math_ops.div(tn[tf_index], tn[tf_index] + fp[tf_index] + kepsilon,
                           name)
 
-    def aggregate_across_towers(_, values):
-      specificity = compute_specificity_at_sensitivity(
+    def specificity_across_towers(_, values):
+      return compute_specificity_at_sensitivity(
           values['tp'], values['tn'], values['fp'], values['fn'], 'value')
-      if metrics_collections:
-        ops.add_to_collections(metrics_collections, specificity)
-      return specificity
 
-    specificity = distribute_lib.get_tower_context().merge_call(
-        aggregate_across_towers, values)
+    specificity = _aggregate_across_towers(
+        metrics_collections, specificity_across_towers, values)
 
     update_op = compute_specificity_at_sensitivity(
         update_ops['tp'], update_ops['tn'], update_ops['fp'], update_ops['fn'],
diff --git a/tensorflow/python/ops/nn.py b/tensorflow/python/ops/nn.py
index 3396841..4b73fc8 100644
--- a/tensorflow/python/ops/nn.py
+++ b/tensorflow/python/ops/nn.py
@@ -16,7 +16,7 @@
 # pylint: disable=unused-import,g-bad-import-order
 """Neural network support.
 
-See the @{$python/nn} guide.
+See the [Neural network](https://tensorflow.org/api_guides/python/nn) guide.
 """
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py
index f47f38e..51f812b 100644
--- a/tensorflow/python/ops/nn_impl.py
+++ b/tensorflow/python/ops/nn_impl.py
@@ -425,7 +425,7 @@
     strides: 1-D of size 4.  The stride of the sliding window for each
       dimension of `input`.
     padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm.
-      See the @{tf.nn.convolution$comment here}
+      See the "returns" section of `tf.nn.convolution` for details.
     rate: 1-D of size 2. The dilation rate in which we sample input values
       across the `height` and `width` dimensions in atrous convolution. If it is
       greater than 1, then all values of strides must be 1.
@@ -507,7 +507,7 @@
     strides: 1-D of size 4.  The strides for the depthwise convolution for
       each dimension of `input`.
     padding: A string, either `'VALID'` or `'SAME'`.  The padding algorithm.
-      See the @{tf.nn.convolution$comment here}
+      See the "returns" section of `tf.nn.convolution` for details.
     rate: 1-D of size 2. The dilation rate in which we sample input values
       across the `height` and `width` dimensions in atrous convolution. If it is
       greater than 1, then all values of strides must be 1.
@@ -1189,7 +1189,7 @@
   Note: By default this uses a log-uniform (Zipfian) distribution for sampling,
   so your labels must be sorted in order of decreasing frequency to achieve
   good results.  For more details, see
-  @{tf.nn.log_uniform_candidate_sampler}.
+  `tf.nn.log_uniform_candidate_sampler`.
 
   Note: In the case where `num_true` > 1, we assign to each target class
   the target probability 1 / `num_true` so that the target probabilities
diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index 5cdb772..edc6e04 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -698,7 +698,7 @@
   `padded_input` is obtained by zero padding the input using an effective
   spatial filter shape of `(spatial_filter_shape-1) * dilation_rate + 1` and
   output striding `strides` as described in the
-  @{$python/nn#Convolution$comment here}.
+  [comment here](https://tensorflow.org/api_guides/python/nn#Convolution).
 
   In the case that `data_format` does start with `"NC"`, the `input` and output
   (but not the `filter`) are simply transposed as follows:
@@ -898,8 +898,8 @@
   ```
 
   where the reduction function REDUCE depends on the value of `pooling_type`,
-  and pad_before is defined based on the value of `padding` as described in the
-  @{tf.nn.convolution$comment here}.
+  and pad_before is defined based on the value of `padding` as described in
+  the "returns" section of `tf.nn.convolution`.
   The reduction never includes out-of-bounds positions.
 
   In the case that `data_format` starts with `"NC"`, the `input` and output are
@@ -921,7 +921,7 @@
     window_shape: Sequence of N ints >= 1.
     pooling_type: Specifies pooling operation, must be "AVG" or "MAX".
     padding: The padding algorithm, must be "SAME" or "VALID".
-      See the @{tf.nn.convolution$comment here}
+      See the "returns" section of `tf.nn.convolution` for details.
     dilation_rate: Optional.  Dilation rate.  List of N ints >= 1.
       Defaults to [1]*N.  If any value of dilation_rate is > 1, then all values
       of strides must be 1.
@@ -1045,8 +1045,8 @@
   """Atrous convolution (a.k.a. convolution with holes or dilated convolution).
 
   This function is a simpler wrapper around the more general
-  @{tf.nn.convolution}, and exists only for backwards compatibility. You can
-  use @{tf.nn.convolution} to perform 1-D, 2-D, or 3-D atrous convolution.
+  `tf.nn.convolution`, and exists only for backwards compatibility. You can
+  use `tf.nn.convolution` to perform 1-D, 2-D, or 3-D atrous convolution.
 
 
   Computes a 2-D atrous convolution, also known as convolution with holes or
@@ -1205,7 +1205,7 @@
     strides: A list of ints. The stride of the sliding window for each
       dimension of the input tensor.
     padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm.
-      See the @{tf.nn.convolution$comment here}
+      See the "returns" section of `tf.nn.convolution` for details.
     data_format: A string. 'NHWC' and 'NCHW' are supported.
     name: Optional name for the returned tensor.
 
@@ -1430,7 +1430,7 @@
     strides: A list of ints. The stride of the sliding window for each
       dimension of the input tensor.
     padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm.
-      See the @{tf.nn.convolution$comment here}
+      See the "returns" section of `tf.nn.convolution` for details.
     data_format: A string, either `'NDHWC'` or `'NCDHW`' specifying the layout
       of the input and output tensors. Defaults to `'NDHWC'`.
     name: Optional name for the returned tensor.
@@ -1819,7 +1819,7 @@
   or `float64`).
 
   Backpropagation will happen into both `logits` and `labels`.  To disallow
-  backpropagation into `labels`, pass label tensors through @{tf.stop_gradient}
+  backpropagation into `labels`, pass label tensors through `tf.stop_gradient`
   before feeding it to this function.
 
   **Note that to avoid confusion, it is required to pass only named arguments to
@@ -1836,8 +1836,9 @@
     name: A name for the operation (optional).
 
   Returns:
-    A `Tensor` of the same shape as `labels` and of the same type as `logits`
-    with the softmax cross entropy loss.
+    A `Tensor` that contains the softmax cross entropy loss. Its type is the
+    same as `logits` and its shape is the same as `labels` except that it does
+    not have the last dimension of `labels`.
   """
   _ensure_xent_args("softmax_cross_entropy_with_logits", _sentinel, labels,
                     logits)
@@ -1909,7 +1910,7 @@
 Future major versions of TensorFlow will allow gradients to flow
 into the labels input on backprop by default.
 
-See @{tf.nn.softmax_cross_entropy_with_logits_v2}.
+See `tf.nn.softmax_cross_entropy_with_logits_v2`.
 """
 
 
@@ -1946,7 +1947,7 @@
 
   Backpropagation will happen only into `logits`.  To calculate a cross entropy
   loss that allows backpropagation into both `logits` and `labels`, see
-  @{tf.nn.softmax_cross_entropy_with_logits_v2}.
+  `tf.nn.softmax_cross_entropy_with_logits_v2`.
 
   **Note that to avoid confusion, it is required to pass only named arguments to
   this function.**
@@ -1962,8 +1963,9 @@
     name: A name for the operation (optional).
 
   Returns:
-    A `Tensor` of the same shape as `labels` and of the same type as `logits`
-    with the softmax cross entropy loss.
+    A `Tensor` that contains the softmax cross entropy loss. Its type is the
+    same as `logits` and its shape is the same as `labels` except that it does
+    not have the last dimension of `labels`.
   """
   _ensure_xent_args("softmax_cross_entropy_with_logits", _sentinel, labels,
                     logits)
@@ -2003,8 +2005,8 @@
   A common use case is to have logits and labels of shape
   `[batch_size, num_classes]`, but higher dimensions are supported, in which
   case the `dim`-th dimension is assumed to be of size `num_classes`.
-  `logits` and `labels` must have the same dtype (either `float16`, `float32`,
-  or `float64`).
+  `logits` must have the dtype of `float16`, `float32`, or `float64`, and
+  `labels` must have the dtype of `int32` or `int64`.
 
   **Note that to avoid confusion, it is required to pass only named arguments to
   this function.**
@@ -2114,7 +2116,7 @@
     strides: A list or tuple of 4 ints. The stride of the sliding window for
       each dimension of the input tensor.
     padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm.
-      See the @{tf.nn.convolution$comment here}
+      See the "returns" section of `tf.nn.convolution` for details.
     data_format: A string. 'NHWC' and 'NCHW' are supported.
     name: Optional name for the operation.
 
@@ -2143,7 +2145,7 @@
     strides: A list or tuple of 4 ints. The stride of the sliding window for
       each dimension of the input tensor.
     padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm.
-      See the @{tf.nn.convolution$comment here}
+      See the "returns" section of `tf.nn.convolution` for details.
     data_format: A string. 'NHWC', 'NCHW' and 'NCHW_VECT_C' are supported.
     name: Optional name for the operation.
 
@@ -2301,7 +2303,7 @@
     noise_shape: A 1-D `Tensor` of type `int32`, representing the
       shape for randomly generated keep/drop flags.
     seed: A Python integer. Used to create random seeds. See
-      @{tf.set_random_seed}
+      `tf.set_random_seed`
       for behavior.
     name: A name for this operation (optional).
 
@@ -2521,7 +2523,7 @@
     stride: An `integer`.  The number of entries by which
       the filter is moved right at each step.
     padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm.
-      See the @{tf.nn.convolution$comment here}
+      See the "returns" section of `tf.nn.convolution` for details.
     data_format: A string. 'NHWC' and 'NCHW' are supported.
     name: Optional name for the returned tensor.
 
diff --git a/tensorflow/python/ops/nn_test.py b/tensorflow/python/ops/nn_test.py
index 4cd357d..ce0db6b 100644
--- a/tensorflow/python/ops/nn_test.py
+++ b/tensorflow/python/ops/nn_test.py
@@ -220,7 +220,7 @@
       output = nn_ops.l2_loss(x)
       err = gradient_checker.compute_gradient_error(x, x_shape, output, [1])
     print("L2Loss gradient err = %g " % err)
-    err_tolerance = 1e-11
+    err_tolerance = 1e-10
     self.assertLess(err, err_tolerance)
 
 
diff --git a/tensorflow/python/ops/numerics.py b/tensorflow/python/ops/numerics.py
index d348e47..8fcbd7d 100644
--- a/tensorflow/python/ops/numerics.py
+++ b/tensorflow/python/ops/numerics.py
@@ -56,8 +56,8 @@
   `check_numerics` op for all of its (`half`, `float`, or `double`) inputs
   is guaranteed to run before the `check_numerics` op on any of its outputs.
 
-  Note: This API is not compatible with the use of @{tf.cond} or
-  @{tf.while_loop}, and will raise a `ValueError` if you attempt to call it
+  Note: This API is not compatible with the use of `tf.cond` or
+  `tf.while_loop`, and will raise a `ValueError` if you attempt to call it
   in such a graph.
 
   Returns:
diff --git a/tensorflow/python/ops/parallel_for/BUILD b/tensorflow/python/ops/parallel_for/BUILD
index 6c804a5..015181a 100644
--- a/tensorflow/python/ops/parallel_for/BUILD
+++ b/tensorflow/python/ops/parallel_for/BUILD
@@ -85,6 +85,7 @@
 
 cuda_py_test(
     name = "control_flow_ops_test",
+    size = "large",
     srcs = ["control_flow_ops_test.py"],
     additional_deps = [
         ":control_flow_ops",
diff --git a/tensorflow/python/ops/parallel_for/pfor.py b/tensorflow/python/ops/parallel_for/pfor.py
index 77ec3bc..2e4b2fd 100644
--- a/tensorflow/python/ops/parallel_for/pfor.py
+++ b/tensorflow/python/ops/parallel_for/pfor.py
@@ -2117,7 +2117,7 @@
 # 2a Elements written to the array are "stacked"
 # To simulate multiple TensorArrays, we may increase the dimension of each
 # element of the array. i.e. the i_th row of the j_th entry of the converted
-# TensorArray corresponds to to the j_th entry of the TensorArray in the i_th
+# TensorArray corresponds to the j_th entry of the TensorArray in the i_th
 # pfor iteration.
 #
 # 2b Elements written to the array are "unstacked"
diff --git a/tensorflow/python/ops/random_ops.py b/tensorflow/python/ops/random_ops.py
index b8738ad..4baf506 100644
--- a/tensorflow/python/ops/random_ops.py
+++ b/tensorflow/python/ops/random_ops.py
@@ -61,7 +61,7 @@
     dtype: The type of the output.
     seed: A Python integer. Used to create a random seed for the distribution.
       See
-      @{tf.set_random_seed}
+      `tf.set_random_seed`
       for behavior.
     name: A name for the operation (optional).
 
@@ -110,7 +110,7 @@
     dtype: The type of the output.
     seed: A Python integer. Used to create a random seed for the distribution.
       See
-      @{tf.set_random_seed}
+      `tf.set_random_seed`
       for behavior.
     name: A name for the operation (optional).
 
@@ -158,7 +158,7 @@
     dtype: The type of the output.
     seed: A Python integer. Used to create a random seed for the distribution.
       See
-      @{tf.set_random_seed}
+      `tf.set_random_seed`
       for behavior.
     name: A name for the operation (optional).
 
@@ -212,7 +212,7 @@
     dtype: The type of the output: `float16`, `float32`, `float64`, `int32`,
       or `int64`.
     seed: A Python integer. Used to create a random seed for the distribution.
-      See @{tf.set_random_seed}
+      See `tf.set_random_seed`
       for behavior.
     name: A name for the operation (optional).
 
@@ -264,7 +264,7 @@
     value: A Tensor to be shuffled.
     seed: A Python integer. Used to create a random seed for the distribution.
       See
-      @{tf.set_random_seed}
+      `tf.set_random_seed`
       for behavior.
     name: A name for the operation (optional).
 
@@ -292,7 +292,7 @@
     value: Input tensor to crop.
     size: 1-D tensor with size the rank of `value`.
     seed: Python integer. Used to create a random seed. See
-      @{tf.set_random_seed}
+      `tf.set_random_seed`
       for behavior.
     name: A name for this operation (optional).
 
@@ -338,7 +338,7 @@
     num_samples: 0-D.  Number of independent samples to draw for each row slice.
     seed: A Python integer. Used to create a random seed for the distribution.
       See
-      @{tf.set_random_seed}
+      `tf.set_random_seed`
       for behavior.
     name: Optional name for the operation.
     output_dtype: integer type to use for the output. Defaults to int64.
@@ -417,7 +417,7 @@
       `float64`.
     seed: A Python integer. Used to create a random seed for the distributions.
       See
-      @{tf.set_random_seed}
+      `tf.set_random_seed`
       for behavior.
     name: Optional name for the operation.
 
@@ -467,7 +467,7 @@
       `int64`.
     seed: A Python integer. Used to create a random seed for the distributions.
       See
-      @{tf.set_random_seed}
+      `tf.set_random_seed`
       for behavior.
     name: Optional name for the operation.
 
diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py
index d533731..3d0205f 100644
--- a/tensorflow/python/ops/resource_variable_ops.py
+++ b/tensorflow/python/ops/resource_variable_ops.py
@@ -94,26 +94,8 @@
         ops.set_shape_and_handle_data_for_outputs(h.op)
       handle._handle_data = h._handle_data
     # pylint: enable=protected-access
-
-  # Clean up our reference cycles to avoid making the garbage collector run.
-  # pylint: disable=protected-access
-  # OrderedDict, constructed on Graph creation, makes a simple reference loop
-  # and hides it in an __attribute in some Python versions. We don't need to
-  # throw an error if we can't find it, but if we do find it we can break the
-  # loop to avoid creating work for the garbage collector.
-  problematic_cycle = graph._functions.__dict__.get("_OrderedDict__root", None)
-  # pylint: enable=protected-access
-  if problematic_cycle:
-    try:
-      del problematic_cycle[0][:]
-    except TypeError:
-      # This is probably not one of the problematic Python versions. Continue
-      # with the rest of our cleanup.
-      pass
-  # Now clean up our own reference cycles by clearing all of the attributes for
-  # the Graph and op we created.
-  h.__dict__ = {}
-  graph.__dict__ = {}
+  # Clean up op->graph->op reference cycles.
+  ops.dismantle_graph(graph)
   return handle
 
 
@@ -185,7 +167,8 @@
 class ResourceVariable(variables.RefVariable):
   """Variable based on resource handles.
 
-  See the @{$variables$Variables How To} for a high level overview.
+  See the [Variables How To](https://tensorflow.org/guide/variables)
+  for a high level overview.
 
   A `ResourceVariable` allows you to maintain state across subsequent calls to
   session.run.
diff --git a/tensorflow/python/ops/rnn_cell_impl.py b/tensorflow/python/ops/rnn_cell_impl.py
index 8356fbb..85a6a22 100644
--- a/tensorflow/python/ops/rnn_cell_impl.py
+++ b/tensorflow/python/ops/rnn_cell_impl.py
@@ -34,6 +34,9 @@
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
+from tensorflow.python.keras import activations
+from tensorflow.python.keras import initializers
+from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.layers import base as base_layer
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
@@ -190,6 +193,13 @@
   for each `s` in `self.batch_size`.
   """
 
+  def __init__(self, trainable=True, name=None, dtype=None, **kwargs):
+    super(RNNCell, self).__init__(
+        trainable=trainable, name=name, dtype=dtype, **kwargs)
+    # Attribute that indicates whether the cell is a TF RNN cell, due to the
+    # slight difference between TF and Keras RNN cells.
+    self._is_tf_rnn_cell = True
+
   def __call__(self, inputs, state, scope=None):
     """Run this RNN cell on inputs, starting from the given state.
 
@@ -336,7 +346,8 @@
 
   Args:
     num_units: int, The number of units in the RNN cell.
-    activation: Nonlinearity to use.  Default: `tanh`.
+    activation: Nonlinearity to use.  Default: `tanh`. It could also be a
+      string that is the name of a Keras activation function.
     reuse: (optional) Python boolean describing whether to reuse variables
      in an existing scope.  If not `True`, and the existing scope already has
      the given variables, an error is raised.
@@ -345,6 +356,8 @@
       cases.
     dtype: Default dtype of the layer (default of `None` means use the type
       of the first input). Required when `build` is called before `call`.
+    **kwargs: Dict, keyword named properties for common layer attributes, like
+      `trainable`, etc., when constructing the cell from configs of get_config().
   """
 
   def __init__(self,
@@ -352,14 +365,19 @@
                activation=None,
                reuse=None,
                name=None,
-               dtype=None):
-    super(BasicRNNCell, self).__init__(_reuse=reuse, name=name, dtype=dtype)
+               dtype=None,
+               **kwargs):
+    super(BasicRNNCell, self).__init__(
+        _reuse=reuse, name=name, dtype=dtype, **kwargs)
 
     # Inputs must be 2-dimensional.
     self.input_spec = base_layer.InputSpec(ndim=2)
 
     self._num_units = num_units
-    self._activation = activation or math_ops.tanh
+    if activation:
+      self._activation = activations.get(activation)
+    else:
+      self._activation = math_ops.tanh
 
   @property
   def state_size(self):
@@ -369,12 +387,13 @@
   def output_size(self):
     return self._num_units
 
+  @tf_utils.shape_type_conversion
   def build(self, inputs_shape):
-    if inputs_shape[1].value is None:
+    if inputs_shape[-1] is None:
       raise ValueError("Expected inputs.shape[-1] to be known, saw shape: %s"
                        % inputs_shape)
 
-    input_depth = inputs_shape[1].value
+    input_depth = inputs_shape[-1]
     self._kernel = self.add_variable(
         _WEIGHTS_VARIABLE_NAME,
         shape=[input_depth + self._num_units, self._num_units])
@@ -394,6 +413,15 @@
     output = self._activation(gate_inputs)
     return output, output
 
+  def get_config(self):
+    config = {
+        "num_units": self._num_units,
+        "activation": activations.serialize(self._activation),
+        "reuse": self._reuse,
+    }
+    base_config = super(BasicRNNCell, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
 
 @tf_export("nn.rnn_cell.GRUCell")
 class GRUCell(LayerRNNCell):
@@ -413,6 +441,8 @@
       cases.
     dtype: Default dtype of the layer (default of `None` means use the type
       of the first input). Required when `build` is called before `call`.
+    **kwargs: Dict, keyword named properties for common layer attributes, like
+      `trainable`, etc., when constructing the cell from configs of get_config().
   """
 
   def __init__(self,
@@ -422,16 +452,21 @@
                kernel_initializer=None,
                bias_initializer=None,
                name=None,
-               dtype=None):
-    super(GRUCell, self).__init__(_reuse=reuse, name=name, dtype=dtype)
+               dtype=None,
+               **kwargs):
+    super(GRUCell, self).__init__(
+        _reuse=reuse, name=name, dtype=dtype, **kwargs)
 
     # Inputs must be 2-dimensional.
     self.input_spec = base_layer.InputSpec(ndim=2)
 
     self._num_units = num_units
-    self._activation = activation or math_ops.tanh
-    self._kernel_initializer = kernel_initializer
-    self._bias_initializer = bias_initializer
+    if activation:
+      self._activation = activations.get(activation)
+    else:
+      self._activation = math_ops.tanh
+    self._kernel_initializer = initializers.get(kernel_initializer)
+    self._bias_initializer = initializers.get(bias_initializer)
 
   @property
   def state_size(self):
@@ -441,12 +476,13 @@
   def output_size(self):
     return self._num_units
 
+  @tf_utils.shape_type_conversion
   def build(self, inputs_shape):
-    if inputs_shape[1].value is None:
+    if inputs_shape[-1] is None:
       raise ValueError("Expected inputs.shape[-1] to be known, saw shape: %s"
                        % inputs_shape)
 
-    input_depth = inputs_shape[1].value
+    input_depth = inputs_shape[-1]
     self._gate_kernel = self.add_variable(
         "gates/%s" % _WEIGHTS_VARIABLE_NAME,
         shape=[input_depth + self._num_units, 2 * self._num_units],
@@ -492,6 +528,17 @@
     new_h = u * state + (1 - u) * c
     return new_h, new_h
 
+  def get_config(self):
+    config = {
+        "num_units": self._num_units,
+        "kernel_initializer": initializers.serialize(self._kernel_initializer),
+        "bias_initializer": initializers.serialize(self._bias_initializer),
+        "activation": activations.serialize(self._activation),
+        "reuse": self._reuse,
+    }
+    base_config = super(GRUCell, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
 
 _LSTMStateTuple = collections.namedtuple("LSTMStateTuple", ("c", "h"))
 
@@ -531,7 +578,7 @@
   It does not allow cell clipping, a projection layer, and does not
   use peep-hole connections: it is the basic baseline.
 
-  For advanced models, please use the full @{tf.nn.rnn_cell.LSTMCell}
+  For advanced models, please use the full `tf.nn.rnn_cell.LSTMCell`
   that follows.
   """
 
@@ -546,7 +593,8 @@
                activation=None,
                reuse=None,
                name=None,
-               dtype=None):
+               dtype=None,
+               **kwargs):
     """Initialize the basic LSTM cell.
 
     Args:
@@ -557,7 +605,8 @@
       state_is_tuple: If True, accepted and returned states are 2-tuples of
         the `c_state` and `m_state`.  If False, they are concatenated
         along the column axis.  The latter behavior will soon be deprecated.
-      activation: Activation function of the inner states.  Default: `tanh`.
+      activation: Activation function of the inner states.  Default: `tanh`. It
+        could also be a string that is the name of a Keras activation function.
       reuse: (optional) Python boolean describing whether to reuse variables
         in an existing scope.  If not `True`, and the existing scope already has
         the given variables, an error is raised.
@@ -566,11 +615,14 @@
         cases.
       dtype: Default dtype of the layer (default of `None` means use the type
         of the first input). Required when `build` is called before `call`.
+      **kwargs: Dict, keyword named properties for common layer attributes, like
+        `trainable`, etc., when constructing the cell from configs of get_config().
 
       When restoring from CudnnLSTM-trained checkpoints, must use
       `CudnnCompatibleLSTMCell` instead.
     """
-    super(BasicLSTMCell, self).__init__(_reuse=reuse, name=name, dtype=dtype)
+    super(BasicLSTMCell, self).__init__(
+        _reuse=reuse, name=name, dtype=dtype, **kwargs)
     if not state_is_tuple:
       logging.warn("%s: Using a concatenated state is slower and will soon be "
                    "deprecated.  Use state_is_tuple=True.", self)
@@ -581,7 +633,10 @@
     self._num_units = num_units
     self._forget_bias = forget_bias
     self._state_is_tuple = state_is_tuple
-    self._activation = activation or math_ops.tanh
+    if activation:
+      self._activation = activations.get(activation)
+    else:
+      self._activation = math_ops.tanh
 
   @property
   def state_size(self):
@@ -592,12 +647,13 @@
   def output_size(self):
     return self._num_units
 
+  @tf_utils.shape_type_conversion
   def build(self, inputs_shape):
-    if inputs_shape[1].value is None:
+    if inputs_shape[-1] is None:
       raise ValueError("Expected inputs.shape[-1] to be known, saw shape: %s"
                        % inputs_shape)
 
-    input_depth = inputs_shape[1].value
+    input_depth = inputs_shape[-1]
     h_depth = self._num_units
     self._kernel = self.add_variable(
         _WEIGHTS_VARIABLE_NAME,
@@ -655,6 +711,17 @@
       new_state = array_ops.concat([new_c, new_h], 1)
     return new_h, new_state
 
+  def get_config(self):
+    config = {
+        "num_units": self._num_units,
+        "forget_bias": self._forget_bias,
+        "state_is_tuple": self._state_is_tuple,
+        "activation": activations.serialize(self._activation),
+        "reuse": self._reuse,
+    }
+    base_config = super(BasicLSTMCell, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
 
 @tf_export("nn.rnn_cell.LSTMCell")
 class LSTMCell(LayerRNNCell):
@@ -684,7 +751,7 @@
                initializer=None, num_proj=None, proj_clip=None,
                num_unit_shards=None, num_proj_shards=None,
                forget_bias=1.0, state_is_tuple=True,
-               activation=None, reuse=None, name=None, dtype=None):
+               activation=None, reuse=None, name=None, dtype=None, **kwargs):
     """Initialize the parameters for an LSTM cell.
 
     Args:
@@ -710,7 +777,8 @@
       state_is_tuple: If True, accepted and returned states are 2-tuples of
         the `c_state` and `m_state`.  If False, they are concatenated
         along the column axis.  This latter behavior will soon be deprecated.
-      activation: Activation function of the inner states.  Default: `tanh`.
+      activation: Activation function of the inner states.  Default: `tanh`. It
+        could also be a string that is the name of a Keras activation function.
       reuse: (optional) Python boolean describing whether to reuse variables
         in an existing scope.  If not `True`, and the existing scope already has
         the given variables, an error is raised.
@@ -719,11 +787,14 @@
         cases.
       dtype: Default dtype of the layer (default of `None` means use the type
         of the first input). Required when `build` is called before `call`.
+      **kwargs: Dict, keyword named properties for common layer attributes, like
+        `trainable`, etc., when constructing the cell from configs of get_config().
 
       When restoring from CudnnLSTM-trained checkpoints, use
       `CudnnCompatibleLSTMCell` instead.
     """
-    super(LSTMCell, self).__init__(_reuse=reuse, name=name, dtype=dtype)
+    super(LSTMCell, self).__init__(
+        _reuse=reuse, name=name, dtype=dtype, **kwargs)
     if not state_is_tuple:
       logging.warn("%s: Using a concatenated state is slower and will soon be "
                    "deprecated.  Use state_is_tuple=True.", self)
@@ -739,14 +810,17 @@
     self._num_units = num_units
     self._use_peepholes = use_peepholes
     self._cell_clip = cell_clip
-    self._initializer = initializer
+    self._initializer = initializers.get(initializer)
     self._num_proj = num_proj
     self._proj_clip = proj_clip
     self._num_unit_shards = num_unit_shards
     self._num_proj_shards = num_proj_shards
     self._forget_bias = forget_bias
     self._state_is_tuple = state_is_tuple
-    self._activation = activation or math_ops.tanh
+    if activation:
+      self._activation = activations.get(activation)
+    else:
+      self._activation = math_ops.tanh
 
     if num_proj:
       self._state_size = (
@@ -767,12 +841,13 @@
   def output_size(self):
     return self._output_size
 
+  @tf_utils.shape_type_conversion
   def build(self, inputs_shape):
-    if inputs_shape[1].value is None:
+    if inputs_shape[-1] is None:
       raise ValueError("Expected inputs.shape[-1] to be known, saw shape: %s"
                        % inputs_shape)
 
-    input_depth = inputs_shape[1].value
+    input_depth = inputs_shape[-1]
     h_depth = self._num_units if self._num_proj is None else self._num_proj
     maybe_partitioner = (
         partitioned_variables.fixed_size_partitioner(self._num_unit_shards)
@@ -886,6 +961,24 @@
                  array_ops.concat([c, m], 1))
     return m, new_state
 
+  def get_config(self):
+    config = {
+        "num_units": self._num_units,
+        "use_peepholes": self._use_peepholes,
+        "cell_clip": self._cell_clip,
+        "initializer": initializers.serialize(self._initializer),
+        "num_proj": self._num_proj,
+        "proj_clip": self._proj_clip,
+        "num_unit_shards": self._num_unit_shards,
+        "num_proj_shards": self._num_proj_shards,
+        "forget_bias": self._forget_bias,
+        "state_is_tuple": self._state_is_tuple,
+        "activation": activations.serialize(self._activation),
+        "reuse": self._reuse,
+    }
+    base_config = super(LSTMCell, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
 
 def _enumerated_map_structure_up_to(shallow_structure, map_fn, *args, **kwargs):
   ix = [0]
diff --git a/tensorflow/python/ops/script_ops.py b/tensorflow/python/ops/script_ops.py
index af103d3..8d66de6 100644
--- a/tensorflow/python/ops/script_ops.py
+++ b/tensorflow/python/ops/script_ops.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Script Language Operators. See the @{$python/script_ops} guide."""
+"""Script Language Operators."""
 
 # pylint: disable=g-bad-name
 from __future__ import absolute_import
@@ -313,8 +313,8 @@
   in a once-differentiable TensorFlow operation that executes it with eager
   exeuction enabled. As a consequence, `tf.contrib.eager.py_func` makes it
   possible to express control flow using Python constructs (`if`, `while`,
-  `for`, etc.), instead of TensorFlow control flow constructs (@{tf.cond},
-  @{tf.while_loop}). For example, you might use `tf.contrib.eager.py_func` to
+  `for`, etc.), instead of TensorFlow control flow constructs (`tf.cond`,
+  `tf.while_loop`). For example, you might use `tf.contrib.eager.py_func` to
   implement the log huber function:
 
   ```python
@@ -343,17 +343,18 @@
   or print statements as desired, and wrap those functions in
   `tf.contrib.eager.py_func`.
 
-  For more information on eager execution, see @{$guide/eager}.
+  For more information on eager execution, see the
+  [Eager guide](https://tensorflow.org/guide/eager).
 
-  `tf.contrib.eager.py_func` is similar in spirit to @{tf.py_func}, but unlike
+  `tf.contrib.eager.py_func` is similar in spirit to `tf.py_func`, but unlike
   the latter, the former lets you use TensorFlow operations in the wrapped
-  Python function. In particular, while @{tf.py_func} only runs on CPUs and
+  Python function. In particular, while `tf.py_func` only runs on CPUs and
   wraps functions that take NumPy arrays as inputs and return NumPy arrays as
   outputs, `tf.contrib.eager.py_func` can be placed on GPUs and wraps functions
   that take Tensors as inputs, execute TensorFlow operations in their bodies,
   and return Tensors as outputs.
 
-  Like @{tf.py_func}, `tf.contrib.eager.py_func` has the following limitations
+  Like `tf.py_func`, `tf.contrib.eager.py_func` has the following limitations
   with respect to serialization and distribution:
 
   * The body of the function (i.e. `func`) will not be serialized in a
diff --git a/tensorflow/python/ops/session_ops.py b/tensorflow/python/ops/session_ops.py
index dee84ba..e229501 100644
--- a/tensorflow/python/ops/session_ops.py
+++ b/tensorflow/python/ops/session_ops.py
@@ -13,7 +13,11 @@
 # limitations under the License.
 # ==============================================================================
 
-"""Tensor Handle Operations. See the @{$python/session_ops} guide."""
+"""Tensor Handle Operations.
+
+See the [Session Ops](https://tensorflow.org/api_guides/python/session_ops)
+guide.
+"""
 
 # pylint: disable=g-bad-name
 from __future__ import absolute_import
diff --git a/tensorflow/python/ops/sparse_ops.py b/tensorflow/python/ops/sparse_ops.py
index c3b16a7..e91813b 100644
--- a/tensorflow/python/ops/sparse_ops.py
+++ b/tensorflow/python/ops/sparse_ops.py
@@ -14,7 +14,10 @@
 # ==============================================================================
 
 # pylint: disable=g-short-docstring-punctuation
-"""Sparse Tensor Representation. See the @{$python/sparse_ops} guide."""
+"""Sparse Tensor Representation.
+
+See the [Sparse Ops](https://tensorflow.org/api_guides/python/sparse_ops) guide.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -777,8 +780,10 @@
 
 
 @tf_export("sparse_reduce_max")
-def sparse_reduce_max(sp_input, axis=None, keep_dims=False,
-                      reduction_axes=None):
+@deprecation.deprecated_args(
+    None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
+def sparse_reduce_max(sp_input, axis=None, keepdims=None,
+                      reduction_axes=None, keep_dims=None):
   """Computes the max of elements across dimensions of a SparseTensor.
 
   This Op takes a SparseTensor and is the sparse counterpart to
@@ -786,14 +791,19 @@
   instead of a sparse one.
 
   Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-  `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-  `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+  `keepdims` is true, the rank of the tensor is reduced by 1 for each entry in
+  `reduction_axes`. If `keepdims` is true, the reduced dimensions are retained
   with length 1.
 
   If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
   with a single element is returned.  Additionally, the axes can be negative,
   similar to the indexing rules in Python.
 
+  The values not defined in `sp_input` don't participate in the reduce max,
+  as opposed to being implicitly assumed 0 -- hence it can return negative
+  values for sparse `reduction_axes`. But, in case there are no values in
+  `reduction_axes`, it will reduce to 0. See the second example below.
+
   For example:
 
   ```python
@@ -803,30 +813,44 @@
   tf.sparse_reduce_max(x) ==> 3
   tf.sparse_reduce_max(x, 0) ==> [1, 3, 2]
   tf.sparse_reduce_max(x, 1) ==> [2, 3]  # Can also use -1 as the axis.
-  tf.sparse_reduce_max(x, 1, keep_dims=True) ==> [[2], [3]]
+  tf.sparse_reduce_max(x, 1, keepdims=True) ==> [[2], [3]]
   tf.sparse_reduce_max(x, [0, 1]) ==> 3
+
+  # 'y' represents [[-7, ?]
+  #                 [ 4, 3]
+  #                 [ ?, ?]]
+  tf.sparse_reduce_max(y, 1) ==> [-7, 4, 0]
   ```
 
   Args:
     sp_input: The SparseTensor to reduce. Should have numeric type.
     axis: The dimensions to reduce; list or scalar. If `None` (the
       default), reduces all dimensions.
-    keep_dims: If true, retain reduced dimensions with length 1.
+    keepdims: If true, retain reduced dimensions with length 1.
     reduction_axes: Deprecated name of axis.
+    keep_dims: Deprecated alias for `keepdims`.
 
   Returns:
     The reduced Tensor.
   """
+  keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
+                                                    "keep_dims", keep_dims)
+  if keepdims is None:
+    keepdims = False
+
   return gen_sparse_ops.sparse_reduce_max(
       sp_input.indices, sp_input.values, sp_input.dense_shape,
-      math_ops._ReductionDims(sp_input, axis, reduction_axes), keep_dims)
+      math_ops._ReductionDims(sp_input, axis, reduction_axes), keepdims)
 
 
 @tf_export("sparse_reduce_max_sparse")
+@deprecation.deprecated_args(
+    None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
 def sparse_reduce_max_sparse(sp_input,
                              axis=None,
-                             keep_dims=False,
-                             reduction_axes=None):
+                             keepdims=None,
+                             reduction_axes=None,
+                             keep_dims=None):
   """Computes the max of elements across dimensions of a SparseTensor.
 
   This Op takes a SparseTensor and is the sparse counterpart to
@@ -834,8 +858,8 @@
   SparseTensor.
 
   Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-  `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-  `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+  `keepdims` is true, the rank of the tensor is reduced by 1 for each entry in
+  `reduction_axes`. If `keepdims` is true, the reduced dimensions are retained
   with length 1.
 
   If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
@@ -846,23 +870,31 @@
     sp_input: The SparseTensor to reduce. Should have numeric type.
     axis: The dimensions to reduce; list or scalar. If `None` (the
       default), reduces all dimensions.
-    keep_dims: If true, retain reduced dimensions with length 1.
-    reduction_axes: Deprecated name of axis
+    keepdims: If true, retain reduced dimensions with length 1.
+    reduction_axes: Deprecated name of axis.
+    keep_dims: Deprecated alias for `keepdims`.
 
   Returns:
     The reduced SparseTensor.
   """
+  keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
+                                                    "keep_dims", keep_dims)
+  if keepdims is None:
+    keepdims = False
+
   output_ind, output_val, output_shape = (
       gen_sparse_ops.sparse_reduce_max_sparse(
           sp_input.indices, sp_input.values, sp_input.dense_shape,
-          math_ops._ReductionDims(sp_input, axis, reduction_axes), keep_dims))
+          math_ops._ReductionDims(sp_input, axis, reduction_axes), keepdims))
 
   return sparse_tensor.SparseTensor(output_ind, output_val, output_shape)
 
 
 @tf_export("sparse_reduce_sum")
-def sparse_reduce_sum(sp_input, axis=None, keep_dims=False,
-                      reduction_axes=None):
+@deprecation.deprecated_args(
+    None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
+def sparse_reduce_sum(sp_input, axis=None, keepdims=None,
+                      reduction_axes=None, keep_dims=None):
   """Computes the sum of elements across dimensions of a SparseTensor.
 
   This Op takes a SparseTensor and is the sparse counterpart to
@@ -870,8 +902,8 @@
   instead of a sparse one.
 
   Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-  `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-  `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+  `keepdims` is true, the rank of the tensor is reduced by 1 for each entry in
+  `reduction_axes`. If `keepdims` is true, the reduced dimensions are retained
   with length 1.
 
   If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
@@ -887,7 +919,7 @@
   tf.sparse_reduce_sum(x) ==> 3
   tf.sparse_reduce_sum(x, 0) ==> [1, 1, 1]
   tf.sparse_reduce_sum(x, 1) ==> [2, 1]  # Can also use -1 as the axis.
-  tf.sparse_reduce_sum(x, 1, keep_dims=True) ==> [[2], [1]]
+  tf.sparse_reduce_sum(x, 1, keepdims=True) ==> [[2], [1]]
   tf.sparse_reduce_sum(x, [0, 1]) ==> 3
   ```
 
@@ -895,22 +927,31 @@
     sp_input: The SparseTensor to reduce. Should have numeric type.
     axis: The dimensions to reduce; list or scalar. If `None` (the
       default), reduces all dimensions.
-    keep_dims: If true, retain reduced dimensions with length 1.
+    keepdims: If true, retain reduced dimensions with length 1.
     reduction_axes: Deprecated name of axis.
+    keep_dims: Deprecated alias for `keepdims`.
 
   Returns:
     The reduced Tensor.
   """
+  keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
+                                                    "keep_dims", keep_dims)
+  if keepdims is None:
+    keepdims = False
+
   return gen_sparse_ops.sparse_reduce_sum(
       sp_input.indices, sp_input.values, sp_input.dense_shape,
-      math_ops._ReductionDims(sp_input, axis, reduction_axes), keep_dims)
+      math_ops._ReductionDims(sp_input, axis, reduction_axes), keepdims)
 
 
 @tf_export("sparse_reduce_sum_sparse")
+@deprecation.deprecated_args(
+    None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
 def sparse_reduce_sum_sparse(sp_input,
                              axis=None,
-                             keep_dims=False,
-                             reduction_axes=None):
+                             keepdims=None,
+                             reduction_axes=None,
+                             keep_dims=None):
   """Computes the sum of elements across dimensions of a SparseTensor.
 
   This Op takes a SparseTensor and is the sparse counterpart to
@@ -918,8 +959,8 @@
   SparseTensor.
 
   Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-  `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-  `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+  `keepdims` is true, the rank of the tensor is reduced by 1 for each entry in
+  `reduction_axes`. If `keepdims` is true, the reduced dimensions are retained
   with length 1.
 
   If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
@@ -930,16 +971,22 @@
     sp_input: The SparseTensor to reduce. Should have numeric type.
     axis: The dimensions to reduce; list or scalar. If `None` (the
       default), reduces all dimensions.
-    keep_dims: If true, retain reduced dimensions with length 1.
-    reduction_axes: Deprecated name of axis
+    keepdims: If true, retain reduced dimensions with length 1.
+    reduction_axes: Deprecated name of axis.
+    keep_dims: Deprecated alias for `keepdims`.
 
   Returns:
     The reduced SparseTensor.
   """
+  keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
+                                                    "keep_dims", keep_dims)
+  if keepdims is None:
+    keepdims = False
+
   output_ind, output_val, output_shape = (
       gen_sparse_ops.sparse_reduce_sum_sparse(
           sp_input.indices, sp_input.values, sp_input.dense_shape,
-          math_ops._ReductionDims(sp_input, axis, reduction_axes), keep_dims))
+          math_ops._ReductionDims(sp_input, axis, reduction_axes), keepdims))
 
   return sparse_tensor.SparseTensor(output_ind, output_val, output_shape)
 
diff --git a/tensorflow/python/ops/spectral_ops.py b/tensorflow/python/ops/spectral_ops.py
index 293aace..da5884e 100644
--- a/tensorflow/python/ops/spectral_ops.py
+++ b/tensorflow/python/ops/spectral_ops.py
@@ -180,9 +180,9 @@
   """Computes the 1D [Discrete Cosine Transform (DCT)][dct] of `input`.
 
   Currently only Types II and III are supported. Type II is implemented using a
-  length `2N` padded @{tf.spectral.rfft}, as described here:
+  length `2N` padded `tf.spectral.rfft`, as described here:
   https://dsp.stackexchange.com/a/10606. Type III is a fairly straightforward
-  inverse of Type II (i.e. using a length `2N` padded @{tf.spectral.irfft}).
+  inverse of Type II (i.e. using a length `2N` padded `tf.spectral.irfft`).
 
   @compatibility(scipy)
   Equivalent to scipy.fftpack.dct for Type-II and Type-III DCT.
diff --git a/tensorflow/python/ops/state_ops.py b/tensorflow/python/ops/state_ops.py
index 2c93cf7..125e6c8 100644
--- a/tensorflow/python/ops/state_ops.py
+++ b/tensorflow/python/ops/state_ops.py
@@ -13,7 +13,10 @@
 # limitations under the License.
 # ==============================================================================
 
-"""Variables. See the @{$python/state_ops} guide."""
+"""Variables.
+
+See the [Variables](https://tensorflow.org/api_guides/python/state_ops) guide.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -329,7 +332,7 @@
 
       [1, 11, 3, 10, 9, 6, 7, 12]
 
-  See @{tf.scatter_nd} for more details about how to make updates to
+  See `tf.scatter_nd` for more details about how to make updates to
   slices.
 
   Args:
@@ -443,7 +446,7 @@
 
       [1, 13, 3, 14, 14, 6, 7, 20]
 
-  See @{tf.scatter_nd} for more details about how to make updates to
+  See `tf.scatter_nd` for more details about how to make updates to
   slices.
 
   Args:
@@ -470,3 +473,57 @@
   return ref._lazy_read(gen_state_ops.resource_scatter_nd_add(  # pylint: disable=protected-access
       ref.handle, indices, ops.convert_to_tensor(updates, ref.dtype),
       name=name))
+
+
+@tf_export("scatter_sub")
+def scatter_sub(ref, indices, updates, use_locking=False, name=None):
+  r"""Subtracts sparse updates from a variable reference.
+
+  ```python
+      # Scalar indices
+      ref[indices, ...] -= updates[...]
+
+      # Vector indices (for each i)
+      ref[indices[i], ...] -= updates[i, ...]
+
+      # High rank indices (for each i, ..., j)
+      ref[indices[i, ..., j], ...] -= updates[i, ..., j, ...]
+  ```
+
+  This operation outputs `ref` after the update is done.
+  This makes it easier to chain operations that need to use the reset value.
+
+  Duplicate entries are handled correctly: if multiple `indices` reference
+  the same location, their (negated) contributions add.
+
+  Requires `updates.shape = indices.shape + ref.shape[1:]` or
+  `updates.shape = []`.
+
+  <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:100%"
+       src="https://www.tensorflow.org/images/ScatterSub.png" alt>
+  </div>
+
+  Args:
+    ref: A mutable `Tensor`. Must be one of the following types: `float32`,
+      `float64`, `int32`, `uint8`, `int16`, `int8`, `complex64`, `int64`,
+      `qint8`, `quint8`, `qint32`, `bfloat16`, `uint16`, `complex128`, `half`,
+      `uint32`, `uint64`. Should be from a `Variable` node.
+    indices: A `Tensor`. Must be one of the following types: `int32`, `int64`.
+      A tensor of indices into the first dimension of `ref`.
+    updates: A `Tensor`. Must have the same type as `ref`.
+      A tensor of updated values to subtract from `ref`.
+    use_locking: An optional `bool`. Defaults to `False`.
+      If True, the subtraction will be protected by a lock;
+      otherwise the behavior is undefined, but may exhibit less contention.
+    name: A name for the operation (optional).
+
+  Returns:
+    A mutable `Tensor`. Has the same type as `ref`.
+  """
+  if ref.dtype._is_ref_dtype:
+    return gen_state_ops.scatter_sub(ref, indices, updates,
+                                     use_locking=use_locking, name=name)
+  return ref._lazy_read(gen_resource_variable_ops.resource_scatter_sub(  # pylint: disable=protected-access
+      ref.handle, indices, ops.convert_to_tensor(updates, ref.dtype),
+      name=name))
diff --git a/tensorflow/python/ops/string_ops.py b/tensorflow/python/ops/string_ops.py
index 0280c89..67ae2e6 100644
--- a/tensorflow/python/ops/string_ops.py
+++ b/tensorflow/python/ops/string_ops.py
@@ -15,7 +15,7 @@
 
 """Operations for working with string Tensors.
 
-See the @{$python/string_ops} guide.
+See the [Strings](https://tensorflow.org/api_guides/python/string_ops) guide.
 """
 
 from __future__ import absolute_import
diff --git a/tensorflow/python/ops/summary_op_util.py b/tensorflow/python/ops/summary_op_util.py
index a793f63..b382c3b 100644
--- a/tensorflow/python/ops/summary_op_util.py
+++ b/tensorflow/python/ops/summary_op_util.py
@@ -23,7 +23,7 @@
 
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import tf_logging
-from tensorflow.python.training import distribute
+from tensorflow.python.training import distribution_strategy_context
 
 
 def collect(val, collections, default_collections):
@@ -49,7 +49,7 @@
   # TODO(priyag): Add a new optional argument that will provide multiple
   # alternatives to override default behavior. (e.g. run on last tower,
   # compute sum or mean across towers).
-  tower_context = distribute.get_tower_context()
+  tower_context = distribution_strategy_context.get_tower_context()
   return tower_context and tower_context.tower_id > 0
 
 
diff --git a/tensorflow/python/ops/summary_ops_v2.py b/tensorflow/python/ops/summary_ops_v2.py
index 00150fe..94c7d88 100644
--- a/tensorflow/python/ops/summary_ops_v2.py
+++ b/tensorflow/python/ops/summary_ops_v2.py
@@ -110,8 +110,8 @@
   """Encapsulates a stateful summary writer resource.
 
   See also:
-  - @{tf.contrib.summary.create_file_writer}
-  - @{tf.contrib.summary.create_db_writer}
+  - `tf.contrib.summary.create_file_writer`
+  - `tf.contrib.summary.create_db_writer`
   """
 
   def  __init__(self, resource, init_op_fn):
@@ -174,22 +174,22 @@
   """Initializes summary writing for graph execution mode.
 
   This helper method provides a higher-level alternative to using
-  @{tf.contrib.summary.summary_writer_initializer_op} and
-  @{tf.contrib.summary.graph}.
+  `tf.contrib.summary.summary_writer_initializer_op` and
+  `tf.contrib.summary.graph`.
 
-  Most users will also want to call @{tf.train.create_global_step}
+  Most users will also want to call `tf.train.create_global_step`
   which can happen before or after this function is called.
 
   Args:
-    graph: A @{tf.Graph} or @{tf.GraphDef} to output to the writer.
+    graph: A `tf.Graph` or `tf.GraphDef` to output to the writer.
       This function will not write the default graph by default. When
       writing to an event log file, the associated step will be zero.
-    session: So this method can call @{tf.Session.run}. This defaults
-      to @{tf.get_default_session}.
+    session: So this method can call `tf.Session.run`. This defaults
+      to `tf.get_default_session`.
 
   Raises:
     RuntimeError: If  the current thread has no default
-      @{tf.contrib.summary.SummaryWriter}.
+      `tf.contrib.summary.SummaryWriter`.
     ValueError: If session wasn't passed and no default session.
   """
   if context.executing_eagerly():
@@ -278,10 +278,10 @@
       Experiment will not be associated with a User. Must be valid as
       both a DNS label and Linux username.
     name: Shared name for this SummaryWriter resource stored to default
-      @{tf.Graph}.
+      `tf.Graph`.
 
   Returns:
-    A @{tf.contrib.summary.SummaryWriter} instance.
+    A `tf.contrib.summary.SummaryWriter` instance.
   """
   with ops.device("cpu:0"):
     if experiment_name is None:
@@ -328,7 +328,7 @@
 def all_summary_ops():
   """Graph-mode only. Returns all summary ops.
 
-  Please note this excludes @{tf.contrib.summary.graph} ops.
+  Please note this excludes `tf.contrib.summary.graph` ops.
 
   Returns:
     The summary ops.
@@ -410,20 +410,20 @@
 def scalar(name, tensor, family=None, step=None):
   """Writes a scalar summary if possible.
 
-  Unlike @{tf.contrib.summary.generic} this op may change the dtype
+  Unlike `tf.contrib.summary.generic` this op may change the dtype
   depending on the writer, for both practical and efficiency concerns.
 
   Args:
     name: An arbitrary name for this summary.
-    tensor: A @{tf.Tensor} Must be one of the following types:
+    tensor: A `tf.Tensor` Must be one of the following types:
       `float32`, `float64`, `int32`, `int64`, `uint8`, `int16`,
       `int8`, `uint16`, `half`, `uint32`, `uint64`.
     family: Optional, the summary's family.
     step: The `int64` monotonic step variable, which defaults
-      to @{tf.train.get_global_step}.
+      to `tf.train.get_global_step`.
 
   Returns:
-    The created @{tf.Operation} or a @{tf.no_op} if summary writing has
+    The created `tf.Operation` or a `tf.no_op` if summary writing has
     not been enabled for this context.
   """
 
@@ -494,31 +494,31 @@
   """Writes a TensorFlow graph to the summary interface.
 
   The graph summary is, strictly speaking, not a summary. Conditions
-  like @{tf.contrib.summary.never_record_summaries} do not apply. Only
+  like `tf.contrib.summary.never_record_summaries` do not apply. Only
   a single graph can be associated with a particular run. If multiple
   graphs are written, then only the last one will be considered by
   TensorBoard.
 
   When not using eager execution mode, the user should consider passing
-  the `graph` parameter to @{tf.contrib.summary.initialize} instead of
+  the `graph` parameter to `tf.contrib.summary.initialize` instead of
   calling this function. Otherwise special care needs to be taken when
   using the graph to record the graph.
 
   Args:
-    param: A @{tf.Tensor} containing a serialized graph proto. When
+    param: A `tf.Tensor` containing a serialized graph proto. When
       eager execution is enabled, this function will automatically
-      coerce @{tf.Graph}, @{tf.GraphDef}, and string types.
+      coerce `tf.Graph`, `tf.GraphDef`, and string types.
     step: The global step variable. This doesn't have useful semantics
       for graph summaries, but is used anyway, due to the structure of
       event log files. This defaults to the global step.
     name: A name for the operation (optional).
 
   Returns:
-    The created @{tf.Operation} or a @{tf.no_op} if summary writing has
+    The created `tf.Operation` or a `tf.no_op` if summary writing has
     not been enabled for this context.
 
   Raises:
-    TypeError: If `param` isn't already a @{tf.Tensor} in graph mode.
+    TypeError: If `param` isn't already a `tf.Tensor` in graph mode.
   """
   if not context.executing_eagerly() and not isinstance(param, ops.Tensor):
     raise TypeError("graph() needs a tf.Tensor (e.g. tf.placeholder) in graph "
@@ -539,21 +539,21 @@
 
 
 def import_event(tensor, name=None):
-  """Writes a @{tf.Event} binary proto.
+  """Writes a `tf.Event` binary proto.
 
   When using create_db_writer(), this can be used alongside
-  @{tf.TFRecordReader} to load event logs into the database. Please
+  `tf.TFRecordReader` to load event logs into the database. Please
   note that this is lower level than the other summary functions and
   will ignore any conditions set by methods like
-  @{tf.contrib.summary.should_record_summaries}.
+  `tf.contrib.summary.should_record_summaries`.
 
   Args:
-    tensor: A @{tf.Tensor} of type `string` containing a serialized
-      @{tf.Event} proto.
+    tensor: A `tf.Tensor` of type `string` containing a serialized
+      `tf.Event` proto.
     name: A name for the operation (optional).
 
   Returns:
-    The created @{tf.Operation}.
+    The created `tf.Operation`.
   """
   return gen_summary_ops.import_event(
       context.context().summary_writer_resource, tensor, name=name)
@@ -565,13 +565,13 @@
   This operation blocks until that finishes.
 
   Args:
-    writer: The @{tf.contrib.summary.SummaryWriter} resource to flush.
+    writer: The `tf.contrib.summary.SummaryWriter` resource to flush.
       The thread default will be used if this parameter is None.
-      Otherwise a @{tf.no_op} is returned.
+      Otherwise a `tf.no_op` is returned.
     name: A name for the operation (optional).
 
   Returns:
-    The created @{tf.Operation}.
+    The created `tf.Operation`.
   """
   if writer is None:
     writer = context.context().summary_writer_resource
@@ -593,7 +593,7 @@
 
 
 def create_summary_file_writer(*args, **kwargs):
-  """Please use @{tf.contrib.summary.create_file_writer}."""
+  """Please use `tf.contrib.summary.create_file_writer`."""
   logging.warning("Deprecation Warning: create_summary_file_writer was renamed "
                   "to create_file_writer")
   return create_file_writer(*args, **kwargs)
diff --git a/tensorflow/python/ops/template.py b/tensorflow/python/ops/template.py
index 161d968..e7ad261 100644
--- a/tensorflow/python/ops/template.py
+++ b/tensorflow/python/ops/template.py
@@ -128,7 +128,7 @@
       template of the same scope/unique_name already exists and reuse is false,
       an error is raised. Defaults to None.
     custom_getter_: Optional custom getter for variables used in `func_`. See
-      the @{tf.get_variable} `custom_getter` documentation for
+      the `tf.get_variable` `custom_getter` documentation for
       more information.
     **kwargs: Keyword arguments to apply to `func_`.
 
@@ -176,7 +176,7 @@
       template of the same scope/unique_name already exists and reuse is false,
       an error is raised. Defaults to None. If executing eagerly, must be None.
     custom_getter_: Optional custom getter for variables used in `func_`. See
-      the @{tf.get_variable} `custom_getter` documentation for
+      the `tf.get_variable` `custom_getter` documentation for
       more information.
     create_graph_function_: When True, `func_` will be executed as a graph
       function. This implies that `func_` must satisfy the properties that
@@ -298,9 +298,10 @@
 
   def _call_func(self, args, kwargs):
     try:
-      vars_at_start = len(ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))
+      vars_at_start = len(
+          ops.get_collection_ref(ops.GraphKeys.GLOBAL_VARIABLES))
       trainable_at_start = len(
-          ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES))
+          ops.get_collection_ref(ops.GraphKeys.TRAINABLE_VARIABLES))
       if self._variables_created:
         result = self._func(*args, **kwargs)
       else:
@@ -313,7 +314,7 @@
         # Variables were previously created, implying this is not the first
         # time the template has been called. Check to make sure that no new
         # trainable variables were created this time around.
-        trainable_variables = ops.get_collection(
+        trainable_variables = ops.get_collection_ref(
             ops.GraphKeys.TRAINABLE_VARIABLES)
         # If a variable that we intend to train is created as a side effect
         # of creating a template, then that is almost certainly an error.
@@ -326,7 +327,7 @@
         # Non-trainable tracking variables are a legitimate reason why a new
         # variable would be created, but it is a relatively advanced use-case,
         # so log it.
-        variables = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+        variables = ops.get_collection_ref(ops.GraphKeys.GLOBAL_VARIABLES)
         if vars_at_start != len(variables):
           logging.info("New variables created when calling a template after "
                        "the first time, perhaps you used tf.Variable when you "
diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py
index aca44bc..46bcd68 100644
--- a/tensorflow/python/ops/variable_scope.py
+++ b/tensorflow/python/ops/variable_scope.py
@@ -42,6 +42,7 @@
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import function_utils
 from tensorflow.python.util import tf_contextlib
+from tensorflow.python.util import tf_inspect
 from tensorflow.python.util.tf_export import tf_export
 
 __all__ = [
@@ -314,13 +315,13 @@
         use when doing asynchronous distributed training.
       synchronization: Indicates when a distributed a variable will be
         aggregated. Accepted values are constants defined in the class
-        @{tf.VariableSynchronization}. By default the synchronization is set to
+        `tf.VariableSynchronization`. By default the synchronization is set to
         `AUTO` and the current `DistributionStrategy` chooses
         when to synchronize. If `synchronization` is set to `ON_READ`,
         `trainable` must not be set to `True`.
       aggregation: Indicates how a distributed variable will be aggregated.
         Accepted values are constants defined in the class
-        @{tf.VariableAggregation}.
+        `tf.VariableAggregation`.
 
     Returns:
       The created or existing `Variable` (or `PartitionedVariable`, if a
@@ -837,9 +838,6 @@
       raise ValueError("Variable %s does not exist, or was not created with "
                        "tf.get_variable(). Did you mean to set "
                        "reuse=tf.AUTO_REUSE in VarScope?" % name)
-    if not shape.is_fully_defined() and not initializing_from_value:
-      raise ValueError("Shape of a new variable (%s) must be fully defined, "
-                       "but instead was %s." % (name, shape))
 
     # Create the tensor to initialize the variable with default value.
     if initializer is None:
@@ -854,8 +852,17 @@
         # Instantiate initializer if provided initializer is a type object.
         if isinstance(initializer, type(init_ops.Initializer)):
           initializer = initializer(dtype=dtype)
-        init_val = lambda: initializer(  # pylint: disable=g-long-lambda
-            shape.as_list(), dtype=dtype, partition_info=partition_info)
+        if shape and shape.is_fully_defined():
+          init_val = lambda: initializer(  # pylint: disable=g-long-lambda
+              shape.as_list(), dtype=dtype, partition_info=partition_info)
+        elif not tf_inspect.getargspec(initializer).args:
+          init_val = initializer
+        else:
+          raise ValueError("You can only pass an initializer function that "
+                           "expects no arguments to its callable when the "
+                           "shape is not fully defined. The given initializer "
+                           "function expects the following args %s" %
+                           tf_inspect.getargspec(initializer).args)
         variable_dtype = dtype.base_dtype
 
     # Create the variable.
@@ -1440,12 +1447,11 @@
       aggregation=aggregation)
 
 
-get_variable_or_local_docstring = (
-    """%s
+get_variable_or_local_docstring = ("""%s
 
 %sThis function prefixes the name with the current variable scope
 and performs reuse checks. See the
-@{$variables$Variable Scope How To}
+[Variable Scope How To](https://tensorflow.org/guide/variables)
 for an extensive description of how reusing works. Here is a basic example:
 
 ```python
@@ -1484,7 +1490,7 @@
     unless validate_shape is False.
   regularizer: A (Tensor -> Tensor or None) function; the result of
     applying it on a newly created variable will be added to the collection
-    @{tf.GraphKeys.REGULARIZATION_LOSSES} and can be used for regularization.
+    `tf.GraphKeys.REGULARIZATION_LOSSES` and can be used for regularization.
   %scollections: List of graph collections keys to add the Variable to.
     Defaults to `[%s]` (see `tf.Variable`).
   caching_device: Optional device string or function describing where the
@@ -1895,8 +1901,8 @@
 
   Variable scope allows you to create new variables and to share already created
   ones while providing checks to not create or share by accident. For details,
-  see the @{$variables$Variable Scope How To}, here we present only a few basic
-  examples.
+  see the [Variable Scope How To](https://tensorflow.org/guide/variables), here
+  we present only a few basic examples.
 
   Simple example of how to create a new variable:
 
@@ -2445,13 +2451,13 @@
       use_resource: if True, a ResourceVariable is always created.
       synchronization: Indicates when a distributed a variable will be
         aggregated. Accepted values are constants defined in the class
-        @{tf.VariableSynchronization}. By default the synchronization is set to
+        `tf.VariableSynchronization`. By default the synchronization is set to
         `AUTO` and the current `DistributionStrategy` chooses
         when to synchronize. If `synchronization` is set to `ON_READ`,
         `trainable` must not be set to `True`.
       aggregation: Indicates how a distributed variable will be aggregated.
         Accepted values are constants defined in the class
-        @{tf.VariableAggregation}.
+        `tf.VariableAggregation`.
 
   This set may grow over time, so it's important the signature of creators is as
   mentioned above.
diff --git a/tensorflow/python/ops/variables.py b/tensorflow/python/ops/variables.py
index fc00ce6..7a28615 100644
--- a/tensorflow/python/ops/variables.py
+++ b/tensorflow/python/ops/variables.py
@@ -135,7 +135,7 @@
 @tf_export("Variable")
 class Variable(six.with_metaclass(VariableMetaclass,
                                   checkpointable.CheckpointableBase)):
-  """See the @{$variables$Variables How To} for a high level overview.
+  """See the [Variables Guide](https://tensorflow.org/guide/variables).
 
   A variable maintains state in the graph across calls to `run()`. You add a
   variable to the graph by constructing an instance of the class `Variable`.
@@ -320,13 +320,13 @@
        a resource variable is always created.
       synchronization: Indicates when a distributed a variable will be
         aggregated. Accepted values are constants defined in the class
-        @{tf.VariableSynchronization}. By default the synchronization is set to
+        `tf.VariableSynchronization`. By default the synchronization is set to
         `AUTO` and the current `DistributionStrategy` chooses
         when to synchronize. If `synchronization` is set to `ON_READ`,
         `trainable` must not be set to `True`.
       aggregation: Indicates how a distributed variable will be aggregated.
         Accepted values are constants defined in the class
-        @{tf.VariableAggregation}.
+        `tf.VariableAggregation`.
 
     Raises:
       ValueError: If both `variable_def` and initial_value are specified.
@@ -388,7 +388,7 @@
 
     This convenience method requires a session where the graph
     containing this variable has been launched. If no session is
-    passed, the default session is used.  See @{tf.Session} for more
+    passed, the default session is used.  See `tf.Session` for more
     information on launching a graph and on sessions.
 
     ```python
@@ -551,7 +551,7 @@
 
     This convenience method requires a session where the graph
     containing this variable has been launched. If no session is
-    passed, the default session is used.  See @{tf.Session} for more
+    passed, the default session is used.  See `tf.Session` for more
     information on launching a graph and on sessions.
 
     ```python
@@ -1106,7 +1106,7 @@
   def _AsTensor(self):  # pylint: disable=invalid-name
     """Converts this variable to a Tensor.
 
-    See @{tf.Variable.value}.
+    See `tf.Variable.value`.
 
     Returns:
       A `Tensor` containing the value of the variable.
@@ -1163,7 +1163,7 @@
 
     Returns is a `Tensor` which holds a reference to the variable.  You can
     assign a new value to the variable by passing the tensor to an assign op.
-    See @{tf.Variable.value} if you want to get the value of the
+    See `tf.Variable.value` if you want to get the value of the
     variable.
 
     Returns:
@@ -1191,7 +1191,7 @@
 
     This convenience method requires a session where the graph
     containing this variable has been launched. If no session is
-    passed, the default session is used.  See @{tf.Session} for more
+    passed, the default session is used.  See `tf.Session` for more
     information on launching a graph and on sessions.
 
     ```python
@@ -1386,7 +1386,7 @@
 
     This convenience method requires a session where the graph
     containing this variable has been launched. If no session is
-    passed, the default session is used.  See @{tf.Session} for more
+    passed, the default session is used.  See `tf.Session` for more
     information on launching a graph and on sessions.
 
     ```python
@@ -1917,15 +1917,10 @@
   def as_tensor(self):
     """Returns the overall concatenated value as a `Tensor`.
 
-    The returned tensor will not inherit the control dependencies from the scope
-    where the value is used, which is similar to getting the value of
-    `Variable`.
-
     Returns:
       `Tensor` containing the concatenated value.
     """
-    with ops.control_dependencies(None):
-      return self._concat()
+    return self._concat()
 
   @staticmethod
   def _TensorConversionFunction(v, dtype=None, name=None, as_ref=False):
@@ -1979,7 +1974,7 @@
   This convenience function returns the contents of that collection.
 
   An alternative to global variables are local variables. See
-  @{tf.local_variables}
+  `tf.local_variables`
 
   Args:
     scope: (Optional.) A string. If supplied, the resulting list is filtered
@@ -2032,7 +2027,7 @@
   This convenience function returns the contents of that collection.
 
   An alternative to local variables are global variables. See
-  @{tf.global_variables}
+  `tf.global_variables`
 
   Args:
     scope: (Optional.) A string. If supplied, the resulting list is filtered
diff --git a/tensorflow/python/platform/test.py b/tensorflow/python/platform/test.py
index 9ffb48c..5dc4037 100644
--- a/tensorflow/python/platform/test.py
+++ b/tensorflow/python/platform/test.py
@@ -15,7 +15,7 @@
 
 """Testing.
 
-See the @{$python/test} guide.
+See the [Testing](https://tensorflow.org/api_guides/python/test) guide.
 
 Note: `tf.test.mock` is an alias to the python `mock` or `unittest.mock`
 depending on the python version.
diff --git a/tensorflow/python/pywrap_tfe.i b/tensorflow/python/pywrap_tfe.i
index 1b69e0d..157f234 100644
--- a/tensorflow/python/pywrap_tfe.i
+++ b/tensorflow/python/pywrap_tfe.i
@@ -63,6 +63,8 @@
 %rename("%s") TFE_DeleteContextOptions;
 %rename("%s") TFE_Py_TensorShapeSlice;
 %rename("%s") TFE_Py_TensorShapeOnDevice;
+%rename("%s") TFE_ContextStartStep;
+%rename("%s") TFE_ContextEndStep;
 
 %{
 #include "tensorflow/python/eager/pywrap_tfe.h"
diff --git a/tensorflow/python/saved_model/BUILD b/tensorflow/python/saved_model/BUILD
index 076f2d8..7a37eda 100644
--- a/tensorflow/python/saved_model/BUILD
+++ b/tensorflow/python/saved_model/BUILD
@@ -62,6 +62,7 @@
     srcs_version = "PY2AND3",
     deps = [
         ":constants",
+        ":utils",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:lib",
@@ -81,6 +82,7 @@
     srcs_version = "PY2AND3",
     deps = [
         ":constants",
+        ":utils",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:lib",
@@ -187,8 +189,10 @@
     ],
     srcs_version = "PY2AND3",
     deps = [
+        ":constants",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:lib",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:util",
     ],
diff --git a/tensorflow/python/saved_model/builder_impl.py b/tensorflow/python/saved_model/builder_impl.py
index 8c985a7..8e7f123 100644
--- a/tensorflow/python/saved_model/builder_impl.py
+++ b/tensorflow/python/saved_model/builder_impl.py
@@ -32,6 +32,7 @@
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging
 from tensorflow.python.saved_model import constants
+from tensorflow.python.saved_model import utils_impl as saved_model_utils
 from tensorflow.python.training import saver as tf_saver
 from tensorflow.python.util import compat
 from tensorflow.python.util.deprecation import deprecated_args
@@ -112,12 +113,8 @@
       tf_logging.info("No assets to write.")
       return
 
-    assets_destination_dir = os.path.join(
-        compat.as_bytes(self._export_dir),
-        compat.as_bytes(constants.ASSETS_DIRECTORY))
-
-    if not file_io.file_exists(assets_destination_dir):
-      file_io.recursive_create_dir(assets_destination_dir)
+    assets_destination_dir = saved_model_utils.get_or_create_assets_dir(
+        self._export_dir)
 
     # Copy each asset from source path to destination path.
     for asset_basename, asset_source_filepath in asset_filename_map.items():
@@ -409,16 +406,8 @@
     # Add assets and ops
     self._add_collections(assets_collection, main_op, None)
 
-    # Create the variables sub-directory, if it does not exist.
-    variables_dir = os.path.join(
-        compat.as_text(self._export_dir),
-        compat.as_text(constants.VARIABLES_DIRECTORY))
-    if not file_io.file_exists(variables_dir):
-      file_io.recursive_create_dir(variables_dir)
-
-    variables_path = os.path.join(
-        compat.as_text(variables_dir),
-        compat.as_text(constants.VARIABLES_FILENAME))
+    saved_model_utils.get_or_create_variables_dir(self._export_dir)
+    variables_path = saved_model_utils.get_variables_path(self._export_dir)
 
     saver = self._maybe_create_saver(saver)
 
diff --git a/tensorflow/python/saved_model/loader_impl.py b/tensorflow/python/saved_model/loader_impl.py
index 16077f5..e853610 100644
--- a/tensorflow/python/saved_model/loader_impl.py
+++ b/tensorflow/python/saved_model/loader_impl.py
@@ -31,6 +31,7 @@
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging
 from tensorflow.python.saved_model import constants
+from tensorflow.python.saved_model import utils_impl as saved_model_utils
 from tensorflow.python.training import saver as tf_saver
 from tensorflow.python.util import compat
 from tensorflow.python.util.tf_export import tf_export
@@ -203,10 +204,7 @@
         variables to be loaded are located.
     """
     self._export_dir = export_dir
-    self._variables_path = os.path.join(
-        compat.as_bytes(export_dir),
-        compat.as_bytes(constants.VARIABLES_DIRECTORY),
-        compat.as_bytes(constants.VARIABLES_FILENAME))
+    self._variables_path = saved_model_utils.get_variables_path(export_dir)
     self._saved_model = _parse_saved_model(export_dir)
 
   @property
diff --git a/tensorflow/python/saved_model/utils_impl.py b/tensorflow/python/saved_model/utils_impl.py
index cddce29..20ff34f 100644
--- a/tensorflow/python/saved_model/utils_impl.py
+++ b/tensorflow/python/saved_model/utils_impl.py
@@ -18,10 +18,15 @@
 from __future__ import division
 from __future__ import print_function
 
+import os
+
 from tensorflow.core.protobuf import meta_graph_pb2
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.lib.io import file_io
+from tensorflow.python.saved_model import constants
+from tensorflow.python.util import compat
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -84,3 +89,45 @@
         _get_tensor(tensor_info.coo_sparse.dense_shape_tensor_name))
   else:
     raise ValueError("Invalid TensorInfo.encoding: %s" % encoding)
+
+
+# Path helpers.
+
+
+def get_or_create_variables_dir(export_dir):
+  """Return variables sub-directory, or create one if it doesn't exist."""
+  variables_dir = get_variables_dir(export_dir)
+  if not file_io.file_exists(variables_dir):
+    file_io.recursive_create_dir(variables_dir)
+  return variables_dir
+
+
+def get_variables_dir(export_dir):
+  """Return variables sub-directory in the SavedModel."""
+  return os.path.join(
+      compat.as_text(export_dir),
+      compat.as_text(constants.VARIABLES_DIRECTORY))
+
+
+def get_variables_path(export_dir):
+  """Return the variables path, used as the prefix for checkpoint files."""
+  return os.path.join(
+      compat.as_text(get_variables_dir(export_dir)),
+      compat.as_text(constants.VARIABLES_FILENAME))
+
+
+def get_or_create_assets_dir(export_dir):
+  """Return assets sub-directory, or create one if it doesn't exist."""
+  assets_destination_dir = get_assets_dir(export_dir)
+
+  if not file_io.file_exists(assets_destination_dir):
+    file_io.recursive_create_dir(assets_destination_dir)
+
+  return assets_destination_dir
+
+
+def get_assets_dir(export_dir):
+  """Return path to asset directory in the SavedModel."""
+  return os.path.join(
+      compat.as_text(export_dir),
+      compat.as_text(constants.ASSETS_DIRECTORY))
diff --git a/tensorflow/python/summary/summary.py b/tensorflow/python/summary/summary.py
index 1421d27..fbae2b7 100644
--- a/tensorflow/python/summary/summary.py
+++ b/tensorflow/python/summary/summary.py
@@ -15,7 +15,7 @@
 
 """Tensor summaries for exporting information about a model.
 
-See the @{$python/summary} guide.
+See the [Summary](https://tensorflow.org/api_guides/python/summary) guide.
 """
 
 from __future__ import absolute_import
@@ -268,7 +268,7 @@
   @compatibility(eager)
   Not compatible with eager execution. To write TensorBoard
   summaries under eager execution, use `tf.contrib.summary` instead.
-  @end_compatbility
+  @end_compatibility
   """
   # pylint: enable=line-too-long
   if _context.executing_eagerly():
@@ -285,7 +285,7 @@
 
 
 @tf_export('summary.merge_all')
-def merge_all(key=_ops.GraphKeys.SUMMARIES, scope=None):
+def merge_all(key=_ops.GraphKeys.SUMMARIES, scope=None, name=None):
   """Merges all summaries collected in the default graph.
 
   Args:
@@ -304,7 +304,7 @@
   @compatibility(eager)
   Not compatible with eager execution. To write TensorBoard
   summaries under eager execution, use `tf.contrib.summary` instead.
-  @end_compatbility
+  @end_compatibility
   """
   if _context.executing_eagerly():
     raise RuntimeError(
@@ -314,7 +314,7 @@
   if not summary_ops:
     return None
   else:
-    return merge(summary_ops)
+    return merge(summary_ops, name=name)
 
 
 @tf_export('summary.get_summary_description')
@@ -336,7 +336,7 @@
   @compatibility(eager)
   Not compatible with eager execution. To write TensorBoard
   summaries under eager execution, use `tf.contrib.summary` instead.
-  @end_compatbility
+  @end_compatibility
   """
 
   if node_def.op != 'TensorSummary':
diff --git a/tensorflow/python/summary/writer/writer.py b/tensorflow/python/summary/writer/writer.py
index 60e96ee..16b8626 100644
--- a/tensorflow/python/summary/writer/writer.py
+++ b/tensorflow/python/summary/writer/writer.py
@@ -104,8 +104,8 @@
     and adds it to the event file.
 
     You can pass the result of evaluating any summary op, using
-    @{tf.Session.run} or
-    @{tf.Tensor.eval}, to this
+    `tf.Session.run` or
+    `tf.Tensor.eval`, to this
     function. Alternatively, you can pass a `tf.Summary` protocol
     buffer that you populate with your own data. The latter is
     commonly done to report evaluation results in event files.
@@ -352,7 +352,7 @@
     @compatibility(eager)
     `FileWriter` is not compatible with eager execution. To write TensorBoard
     summaries under eager execution, use `tf.contrib.summary` instead.
-    @end_compatbility
+    @end_compatibility
     """
     if context.executing_eagerly():
       raise RuntimeError(
diff --git a/tensorflow/python/tools/freeze_graph.py b/tensorflow/python/tools/freeze_graph.py
index 130fe70..c7f414c 100644
--- a/tensorflow/python/tools/freeze_graph.py
+++ b/tensorflow/python/tools/freeze_graph.py
@@ -59,6 +59,21 @@
 from tensorflow.python.training import saver as saver_lib
 
 
+def _has_no_variables(sess):
+  """Determines if the graph has any variables.
+
+  Args:
+    sess: TensorFlow Session.
+
+  Returns:
+    Bool. True if the graph contains no variable ops, False otherwise.
+  """
+  for op in sess.graph.get_operations():
+    if op.type.startswith("Variable") or op.type.endswith("VariableOp"):
+      return False
+  return True
+
+
 def freeze_graph_with_def_protos(input_graph_def,
                                  input_saver_def,
                                  input_checkpoint,
@@ -152,6 +167,11 @@
                 "from checkpoint files. Please pass in a SavedModel using "
                 "the flag --input_saved_model_dir.")
           return -1
+        # Models that have been frozen previously do not contain Variables.
+        elif _has_no_variables(sess):
+          print("No variables were found in this model. It is likely the model "
+                "was frozen previously. You cannot freeze a graph twice.")
+          return 0
         else:
           raise e
 
diff --git a/tensorflow/python/training/adagrad.py b/tensorflow/python/training/adagrad.py
index 6778f3c..3508b98 100644
--- a/tensorflow/python/training/adagrad.py
+++ b/tensorflow/python/training/adagrad.py
@@ -70,20 +70,24 @@
 
   def _create_slots(self, var_list):
     for v in var_list:
-      with ops.colocate_with(v):
-        dtype = v.dtype.base_dtype
-        if v.get_shape().is_fully_defined():
-          init = init_ops.constant_initializer(self._initial_accumulator_value,
-                                               dtype=dtype)
-        else:
-          # Use a Tensor instead of initializer if variable does not have static
-          # shape.
-          init_constant = gen_array_ops.fill(array_ops.shape(v),
-                                             self._initial_accumulator_value)
-          init = math_ops.cast(init_constant, dtype)
+      dtype = v.dtype.base_dtype
+      if v.get_shape().is_fully_defined():
+        init = init_ops.constant_initializer(self._initial_accumulator_value,
+                                             dtype=dtype)
+      else:
+        init = self._init_constant_op(v, dtype)
       self._get_or_make_slot_with_initializer(v, init, v.get_shape(), dtype,
                                               "accumulator", self._name)
 
+  def _init_constant_op(self, v, dtype):
+    def init():
+      # Use a Tensor instead of initializer if variable does not have
+      # static shape.
+      init_constant = gen_array_ops.fill(array_ops.shape(v),
+                                         self._initial_accumulator_value)
+      return math_ops.cast(init_constant, dtype)
+    return init
+
   def _prepare(self):
     learning_rate = self._call_if_callable(self._learning_rate)
     self._learning_rate_tensor = ops.convert_to_tensor(
diff --git a/tensorflow/python/training/adagrad_test.py b/tensorflow/python/training/adagrad_test.py
index c9aec33..4e634ff 100644
--- a/tensorflow/python/training/adagrad_test.py
+++ b/tensorflow/python/training/adagrad_test.py
@@ -302,6 +302,39 @@
       # Creating optimizer should cause no exception.
       adagrad.AdagradOptimizer(3.0, initial_accumulator_value=0.1)
 
+  def testDynamicShapeVariableWithCallableInit(self):
+    var0 = variable_scope.get_variable("var0",
+                                       initializer=constant_op.constant(1.),
+                                       validate_shape=False)
+    self.assertFalse(var0.shape.is_fully_defined())
+
+    grads0 = constant_op.constant(0.1, dtype=dtypes.float32)
+    learning_rate = lambda: 3.0
+
+    ada_opt = adagrad.AdagradOptimizer(
+        learning_rate, initial_accumulator_value=0.1, use_locking=True)
+
+    if not context.executing_eagerly():
+      ada_update = ada_opt.apply_gradients(
+          zip([grads0], [var0]))
+      self.evaluate(variables.global_variables_initializer())
+
+    # Fetch params to validate initial values
+    v0_val = self.evaluate([var0])
+    self.assertAllClose([1.0], v0_val)
+
+    # Run 3 steps of adagrad
+    for _ in range(3):
+      if not context.executing_eagerly():
+        self.evaluate(ada_update)
+      else:
+        ada_opt.apply_gradients(zip([grads0], [var0]))
+
+    # Validate updated params
+    v0_val = self.evaluate([var0])
+    self.assertAllCloseAccordingToType(
+        np.array([-1.6026098728179932]), v0_val)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/training/basic_session_run_hooks.py b/tensorflow/python/training/basic_session_run_hooks.py
index 4e8e505..7662562 100644
--- a/tensorflow/python/training/basic_session_run_hooks.py
+++ b/tensorflow/python/training/basic_session_run_hooks.py
@@ -28,9 +28,12 @@
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.util.event_pb2 import SessionLog
 from tensorflow.python.client import timeline
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import session_run_hook
@@ -40,6 +43,10 @@
 from tensorflow.python.util.tf_export import tf_export
 
 
+_HOOKS = "hooks"
+_STEPS_PER_RUN_VAR = "steps_per_run"
+
+
 class _HookTimer(object):
   """Base timer for determining when Hooks should trigger.
 
@@ -255,6 +262,116 @@
       self._log_tensors(values)
 
 
+def get_or_create_steps_per_run_variable():
+  """Gets or creates the steps_per_run variable.
+
+  In Estimator, the user-provided computation, the model_fn, is wrapped
+  inside a tf.while_loop for peak performance. The iterations of the loop are
+  specified by this variable, which adjusts its value on the CPU after each
+  device program execution and before the next execution.
+
+  The purpose of using a variable, rather than a constant, is to allow
+  Estimator adapt the device training iterations according to the final steps
+  specified by users. For example, if the user sets the steps_per_run as
+  4 and steps as 10 in Estimator.train(), the steps_per_run
+  variable will have the following value before each training run.
+
+      - 1st execution: steps_per_run = 4
+      - 2nd execution: steps_per_run = 4
+      - 3rd execution: steps_per_run = 2
+
+  As model_fn increases the global step once per train_op invocation, the global
+  step is 10 after all executions, matching the steps=10 inputs passed in by
+  users.
+
+  Returns:
+    A TF non-trainable resource variable.
+
+  Raises:
+    RuntimeError: If multiple steps_per_run variables were found.
+  """
+  graph = ops.get_default_graph()
+  collection_name = "{}_{}".format(_HOOKS, _STEPS_PER_RUN_VAR)
+  steps_per_run_vars = graph.get_collection(collection_name)
+  if len(steps_per_run_vars) == 1:
+    return steps_per_run_vars[0]
+  elif len(steps_per_run_vars) > 1:
+    raise RuntimeError("Multiple steps_per_run_var in collection.")
+
+  with variable_scope.variable_scope(_HOOKS, reuse=variable_scope.AUTO_REUSE):
+    return variable_scope.get_variable(
+        _STEPS_PER_RUN_VAR,
+        initializer=init_ops.ones_initializer(),
+        shape=[],
+        dtype=dtypes.int32,
+        trainable=False,
+        collections=[collection_name, ops.GraphKeys.LOCAL_VARIABLES],
+        use_resource=True)
+
+
+class _MultiStepStopAtStepHook(session_run_hook.SessionRunHook):
+  """Hook that requests stop at a specified step."""
+
+  def __init__(self, num_steps=None, last_step=None, steps_per_run=1):
+    """Initializes a `_MultiStepStopAtStepHook`.
+
+    This hook requests stop after either a number of steps have been
+    executed or a last step has been reached. Only one of the two options can be
+    specified.
+
+    If `num_steps` is specified, it indicates the number of steps to execute
+    after `begin()` is called. If instead `last_step` is specified, it
+    indicates the last step we want to execute, as passed to the `after_run()`
+    call.
+
+    In Estimator, the user-provided computation, the model_fn, is wrapped
+    inside a tf.while_loop for peak performance. The steps_per_run variable
+    determines the number of iterations of the loop before returning to the CPU.
+
+    Args:
+      num_steps: Number of steps to execute.
+      last_step: Step after which to stop.
+      steps_per_run: Number of steps executed per run call.
+
+    Raises:
+      ValueError: If one of the arguments is invalid.
+    """
+    if num_steps is None and last_step is None:
+      raise ValueError("One of num_steps or last_step must be specified.")
+    if num_steps is not None and last_step is not None:
+      raise ValueError("Only one of num_steps or last_step can be specified.")
+    if steps_per_run is None or steps_per_run < 1:
+      raise ValueError("steps_per_run should be greater than 0")
+    self._num_steps = num_steps
+    self._last_step = last_step
+    self._steps_per_run = steps_per_run
+
+  def begin(self):
+    self._global_step_tensor = training_util.get_global_step()
+    if self._global_step_tensor is None:
+      raise RuntimeError("Global step should be created to use StopAtStepHook.")
+    self._steps_per_run_variable = get_or_create_steps_per_run_variable()
+
+  def _update_steps_per_run_variable(self, global_step, session):
+    steps = min(self._last_step - global_step, self._steps_per_run)
+    self._steps_per_run_variable.load(steps, session=session)
+
+  def after_create_session(self, session, coord):
+    global_step = session.run(self._global_step_tensor)
+    if self._last_step is None:
+      self._last_step = global_step + self._num_steps
+    self._update_steps_per_run_variable(global_step, session)
+
+  def after_run(self, run_context, run_values):
+    # Global step cannot be retrieved via SessionRunArgs and before_run due to
+    # race condition in hook execution.
+    global_step = run_context.session.run(self._global_step_tensor)
+    if global_step >= self._last_step:
+      run_context.request_stop()
+    else:
+      self._update_steps_per_run_variable(global_step, run_context.session)
+
+
 @tf_export("train.StopAtStepHook")
 class StopAtStepHook(session_run_hook.SessionRunHook):
   """Hook that requests stop at a specified step."""
diff --git a/tensorflow/python/training/checkpoint_management.py b/tensorflow/python/training/checkpoint_management.py
index aaddc01..85f29043 100644
--- a/tensorflow/python/training/checkpoint_management.py
+++ b/tensorflow/python/training/checkpoint_management.py
@@ -19,16 +19,23 @@
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import os.path
 import re
+import time
 
 from google.protobuf import text_format
 
 from tensorflow.core.protobuf import saver_pb2
+from tensorflow.python.eager import context
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
 from tensorflow.python.lib.io import file_io
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import training_util
 from tensorflow.python.training.checkpoint_state_pb2 import CheckpointState
+from tensorflow.python.util import compat
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -51,7 +58,9 @@
 @tf_export("train.generate_checkpoint_state_proto")
 def generate_checkpoint_state_proto(save_dir,
                                     model_checkpoint_path,
-                                    all_model_checkpoint_paths=None):
+                                    all_model_checkpoint_paths=None,
+                                    all_model_checkpoint_timestamps=None,
+                                    last_preserved_timestamp=None):
   """Generates a checkpoint state proto.
 
   Args:
@@ -61,11 +70,20 @@
       checkpoints, sorted from oldest to newest.  If this is a non-empty list,
       the last element must be equal to model_checkpoint_path.  These paths
       are also saved in the CheckpointState proto.
-
+    all_model_checkpoint_timestamps: A list of floats, indicating the number of
+      seconds since the Epoch when each checkpoint was generated.
+    last_preserved_timestamp: A float, indicating the number of seconds since
+      the Epoch when the last preserved checkpoint was written, e.g. due to a
+      `keep_checkpoint_every_n_hours` parameter (see
+      `tf.contrib.checkpoint.CheckpointManager` for an implementation).
   Returns:
     CheckpointState proto with model_checkpoint_path and
     all_model_checkpoint_paths updated to either absolute paths or
     relative paths to the current save_dir.
+
+  Raises:
+    ValueError: If `all_model_checkpoint_timestamps` was provided but its length
+      does not match `all_model_checkpoint_paths`.
   """
   if all_model_checkpoint_paths is None:
     all_model_checkpoint_paths = []
@@ -76,6 +94,14 @@
                  model_checkpoint_path)
     all_model_checkpoint_paths.append(model_checkpoint_path)
 
+  if (all_model_checkpoint_timestamps
+      and (len(all_model_checkpoint_timestamps)
+           != len(all_model_checkpoint_paths))):
+    raise ValueError(
+        ("Checkpoint timestamps, if provided, must match checkpoint paths (got "
+         "paths %s and timestamps %s)")
+        % (all_model_checkpoint_paths, all_model_checkpoint_timestamps))
+
   # Relative paths need to be rewritten to be relative to the "save_dir"
   # if model_checkpoint_path already contains "save_dir".
   if not os.path.isabs(save_dir):
@@ -88,7 +114,9 @@
 
   coord_checkpoint_proto = CheckpointState(
       model_checkpoint_path=model_checkpoint_path,
-      all_model_checkpoint_paths=all_model_checkpoint_paths)
+      all_model_checkpoint_paths=all_model_checkpoint_paths,
+      all_model_checkpoint_timestamps=all_model_checkpoint_timestamps,
+      last_preserved_timestamp=last_preserved_timestamp)
 
   return coord_checkpoint_proto
 
@@ -97,7 +125,9 @@
 def update_checkpoint_state(save_dir,
                             model_checkpoint_path,
                             all_model_checkpoint_paths=None,
-                            latest_filename=None):
+                            latest_filename=None,
+                            all_model_checkpoint_timestamps=None,
+                            last_preserved_timestamp=None):
   """Updates the content of the 'checkpoint' file.
 
   This updates the checkpoint file containing a CheckpointState
@@ -112,7 +142,13 @@
       are also saved in the CheckpointState proto.
     latest_filename: Optional name of the checkpoint file.  Default to
       'checkpoint'.
-
+    all_model_checkpoint_timestamps: Optional list of timestamps (floats,
+      seconds since the Epoch) indicating when the checkpoints in
+      `all_model_checkpoint_paths` were created.
+    last_preserved_timestamp: A float, indicating the number of seconds since
+      the Epoch when the last preserved checkpoint was written, e.g. due to a
+      `keep_checkpoint_every_n_hours` parameter (see
+      `tf.contrib.checkpoint.CheckpointManager` for an implementation).
   Raises:
     RuntimeError: If any of the model checkpoint paths conflict with the file
       containing CheckpointSate.
@@ -122,14 +158,18 @@
       model_checkpoint_path=model_checkpoint_path,
       all_model_checkpoint_paths=all_model_checkpoint_paths,
       latest_filename=latest_filename,
-      save_relative_paths=False)
+      save_relative_paths=False,
+      all_model_checkpoint_timestamps=all_model_checkpoint_timestamps,
+      last_preserved_timestamp=last_preserved_timestamp)
 
 
 def update_checkpoint_state_internal(save_dir,
                                      model_checkpoint_path,
                                      all_model_checkpoint_paths=None,
                                      latest_filename=None,
-                                     save_relative_paths=False):
+                                     save_relative_paths=False,
+                                     all_model_checkpoint_timestamps=None,
+                                     last_preserved_timestamp=None):
   """Updates the content of the 'checkpoint' file.
 
   This updates the checkpoint file containing a CheckpointState
@@ -146,6 +186,13 @@
       'checkpoint'.
     save_relative_paths: If `True`, will write relative paths to the checkpoint
       state file.
+    all_model_checkpoint_timestamps: Optional list of timestamps (floats,
+      seconds since the Epoch) indicating when the checkpoints in
+      `all_model_checkpoint_paths` were created.
+    last_preserved_timestamp: A float, indicating the number of seconds since
+      the Epoch when the last preserved checkpoint was written, e.g. due to a
+      `keep_checkpoint_every_n_hours` parameter (see
+      `tf.contrib.checkpoint.CheckpointManager` for an implementation).
 
   Raises:
     RuntimeError: If any of the model checkpoint paths conflict with the file
@@ -168,12 +215,16 @@
     ckpt = generate_checkpoint_state_proto(
         save_dir,
         rel_model_checkpoint_path,
-        all_model_checkpoint_paths=rel_all_model_checkpoint_paths)
+        all_model_checkpoint_paths=rel_all_model_checkpoint_paths,
+        all_model_checkpoint_timestamps=all_model_checkpoint_timestamps,
+        last_preserved_timestamp=last_preserved_timestamp)
   else:
     ckpt = generate_checkpoint_state_proto(
         save_dir,
         model_checkpoint_path,
-        all_model_checkpoint_paths=all_model_checkpoint_paths)
+        all_model_checkpoint_paths=all_model_checkpoint_paths,
+        all_model_checkpoint_timestamps=all_model_checkpoint_timestamps,
+        last_preserved_timestamp=last_preserved_timestamp)
 
   if coord_checkpoint_filename == ckpt.model_checkpoint_path:
     raise RuntimeError("Save path '%s' conflicts with path used for "
@@ -404,3 +455,227 @@
   basename = re.sub(r"-[\d\?]+-of-\d+$", "", checkpoint_filename)
   suffixed_filename = ".".join([basename, meta_graph_suffix])
   return suffixed_filename
+
+
+# TODO(allenl): Allow tf.keras.Model instances in the constructor directly?
+class CheckpointManager(object):
+  """Deletes old checkpoints.
+
+  Example usage:
+  ```python
+  import tensorflow as tf
+  checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)
+  manager = tf.contrib.checkpoint.CheckpointManager(
+      checkpoint, directory="/tmp/model", max_to_keep=5)
+  status = checkpoint.restore(manager.latest_checkpoint)
+  while True:
+    # train
+    manager.save()
+  ```
+
+  `CheckpointManager` preserves its own state across instantiations (see the
+  `__init__` documentation for details). Only one should be active in a
+  particular directory at a time.
+  """
+
+  def __init__(self, checkpoint, directory,
+               max_to_keep, keep_checkpoint_every_n_hours=None):
+    """Configure a `CheckpointManager` for use in `directory`.
+
+    If a `CheckpointManager` was previously used in `directory`, its
+    state will be restored. This includes the list of managed checkpoints and
+    the timestamp bookkeeping necessary to support
+    `keep_checkpoint_every_n_hours`. The behavior of the new `CheckpointManager`
+    will be the same as the previous `CheckpointManager`, including cleaning up
+    existing checkpoints if appropriate.
+
+    Checkpoints are only considered for deletion just after a new checkpoint has
+    been added. At that point, `max_to_keep` checkpoints will remain in an
+    "active set". Once a checkpoint is preserved by
+    `keep_checkpoint_every_n_hours` it will not be deleted by this
+    `CheckpointManager` or any future `CheckpointManager` instantiated in
+    `directory` (regardless of the new setting of
+    `keep_checkpoint_every_n_hours`). The `max_to_keep` checkpoints in the
+    active set may be deleted by this `CheckpointManager` or a future
+    `CheckpointManager` instantiated in `directory` (subject to its
+    `max_to_keep` and `keep_checkpoint_every_n_hours` settings).
+
+    Args:
+      checkpoint: The `tf.train.Checkpoint` instance to save and manage
+        checkpoints for.
+      directory: The path to a directory in which to write checkpoints. A
+        special file named "checkpoint" is also written to this directory (in a
+        human-readable text format) which contains the state of the
+        `CheckpointManager`.
+      max_to_keep: An integer, the number of checkpoints to keep. Unless
+        preserved by `keep_checkpoint_every_n_hours`, checkpoints will be
+        deleted from the active set, oldest first, until only `max_to_keep`
+        checkpoints remain.
+      keep_checkpoint_every_n_hours: Upon removal from the active set, a
+        checkpoint will be preserved if it has been at least
+        `keep_checkpoint_every_n_hours` since the last preserved checkpoint. The
+        default setting of `None` does not preserve any checkpoints in this way.
+
+    Raises:
+      ValueError: If `max_to_keep` is not a positive integer.
+    """
+    self._checkpoint = checkpoint
+    self._save_counter_assign = None
+    if not max_to_keep or max_to_keep < 0:
+      raise ValueError(
+          "Expected a positive integer for `max_to_keep`, got %d."
+          % (max_to_keep,))
+    self._max_to_keep = max_to_keep
+    self._keep_checkpoint_every_n_hours = keep_checkpoint_every_n_hours
+    self._directory = directory
+    self._checkpoint_prefix = os.path.join(directory, "ckpt")
+    recovered_state = get_checkpoint_state(directory)
+    current_clock = time.time()
+    self._maybe_delete = collections.OrderedDict()
+    if recovered_state is None:
+      self._latest_checkpoint = None
+      self._last_preserved_timestamp = current_clock
+    else:
+      self._latest_checkpoint = recovered_state.model_checkpoint_path
+      self._last_preserved_timestamp = recovered_state.last_preserved_timestamp
+      if current_clock < self._last_preserved_timestamp:
+        # Time seems to have reversed itself. In addition to this warning, we'll
+        # min() saved checkpoint timestamps with the current time to ensure that
+        # old checkpoints don't get deleted accidentally.
+        logging.warning(
+            ("time.time() returned a value %f seconds behind the last "
+             "preserved checkpoint timestamp.")
+            % (self._last_preserved_timestamp - current_clock,))
+        self._last_preserved_timestamp = current_clock
+      all_timestamps = recovered_state.all_model_checkpoint_timestamps
+      all_paths = recovered_state.all_model_checkpoint_paths
+      del recovered_state  # Uses modified values from now on
+      if not all_timestamps:
+        all_timestamps = [self._last_preserved_timestamp] * len(all_paths)
+
+      for filename, timestamp in zip(all_paths, all_timestamps):
+        timestamp = min(timestamp, current_clock)
+        if timestamp > self._last_preserved_timestamp:
+          self._maybe_delete[filename] = timestamp
+
+  @property
+  def latest_checkpoint(self):
+    """The prefix of the most recent checkpoint in `directory`.
+
+    Equivalent to `tf.train.latest_checkpoint(directory)` where `directory` is
+    the constructor argument to `CheckpointManager`.
+
+    Suitable for passing to `tf.train.Checkpoint.restore` to resume training.
+
+    Returns:
+      The checkpoint prefix. If there are no checkpoints, returns `None`.
+    """
+    return self._latest_checkpoint
+
+  @property
+  def checkpoints(self):
+    """A list of managed checkpoints.
+
+    Note that checkpoints saved due to `keep_checkpoint_every_n_hours` will not
+    show up in this list (to avoid ever-growing filename lists).
+
+    Returns:
+      A list of filenames, sorted from oldest to newest.
+    """
+    return list(self._maybe_delete.keys())
+
+  def _sweep(self):
+    """Deletes or preserves managed checkpoints."""
+    while len(self._maybe_delete) > self._max_to_keep:
+      filename, timestamp = self._maybe_delete.popitem(last=False)
+      # Even if we're keeping this checkpoint due to
+      # keep_checkpoint_every_n_hours, we won't reference it to avoid
+      # infinitely-growing CheckpointState protos.
+      if (self._keep_checkpoint_every_n_hours
+          and (timestamp - self._keep_checkpoint_every_n_hours * 3600.
+               >= self._last_preserved_timestamp)):
+        self._last_preserved_timestamp = timestamp
+        continue
+      remove_checkpoint(filename)
+
+  def _record_state(self):
+    """Saves the `CheckpointManager`'s state in `directory`."""
+    filenames, timestamps = zip(*self._maybe_delete.items())
+    update_checkpoint_state_internal(
+        self._directory,
+        model_checkpoint_path=self.latest_checkpoint,
+        all_model_checkpoint_paths=filenames,
+        all_model_checkpoint_timestamps=timestamps,
+        last_preserved_timestamp=self._last_preserved_timestamp,
+        save_relative_paths=True)
+
+  @property
+  def _prefix(self):
+    """A common prefix for all checkpoints saved with this manager.
+
+    For example, if `directory` (a constructor argument) were `"/tmp/tf-model"`,
+    `prefix` would be `"/tmp/tf-model/ckpt"` and checkpoints would generally be
+    numbered `"/tmp/tf-model/ckpt-1"`, `"/tmp/tf-model/ckpt-2"`, and so on. Each
+    checkpoint has several associated files
+    (e.g. `"/tmp/tf-model/ckpt-2.index"`).
+
+    Returns:
+      A string prefix.
+    """
+    return self._checkpoint_prefix
+
+  def save(self, session=None, checkpoint_number=None):
+    """Creates a new checkpoint and manages it.
+
+    Args:
+      session: The session to evaluate variables in. Ignored when executing
+        eagerly. If not provided when graph building, the default session is
+        used.
+      checkpoint_number: An optional integer, or an integer-dtype `Variable` or
+        `Tensor`, used to number the checkpoint. If `None` (default),
+        checkpoints are numbered using `checkpoint.save_counter`. Even if
+        `checkpoint_number` is provided, `save_counter` is still incremented. A
+        user-provided `checkpoint_number` is not incremented even if it is a
+        `Variable`.
+
+    Returns:
+      The path to the new checkpoint. It is also recorded in the `checkpoints`
+      and `latest_checkpoint` properties.
+    """
+    # Save counter logic duplicated from tf.train.Checkpoint, soon to diverge
+    # slightly with a custom numbering option.
+    if context.executing_eagerly():
+      save_counter = self._checkpoint.save_counter
+      save_counter.assign_add(1)
+    else:
+      if session is None:
+        session = ops.get_default_session()
+
+      def _initializing_creator(next_creator, **kwargs):
+        """Initialize the save counter if it has been newly created."""
+        v = next_creator(**kwargs)
+        session.run(v.initializer)
+        return v
+
+      with variable_scope.variable_creator_scope(_initializing_creator):
+        save_counter = self._checkpoint.save_counter
+      if self._save_counter_assign is None:
+        self._save_counter_assign = save_counter.assign_add(1, read_value=False)
+      session.run(self._save_counter_assign)
+    if checkpoint_number is None:
+      checkpoint_number = save_counter
+    if not isinstance(checkpoint_number, compat.integral_types):
+      checkpoint_number = training_util.global_step(
+          sess=session, global_step_tensor=checkpoint_number)
+    prefix = "%s-%d" % (self._prefix, checkpoint_number)
+    save_path = self._checkpoint.write(prefix)
+    timestamp = time.time()
+    # If this is an overwritten checkpoint we were previously tracking, delete
+    # and reinsert it to make sure it goes to the end of the queue.
+    if save_path in self._maybe_delete:
+      del self._maybe_delete[save_path]
+    self._maybe_delete[save_path] = timestamp
+    self._latest_checkpoint = save_path
+    self._sweep()
+    self._record_state()
+    return save_path
diff --git a/tensorflow/python/training/checkpoint_management_test.py b/tensorflow/python/training/checkpoint_management_test.py
index 4b31d0c..1e2827d 100644
--- a/tensorflow/python/training/checkpoint_management_test.py
+++ b/tensorflow/python/training/checkpoint_management_test.py
@@ -26,14 +26,18 @@
 from google.protobuf import text_format
 
 from tensorflow.core.protobuf import saver_pb2
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops as ops_lib
+from tensorflow.python.framework import test_util
 from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpoint_management
 from tensorflow.python.training import saver as saver_module
 from tensorflow.python.training.checkpoint_state_pb2 import CheckpointState
+from tensorflow.python.training.checkpointable import util
 
 
 class LatestCheckpointWithRelativePaths(test.TestCase):
@@ -312,5 +316,202 @@
           self.assertFalse(checkpoint_management.checkpoint_exists(ckpt_prefix))
 
 
+class CheckpointManagerTest(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDeletion(self):
+    checkpoint = util.Checkpoint()
+    manager = checkpoint_management.CheckpointManager(
+        checkpoint, self.get_temp_dir(), max_to_keep=3)
+    first_path = manager.save()
+    second_path = manager.save()
+    third_path = manager.save()
+    fourth_path = manager.save()
+    self.assertTrue(checkpoint_management.checkpoint_exists(fourth_path))
+    self.assertTrue(checkpoint_management.checkpoint_exists(third_path))
+    self.assertTrue(checkpoint_management.checkpoint_exists(second_path))
+    self.assertFalse(checkpoint_management.checkpoint_exists(first_path))
+
+  @test_util.run_in_graph_and_eager_modes
+  @test.mock.patch.object(checkpoint_management, "time")
+  def testSaveRestoreState(self, mock_time):
+    directory = self.get_temp_dir()
+    mock_time.time.return_value = 3.
+    checkpoint = util.Checkpoint()
+    first_manager = checkpoint_management.CheckpointManager(
+        checkpoint, directory, max_to_keep=2)
+    first_time = 10000.
+    first_name = os.path.join(directory, "ckpt-1")
+    mock_time.time.return_value = first_time
+    first_manager.save()
+    state = checkpoint_management.get_checkpoint_state(directory)
+    self.assertEqual([first_time], state.all_model_checkpoint_timestamps)
+    self.assertEqual(3., state.last_preserved_timestamp)
+    second_time = first_time + 3610.
+    second_name = os.path.join(directory, "ckpt-2")
+    mock_time.time.return_value = second_time
+    first_manager.save()
+    state = checkpoint_management.get_checkpoint_state(directory)
+    self.assertEqual([first_time, second_time],
+                     state.all_model_checkpoint_timestamps)
+    self.assertEqual(3., state.last_preserved_timestamp)
+    self.assertEqual([first_name, second_name], first_manager.checkpoints)
+    self.assertEqual(second_name, first_manager.latest_checkpoint)
+    del first_manager
+
+    second_manager = checkpoint_management.CheckpointManager(
+        checkpoint, directory,
+        max_to_keep=2, keep_checkpoint_every_n_hours=1.5)
+    self.assertEqual([first_name, second_name], second_manager.checkpoints)
+    self.assertEqual(second_name, second_manager.latest_checkpoint)
+    third_name = os.path.join(directory, "ckpt-3")
+    third_time = second_time + 3600. * 0.2
+    mock_time.time.return_value = third_time
+    second_manager.save()
+    self.assertTrue(checkpoint_management.checkpoint_exists(first_name))
+    self.assertTrue(checkpoint_management.checkpoint_exists(second_name))
+    self.assertEqual([second_name, third_name],
+                     second_manager.checkpoints)
+    state = checkpoint_management.get_checkpoint_state(directory)
+    self.assertEqual(first_time, state.last_preserved_timestamp)
+    fourth_time = third_time + 3600. * 0.5
+    mock_time.time.return_value = fourth_time
+    fourth_name = os.path.join(directory, "ckpt-4")
+    second_manager.save()
+    self.assertTrue(checkpoint_management.checkpoint_exists(first_name))
+    self.assertFalse(checkpoint_management.checkpoint_exists(second_name))
+    self.assertEqual([third_name, fourth_name],
+                     second_manager.checkpoints)
+    fifth_time = fourth_time + 3600. * 0.5
+    mock_time.time.return_value = fifth_time
+    fifth_name = os.path.join(directory, "ckpt-5")
+    second_manager.save()
+    self.assertEqual([fourth_name, fifth_name],
+                     second_manager.checkpoints)
+    state = checkpoint_management.get_checkpoint_state(directory)
+    self.assertEqual(first_time, state.last_preserved_timestamp)
+    del second_manager
+    third_manager = checkpoint_management.CheckpointManager(
+        checkpoint, directory,
+        max_to_keep=2, keep_checkpoint_every_n_hours=1.5)
+    self.assertEqual(fifth_name, third_manager.latest_checkpoint)
+    mock_time.time.return_value += 10.
+    third_manager.save()
+    sixth_name = os.path.join(directory, "ckpt-6")
+    state = checkpoint_management.get_checkpoint_state(directory)
+    self.assertEqual(fourth_time, state.last_preserved_timestamp)
+    self.assertTrue(checkpoint_management.checkpoint_exists(first_name))
+    self.assertTrue(checkpoint_management.checkpoint_exists(fourth_name))
+    self.assertTrue(checkpoint_management.checkpoint_exists(fifth_name))
+    self.assertTrue(checkpoint_management.checkpoint_exists(sixth_name))
+    self.assertFalse(checkpoint_management.checkpoint_exists(second_name))
+    self.assertFalse(checkpoint_management.checkpoint_exists(third_name))
+    self.assertEqual([fifth_name, sixth_name],
+                     third_manager.checkpoints)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testContinueFromUnmanaged(self):
+    directory = self.get_temp_dir()
+    prefix = os.path.join(directory, "unusual_prefix")
+    checkpoint = util.Checkpoint()
+    first_path = checkpoint.save(prefix)
+    second_path = checkpoint.save(prefix)
+    del checkpoint
+    checkpoint = util.Checkpoint()
+    manager = checkpoint_management.CheckpointManager(
+        checkpoint, directory, max_to_keep=2)
+    checkpoint.restore(manager.latest_checkpoint).run_restore_ops()
+    self.assertEqual(2, self.evaluate(checkpoint.save_counter))
+    third_path = manager.save()
+    self.assertEqual([third_path], manager.checkpoints)
+    fourth_path = manager.save()
+    self.assertEqual([third_path, fourth_path],
+                     manager.checkpoints)
+    fifth_path = manager.save()
+    self.assertEqual([fourth_path, fifth_path],
+                     manager.checkpoints)
+    self.assertTrue(checkpoint_management.checkpoint_exists(first_path))
+    self.assertTrue(checkpoint_management.checkpoint_exists(second_path))
+    self.assertFalse(checkpoint_management.checkpoint_exists(third_path))
+    self.assertTrue(checkpoint_management.checkpoint_exists(fourth_path))
+    self.assertTrue(checkpoint_management.checkpoint_exists(fifth_path))
+
+  @test_util.run_in_graph_and_eager_modes
+  @test.mock.patch.object(checkpoint_management, "time")
+  def testClockReset(self, mock_time):
+    directory = self.get_temp_dir()
+    mock_time.time.return_value = 10000.
+    checkpoint = util.Checkpoint()
+    first_manager = checkpoint_management.CheckpointManager(
+        checkpoint, directory, max_to_keep=1, keep_checkpoint_every_n_hours=1.)
+    first_path = first_manager.save()
+    mock_time.time.return_value += 3600.
+    second_path = first_manager.save()
+    mock_time.time.return_value += 3600.
+    third_path = first_manager.save()
+    self.assertFalse(checkpoint_management.checkpoint_exists(first_path))
+    self.assertTrue(checkpoint_management.checkpoint_exists(second_path))
+    self.assertTrue(checkpoint_management.checkpoint_exists(third_path))
+    self.assertEqual([third_path], first_manager.checkpoints)
+    state = checkpoint_management.get_checkpoint_state(directory)
+    self.assertEqual(13600., state.last_preserved_timestamp)
+    # Set the clock back in time
+    mock_time.time.return_value = 5000.
+    del first_manager
+    with test.mock.patch.object(logging, "warning") as mock_log:
+      second_manager = checkpoint_management.CheckpointManager(
+          checkpoint, directory, max_to_keep=1)
+      self.assertRegexpMatches(
+          str(mock_log.call_args),
+          "behind the last preserved checkpoint timestamp")
+    # We should err on the side of keeping checkpoints around when we're not
+    # sure whether they were preserved or not due to clock funkiness.
+    self.assertTrue(checkpoint_management.checkpoint_exists(second_path))
+    # We know about the existing checkpoints, but they'll never be deleted and
+    # so won't go in the CheckpointState proto on save.
+    self.assertEqual(third_path, second_manager.latest_checkpoint)
+    self.assertEqual([], second_manager.checkpoints)
+    mock_time.time.return_value += 10.
+    fourth_path = second_manager.save()
+    self.assertTrue(checkpoint_management.checkpoint_exists(second_path))
+    self.assertTrue(checkpoint_management.checkpoint_exists(third_path))
+    self.assertEqual(fourth_path, second_manager.latest_checkpoint)
+    self.assertEqual([fourth_path], second_manager.checkpoints)
+    mock_time.time.return_value += 10.
+    fifth_path = second_manager.save()
+    self.assertTrue(checkpoint_management.checkpoint_exists(second_path))
+    self.assertTrue(checkpoint_management.checkpoint_exists(third_path))
+    self.assertEqual([fifth_path], second_manager.checkpoints)
+    state = checkpoint_management.get_checkpoint_state(directory)
+    self.assertEqual(5000., state.last_preserved_timestamp)
+    self.assertEqual([5020.],
+                     state.all_model_checkpoint_timestamps)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testCustomNumbering(self):
+    directory = self.get_temp_dir()
+    step = variables.Variable(0, dtype=dtypes.int64)
+    checkpoint = util.Checkpoint(step=step)
+    manager = checkpoint_management.CheckpointManager(
+        checkpoint, directory, max_to_keep=2)
+    self.evaluate(step.initializer)
+    for i in range(5):
+      path = manager.save(checkpoint_number=step)
+      expected_suffix = "-%d" % (2 * i,)
+      if not path.endswith(expected_suffix):
+        self.fail("%s should have suffix %s" % (path, expected_suffix))
+      self.evaluate(step.assign_add(2))
+    self.assertEqual(5, self.evaluate(checkpoint.save_counter))
+    # Test regular integers
+    last_path = manager.save(checkpoint_number=32)
+    self.assertIn("-32", last_path)
+    self.assertEqual(last_path, manager.latest_checkpoint)
+    self.assertEqual(
+        last_path, checkpoint_management.latest_checkpoint(directory))
+    state = checkpoint_management.get_checkpoint_state(directory)
+    # Only the most recent two checkpoints are saved
+    self.assertEqual([path, last_path], state.all_model_checkpoint_paths)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/training/checkpoint_state.proto b/tensorflow/python/training/checkpoint_state.proto
index 9172a5c..704f7fd 100644
--- a/tensorflow/python/training/checkpoint_state.proto
+++ b/tensorflow/python/training/checkpoint_state.proto
@@ -4,8 +4,6 @@
 option cc_enable_arenas = true;
 
 // Protocol buffer representing the checkpoint state.
-//
-// TODO(touts): Add other attributes as needed.
 message CheckpointState {
   // Path to the most-recent model checkpoint.
   string model_checkpoint_path = 1;
@@ -15,4 +13,10 @@
   // Note that the value of model_checkpoint_path should be the last item in
   // this list.
   repeated string all_model_checkpoint_paths = 2;
+  // Unix timestamps corresponding to all_model_checkpoint_paths, indicating
+  // when each checkpoint was created.
+  repeated double all_model_checkpoint_timestamps = 3;
+  // Unix timestamp indicating the creation time for the last preserved
+  // checkpoint.
+  double last_preserved_timestamp = 4;
 }
diff --git a/tensorflow/python/training/checkpoint_utils.py b/tensorflow/python/training/checkpoint_utils.py
index 9b72b09..e611817 100644
--- a/tensorflow/python/training/checkpoint_utils.py
+++ b/tensorflow/python/training/checkpoint_utils.py
@@ -29,7 +29,7 @@
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpoint_management
-from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.training import saver
 from tensorflow.python.util.tf_export import tf_export
 
@@ -180,10 +180,10 @@
     tf.errors.OpError: If missing checkpoints or tensors in checkpoints.
     ValueError: If missing variables in current graph.
   """
-  if distribute_lib.get_cross_tower_context():
+  if distribution_strategy_context.get_cross_tower_context():
     _init_from_checkpoint(None, ckpt_dir_or_file, assignment_map)
   else:
-    distribute_lib.get_tower_context().merge_call(
+    distribution_strategy_context.get_tower_context().merge_call(
         _init_from_checkpoint, ckpt_dir_or_file, assignment_map)
 
 
diff --git a/tensorflow/python/training/checkpointable/BUILD b/tensorflow/python/training/checkpointable/BUILD
index 8a289b3..d26932c 100644
--- a/tensorflow/python/training/checkpointable/BUILD
+++ b/tensorflow/python/training/checkpointable/BUILD
@@ -101,15 +101,26 @@
     srcs_version = "PY2AND3",
     deps = [
         ":base",
+        ":data_structures",
         ":tracking",
+        "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
+        "//tensorflow/python:checkpoint_management",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:init_ops",
         "//tensorflow/python:io_ops_gen",
-        "//tensorflow/python:ops",
+        "//tensorflow/python:pywrap_tensorflow",
         "//tensorflow/python:saveable_object",
+        "//tensorflow/python:saver",
+        "//tensorflow/python:session",
+        "//tensorflow/python:tensor_shape",
         "//tensorflow/python:util",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
         "//tensorflow/python/eager:context",
     ],
 )
@@ -118,10 +129,7 @@
     name = "util_test",
     srcs = ["util_test.py"],
     srcs_version = "PY2AND3",
-    tags = [
-        "no_windows",  # TODO: needs investigation on Windows
-        "notsan",  # b/74395663
-    ],
+    tags = ["notsan"],  # b/74395663
     deps = [
         ":base",
         ":tracking",
diff --git a/tensorflow/python/training/checkpointable/base.py b/tensorflow/python/training/checkpointable/base.py
index 66837ee..9189d8f 100644
--- a/tensorflow/python/training/checkpointable/base.py
+++ b/tensorflow/python/training/checkpointable/base.py
@@ -22,6 +22,7 @@
 import json
 import weakref
 
+from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -79,10 +80,6 @@
       self.wrapped_value.set_shape(shape)
     self._checkpoint_position = checkpoint_position
 
-  @property
-  def __class__(self):
-    return (self.wrapped_value.__class__, CheckpointInitialValue)
-
   def __getattr__(self, attr):
     try:
       return getattr(self.wrapped_value, attr)
@@ -97,14 +94,17 @@
 class PythonStringStateSaveable(saveable_object.SaveableObject):
   """Saves Python state in a checkpoint."""
 
-  def __init__(self, name, state_callback):
+  def __init__(self, name, state_callback, restore_callback=None):
     """Configure saving.
 
     Args:
       name: The checkpoint key to write to.
       state_callback: A function taking no arguments which returns a
         string. This function is run every time a checkpoint is written.
+      restore_callback: A function taking a Python string, used to restore
+        state. Optional; defaults to doing nothing.
     """
+    self._restore_callback = restore_callback
     if context.executing_eagerly():
       self._save_string = (
           lambda: constant_op.constant(state_callback(), dtype=dtypes.string))
@@ -117,9 +117,14 @@
     super(PythonStringStateSaveable, self).__init__(
         self._save_string, [spec], name)
 
+  def python_restore(self, restored_strings):
+    """Called to restore Python state."""
+    if self._restore_callback:
+      restored, = restored_strings
+      self._restore_callback(restored)
+
   def restore(self, restored_tensors, restored_shapes):
-    # TODO(allenl): Add a Python hook for state coming out of a checkpoint
-    # (currently PythonStringStateSaveable is write-only).
+    """Called to restore TensorFlow state (nothing to do)."""
     return control_flow_ops.no_op()
 
 
@@ -231,7 +236,7 @@
         with ops.device("/cpu:0"):
           # Run the restore itself on the CPU.
           value, = io_ops.restore_v2(
-              prefix=self._checkpoint.save_path,
+              prefix=self._checkpoint.save_path_tensor,
               tensor_names=[checkpoint_key],
               shape_and_slices=[""],
               dtypes=[base_type],
@@ -240,6 +245,76 @@
         value_tensors[serialized_tensor.name] = array_ops.identity(value)
       return value_tensors
 
+  def _gather_ops_or_named_saveables(self):
+    """Looks up or creates SaveableObjects which don't have cached ops."""
+    saveables = self.checkpointable._gather_saveables_for_checkpoint()  # pylint: disable=protected-access
+    # Name saveables based on the name this object had when it was checkpointed.
+    named_saveables = {}
+    python_saveables = []
+    existing_restore_ops = []
+    for serialized_tensor in self.object_proto.attributes:
+      if context.executing_eagerly():
+        existing_op = None
+      else:
+        existing_op = self._checkpoint.restore_ops_by_name.get(
+            serialized_tensor.checkpoint_key, None)
+      if existing_op is not None:
+        existing_restore_ops.append(existing_op)
+        continue
+
+      # Only if we don't have cached ops for this SaveableObject, we'll see if
+      # the SaveableObject itself has been cached. If not, we'll make it, and
+      # either way we'll extract new ops from it (or if it has Python state to
+      # restore, we'll run that).
+      if self._checkpoint.saveable_object_cache is None:
+        # No SaveableObject caching when executing eagerly.
+        saveable = None
+      else:
+        # If we've already created and cached a SaveableObject for this
+        # attribute, we can re-use it to avoid re-creating some ops when graph
+        # building.
+        saveable_list = self._checkpoint.saveable_object_cache.get(
+            self.checkpointable, {}).get(serialized_tensor.name, (None,))
+        if len(saveable_list) == 1:
+          # Almost every attribute will have exactly one SaveableObject.
+          saveable, = saveable_list
+        else:
+          # Don't use cached SaveableObjects for partitioned variables, which is
+          # the only case where we'd have a list of SaveableObjects. Op caching
+          # will catch them.
+          saveable = None
+      if saveable is not None:
+        # The name of this attribute has changed, so we need to re-generate
+        # the SaveableObject.
+        if serialized_tensor.checkpoint_key not in saveable.name:
+          saveable = None
+          del self._checkpoint.saveable_object_cache[self.checkpointable]
+          break
+      if saveable is None:
+        # If there was no cached SaveableObject, we should check if the Python
+        # object has the attribute.
+        saveable_factory = saveables.get(serialized_tensor.name, None)
+        if saveable_factory is None:
+          # Purposefully does not throw an exception if attributes have been
+          # added or deleted. Stores unused attributes so an exception can be
+          # raised if the user decides to check that everything in the
+          # checkpoint was loaded.
+          self._checkpoint.unused_attributes.setdefault(
+              self.checkpointable, []).append(serialized_tensor.name)
+          continue
+        if callable(saveable_factory):
+          saveable = saveable_factory(name=serialized_tensor.checkpoint_key)
+        else:
+          saveable = saveable_factory
+        if self._checkpoint.saveable_object_cache is not None:
+          self._checkpoint.saveable_object_cache.setdefault(
+              self.checkpointable, {})[serialized_tensor.name] = [saveable]
+      if isinstance(saveable, PythonStringStateSaveable):
+        python_saveables.append(saveable)
+      else:
+        named_saveables[serialized_tensor.checkpoint_key] = saveable
+    return existing_restore_ops, named_saveables, python_saveables
+
   def restore_ops(self):
     """Create or fetch restore ops for this object's attributes.
 
@@ -250,32 +325,19 @@
       A list of operations when graph building, or an empty list when executing
       eagerly.
     """
-    saveables = self.checkpointable._gather_saveables_for_checkpoint()  # pylint: disable=protected-access
-    # Name saveables based on the name this object had when it was checkpointed.
-    named_saveables = {}
-    restore_ops = []
-    building_graph = not context.executing_eagerly()
-    for serialized_tensor in self.object_proto.attributes:
-      saveable_factory = saveables.get(serialized_tensor.name, None)
-      if saveable_factory is None:
-        # Purposefully does not throw an exception if attributes have been added
-        # or deleted. Stores unused attributes so an exception can be raised if
-        # the user decides to check that everything in the checkpoint was
-        # loaded.
-        self._checkpoint.unused_attributes.setdefault(
-            self.checkpointable, []).append(serialized_tensor.name)
-        continue
-      if building_graph:
-        existing_ops = self._checkpoint.restore_ops_by_name.get(
-            serialized_tensor.name, None)
-      else:
-        existing_ops = None
-      if existing_ops is None:
-        if callable(saveable_factory):
-          saveable = saveable_factory(name=serialized_tensor.checkpoint_key)
-        else:
-          saveable = saveable_factory
-        named_saveables[serialized_tensor.checkpoint_key] = saveable
+    (restore_ops,
+     named_saveables,
+     python_saveables) = self._gather_ops_or_named_saveables()
+
+    # Eagerly run restorations for Python state.
+    reader = pywrap_tensorflow.NewCheckpointReader(
+        self._checkpoint.save_path_string)
+    for saveable in python_saveables:
+      spec_names = [spec.name for spec in saveable.specs]
+      saveable.python_restore(
+          [reader.get_tensor(name) for name in spec_names])
+
+    # If we have new SaveableObjects, extract and cache restore ops.
     if named_saveables:
       validated_saveables = (
           self._checkpoint.builder._ValidateAndSliceInputs(named_saveables))  # pylint: disable=protected-access
@@ -285,7 +347,7 @@
             ("Saveable keys changed when validating. Got back %s, was "
              "expecting %s") % (named_saveables.keys(), validated_names))
       all_tensors = self._checkpoint.builder.bulk_restore(
-          filename_tensor=self._checkpoint.save_path,
+          filename_tensor=self._checkpoint.save_path_tensor,
           saveables=validated_saveables, preferred_shard=-1,
           restore_sequentially=False)
       saveable_index = 0
@@ -295,7 +357,7 @@
             saveable_index:saveable_index + num_specs]
         saveable_index += num_specs
         restore_op = saveable.restore(saveable_tensors, restored_shapes=None)
-        if building_graph:
+        if not context.executing_eagerly():
           assert saveable.name not in self._checkpoint.restore_ops_by_name
           self._checkpoint.restore_ops_by_name[saveable.name] = restore_op
           restore_ops.append(restore_op)
diff --git a/tensorflow/python/training/checkpointable/util.py b/tensorflow/python/training/checkpointable/util.py
index 3cdaedc..f49ed5c 100644
--- a/tensorflow/python/training/checkpointable/util.py
+++ b/tensorflow/python/training/checkpointable/util.py
@@ -19,6 +19,7 @@
 
 import abc
 import collections
+import os
 import weakref
 
 from tensorflow.core.protobuf import checkpointable_object_graph_pb2
@@ -34,8 +35,9 @@
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_io_ops as io_ops
 from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.training import checkpoint_management
 from tensorflow.python.training import optimizer as optimizer_lib
 from tensorflow.python.training import saveable_object as saveable_object_lib
 from tensorflow.python.training import saver as saver_lib
@@ -66,16 +68,25 @@
 class _CheckpointRestoreCoordinator(object):
   """Holds the status of an object-based checkpoint load."""
 
-  def __init__(self, object_graph_proto, save_path, dtype_map=None):
+  def __init__(self, object_graph_proto, save_path, save_path_tensor,
+               restore_op_cache, saveable_object_cache):
     """Specify the checkpoint being loaded.
 
     Args:
       object_graph_proto: The CheckpointableObjectGraph protocol buffer
         associated with this checkpoint.
-      save_path: A string `Tensor`. The path to the checkpoint, as returned by
+      save_path: A string, the path to the checkpoint, as returned by
         `tf.train.latest_checkpoint`.
-      dtype_map: When executing eagerly, specifies dtypes for creating slot
-        variables. None when graph building.
+      save_path_tensor: A string `Tensor` which contains or will be fed the save
+        path.
+      restore_op_cache: A dictionary shared between
+        `_CheckpointRestoreCoordinator`s for the same Python objects, used to
+        look up restore ops by name to avoid re-creating them across multiple
+        `restore()` calls.
+      saveable_object_cache: A mapping of checkpointable objects -> attribute
+        names -> list(`SaveableObject`s), used when `SaveableObjects` must be
+        referenced every restore (e.g. for Python state); otherwise they would
+        create their own ops every restore.
     """
     self.builder = saver_lib.BulkSaverBuilder()
     self.object_graph_proto = object_graph_proto
@@ -95,12 +106,17 @@
     # loading). Used to make status assertions fail when loading checkpoints
     # that don't quite match.
     self.all_python_objects = _ObjectIdentityWeakSet()
-    self.save_path = save_path
-    self.dtype_map = dtype_map
+    self.save_path_tensor = save_path_tensor
+    self.save_path_string = save_path
+    self.dtype_map = pywrap_tensorflow.NewCheckpointReader(
+        save_path).get_variable_to_dtype_map()
+    # A NewCheckpointReader for the most recent checkpoint, for streaming Python
+    # state restoration.
     # When graph building, contains a list of ops to run to restore objects from
     # this checkpoint.
     self.restore_ops = []
-    self.restore_ops_by_name = {}
+    self.restore_ops_by_name = restore_op_cache
+    self.saveable_object_cache = saveable_object_cache
     self.new_restore_ops_callback = None
     # A mapping from optimizer proto ids to lists of slot variables to be
     # restored when the optimizer is tracked. Only includes slot variables whose
@@ -225,10 +241,11 @@
       def initial_value():
         return initializer(
             shape_object.as_list(), dtype=dtype, partition_info=partition_info)
-    return resource_variable_ops.ResourceVariable(
+    return variables.Variable(
         initial_value=initial_value,
         name=name,
         dtype=variable_dtype,
+        use_resource=True,
         **kwargs
     )
 
@@ -1100,7 +1117,7 @@
 
 def _copy_saver_with_new_var_list(old_saver, new_var_list):
   """Copy a `tf.train.Saver`'s state to a new Saver with different variables."""
-  new_saver = saver_lib.Saver(var_list=new_var_list)
+  new_saver = saver_lib.Saver(var_list=new_var_list, max_to_keep=None)
   # TODO(allenl): Move to copying functionality to Saver?
   # pylint: disable=protected-access
   new_saver._last_checkpoints = old_saver._last_checkpoints
@@ -1150,16 +1167,15 @@
     self._last_save_object_graph = None
     self._last_save_saver = None
 
-    # Op caching for restore
-    self._last_restore_object_graph = None
-    self._last_restore_checkpoint = None
+    # Op caching for restore, shared between _CheckpointRestoreCoordinators
+    self._restore_op_cache = {}
 
     if context.executing_eagerly():
       # SaveableObjects are always recreated when executing eagerly.
       self._saveable_object_cache = None
     else:
-      # Maps Checkpointable objects -> attribute names -> SaveableObjects, to
-      # avoid re-creating SaveableObjects when graph building.
+      # Maps Checkpointable objects -> attribute names -> list(SaveableObjects),
+      # to avoid re-creating SaveableObjects when graph building.
       self._saveable_object_cache = _ObjectIdentityWeakKeyDictionary()
 
   @property
@@ -1226,7 +1242,8 @@
         self._last_save_saver = _copy_saver_with_new_var_list(
             old_saver=self._last_save_saver, new_var_list=named_variables)
       else:
-        self._last_save_saver = saver_lib.Saver(var_list=named_variables)
+        self._last_save_saver = saver_lib.Saver(
+            var_list=named_variables, max_to_keep=None)
       self._last_save_object_graph = graph_proto
     with ops.device("/cpu:0"):
       save_path = self._last_save_saver.save(
@@ -1234,6 +1251,7 @@
               session=session, feed_additions=feed_additions),
           save_path=file_prefix,
           write_meta_graph=False,
+          write_state=False,
           global_step=checkpoint_number)
     return save_path
 
@@ -1335,22 +1353,12 @@
     object_graph_proto = (
         checkpointable_object_graph_pb2.CheckpointableObjectGraph())
     object_graph_proto.ParseFromString(object_graph_string)
-    if graph_building and object_graph_proto == self._last_restore_object_graph:
-      checkpoint = self._last_restore_checkpoint
-    else:
-      checkpoint = _CheckpointRestoreCoordinator(
-          object_graph_proto=object_graph_proto,
-          save_path=file_prefix_tensor,
-          dtype_map=dtype_map)
-      if graph_building:
-        if self._last_restore_object_graph is not None:
-          raise NotImplementedError(
-              "Using a single Saver to restore different object graphs is not "
-              "currently supported when graph building. Use a different Saver "
-              "for each object graph (restore ops will be duplicated), or "
-              "file a feature request if this limitation bothers you.")
-        self._last_restore_checkpoint = checkpoint
-        self._last_restore_object_graph = object_graph_proto
+    checkpoint = _CheckpointRestoreCoordinator(
+        object_graph_proto=object_graph_proto,
+        save_path=save_path,
+        save_path_tensor=file_prefix_tensor,
+        restore_op_cache=self._restore_op_cache,
+        saveable_object_cache=self._saveable_object_cache)
     base._CheckpointPosition(  # pylint: disable=protected-access
         checkpoint=checkpoint, proto_id=0).restore(self._root_checkpointable)
     load_status = CheckpointLoadStatus(
@@ -1486,6 +1494,32 @@
             add_variable(self, name="save_counter", initializer=0,
                          dtype=dtypes.int64))
 
+  def write(self, file_prefix, session=None):
+    """Writes a training checkpoint.
+
+    The checkpoint includes variables created by this object and any
+    checkpointable objects it depends on at the time `Checkpoint.write()` is
+    called.
+
+    `write` does not number checkpoints, increment `save_counter`, or update the
+    metadata used by `tf.train.latest_checkpoint`. It is primarily intended for
+    use by higher level checkpoint management utilities. `save` provides a very
+    basic implementation of these features.
+
+    Args:
+      file_prefix: A prefix to use for the checkpoint filenames
+        (/path/to/directory/and_a_prefix).
+      session: The session to evaluate variables in. Ignored when executing
+        eagerly. If not provided when graph building, the default session is
+        used.
+
+    Returns:
+      The full path to the checkpoint (i.e. `file_prefix`).
+    """
+    return self._saver.save(
+        file_prefix=file_prefix,
+        session=session)
+
   @property
   def save_counter(self):
     """An integer variable which starts at zero and is incremented on save.
@@ -1499,12 +1533,19 @@
     return self._save_counter
 
   def save(self, file_prefix, session=None):
-    """Save a training checkpoint.
+    """Saves a training checkpoint and provides basic checkpoint management.
 
     The saved checkpoint includes variables created by this object and any
     checkpointable objects it depends on at the time `Checkpoint.save()` is
     called.
 
+    `save` is a basic convenience wrapper around the `write` method,
+    sequentially numbering checkpoints using `save_counter` and updating the
+    metadata used by `tf.train.latest_checkpoint`. More advanced checkpoint
+    management, for example garbage collection and custom numbering, may be
+    provided by other utilities which also wrap `write`
+    (`tf.contrib.checkpoint.CheckpointManager` for example).
+
     Args:
       file_prefix: A prefix to use for the checkpoint filenames
         (/path/to/directory/and_a_prefix). Names are generated based on this
@@ -1527,15 +1568,20 @@
         session.run(self.save_counter.initializer)
     if not graph_building or self._save_assign_op is None:
       with ops.colocate_with(self.save_counter):
-        assign_op = self.save_counter.assign_add(1, read_value=False)
+        assign_op = self.save_counter.assign_add(1, read_value=True)
       if graph_building:
-        self._save_assign_op = assign_op
+        self._save_assign_op = data_structures.NoDependency(assign_op)
     if graph_building:
-      session.run(self._save_assign_op)
-    return self._saver.save(
-        file_prefix=file_prefix,
-        checkpoint_number=self.save_counter,
-        session=session)
+      checkpoint_number = session.run(self._save_assign_op)
+    else:
+      checkpoint_number = assign_op.numpy()
+    file_path = self.write("%s-%d" % (file_prefix, checkpoint_number),
+                           session=session)
+    checkpoint_management.update_checkpoint_state(
+        save_dir=os.path.dirname(file_prefix),
+        model_checkpoint_path=file_path,
+        all_model_checkpoint_paths=[file_path])
+    return file_path
 
   def restore(self, save_path):
     """Restore a training checkpoint.
diff --git a/tensorflow/python/training/checkpointable/util_test.py b/tensorflow/python/training/checkpointable/util_test.py
index 5506e6b..cac293e 100644
--- a/tensorflow/python/training/checkpointable/util_test.py
+++ b/tensorflow/python/training/checkpointable/util_test.py
@@ -522,7 +522,6 @@
     # Does create garbage when executing eagerly due to ops.Graph() creation.
     num_training_steps = 10
     checkpoint_directory = self.get_temp_dir()
-    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
     for training_continuation in range(3):
       with ops.Graph().as_default(), self.test_session(
           graph=ops.get_default_graph()), test_util.device(use_gpu=True):
@@ -531,9 +530,9 @@
         root = checkpointable_utils.Checkpoint(
             optimizer=optimizer, model=model,
             global_step=training_util.get_or_create_global_step())
-        checkpoint_path = checkpoint_management.latest_checkpoint(
-            checkpoint_directory)
-        status = root.restore(save_path=checkpoint_path)
+        manager = checkpoint_management.CheckpointManager(
+            root, checkpoint_directory, max_to_keep=1)
+        status = root.restore(save_path=manager.latest_checkpoint)
         input_value = constant_op.constant([[3.]])
         train_fn = functools.partial(
             optimizer.minimize,
@@ -544,12 +543,26 @@
         status.initialize_or_restore()
         for _ in range(num_training_steps):
           train_fn()
-        root.save(file_prefix=checkpoint_prefix)
+        manager.save()
         self.assertEqual((training_continuation + 1) * num_training_steps,
                          self.evaluate(root.global_step))
         self.assertEqual(training_continuation + 1,
                          self.evaluate(root.save_counter))
 
+  @test_util.run_in_graph_and_eager_modes
+  def testCustomNumbering(self):
+    directory = self.get_temp_dir()
+    prefix = os.path.join(directory, "ckpt")
+    step = resource_variable_ops.ResourceVariable(0, dtype=dtypes.int64)
+    checkpoint = checkpointable_utils.Checkpoint(step=step)
+    self.evaluate(step.initializer)
+    for i in range(5):
+      path = checkpoint.write("%s-%d" % (prefix, self.evaluate(step)))
+      expected_suffix = "-%d" % (2 * i,)
+      if not path.endswith(expected_suffix):
+        self.fail("%s should have suffix %s" % (path, expected_suffix))
+      self.evaluate(step.assign_add(2))
+
   # pylint: disable=cell-var-from-loop
   @test_util.run_in_graph_and_eager_modes
   def testWithDefun(self):
@@ -996,7 +1009,8 @@
         self.assertEqual(before_ops, graph.get_operations())
 
   @test_util.run_in_graph_and_eager_modes
-  def testCheckpointCleanup(self):
+  def testCheckpointState(self):
+    # No checkpoints are deleted by default
     checkpoint_directory = self.get_temp_dir()
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
     obj = tracking.Checkpointable()
@@ -1006,7 +1020,7 @@
     for _ in range(10):
       saver.save(checkpoint_prefix)
     expected_filenames = ["checkpoint"]
-    for checkpoint_number in range(6, 11):
+    for checkpoint_number in range(1, 11):
       expected_filenames.append("ckpt-%d.index" % (checkpoint_number,))
       expected_filenames.append(
           "ckpt-%d.data-00000-of-00001" % (checkpoint_number,))
@@ -1016,7 +1030,7 @@
         os.listdir(checkpoint_directory))
 
   @test_util.run_in_graph_and_eager_modes
-  def testCheckpointCleanupChangingVarList(self):
+  def testCheckpointStateChangingVarList(self):
     checkpoint_directory = self.get_temp_dir()
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
     obj = tracking.Checkpointable()
@@ -1032,8 +1046,8 @@
       looped_variables.append(new_variable)
     expected_filenames = ["checkpoint"]
     # We've copied the saver each time, but checkpoint management should still
-    # be consistent.
-    for checkpoint_number in range(6, 11):
+    # be consistent. Nothing gets deleted.
+    for checkpoint_number in range(1, 11):
       expected_filenames.append("ckpt-%d.index" % (checkpoint_number,))
       expected_filenames.append(
           "ckpt-%d.data-00000-of-00001" % (checkpoint_number,))
@@ -1041,6 +1055,15 @@
         self,
         expected_filenames,
         os.listdir(checkpoint_directory))
+    self.assertEqual(
+        checkpoint_prefix + "-10",
+        checkpoint_management.latest_checkpoint(checkpoint_directory))
+    # The checkpoint list only contains the most recent checkpoint, but they're
+    # all on disk. This means we won't eventually run into proto size limits.
+    self.assertEqual(
+        [checkpoint_prefix + "-10"],
+        (checkpoint_management.get_checkpoint_state(checkpoint_directory)
+         .all_model_checkpoint_paths))
     for v in looped_variables:
       self.evaluate(v.assign(314))
     checkpoint.restore(checkpoint_prefix + "-6").run_restore_ops()
@@ -1050,16 +1073,11 @@
     self.assertEqual(5, self.evaluate(checkpoint.var_5))
     self.assertEqual(1, self.evaluate(checkpoint.var_1))
     self.assertEqual(0, self.evaluate(checkpoint.var_0))
-    if context.executing_eagerly():
-      checkpoint.restore(checkpoint_prefix + "-10").run_restore_ops()
-      self.assertEqual(9, self.evaluate(checkpoint.var_9))
-      self.assertEqual(8, self.evaluate(checkpoint.var_8))
-      self.assertEqual(1, self.evaluate(checkpoint.var_1))
-      self.assertEqual(0, self.evaluate(checkpoint.var_0))
-    else:
-      # Restoring into modified graphs is an error while graph building.
-      with self.assertRaises(NotImplementedError):
-        checkpoint.restore(checkpoint_prefix + "-10").run_restore_ops()
+    checkpoint.restore(checkpoint_prefix + "-10").run_restore_ops()
+    self.assertEqual(9, self.evaluate(checkpoint.var_9))
+    self.assertEqual(8, self.evaluate(checkpoint.var_8))
+    self.assertEqual(1, self.evaluate(checkpoint.var_1))
+    self.assertEqual(0, self.evaluate(checkpoint.var_0))
 
   def testManyRestoresGraph(self):
     """Restores after the first should not modify the graph."""
diff --git a/tensorflow/python/training/distribute.py b/tensorflow/python/training/distribute.py
index 170d683..20e0315 100644
--- a/tensorflow/python/training/distribute.py
+++ b/tensorflow/python/training/distribute.py
@@ -21,6 +21,7 @@
 import threading
 
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context as eager_context
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -30,71 +31,11 @@
 from tensorflow.python.ops.losses import losses_impl
 from tensorflow.python.platform import tf_logging
 from tensorflow.python.training import device_util
+from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.util import nest
 
 
 # ------------------------------------------------------------------------------
-# Internal API for setting the current thread mode as being either in a
-# tower or cross-tower context for a particular distribution strategy.
-
-
-class _ThreadMode(object):
-
-  def __init__(self, dist, cross, tower):
-    self.distribution_strategy = dist
-    self.cross_tower_context = cross
-    self.tower_context = tower
-
-
-class _CrossTowerThreadMode(_ThreadMode):
-
-  def __init__(self, distribution_strategy):
-    _ThreadMode.__init__(
-        self, distribution_strategy, distribution_strategy, None)
-
-
-class _InTowerThreadMode(_ThreadMode):
-
-  def __init__(self, tower_ctx):
-    _ThreadMode.__init__(
-        self, tower_ctx.distribution_strategy, None, tower_ctx)
-
-
-_per_thread_mode = threading.local()
-
-
-def _push_per_thread_mode(context):
-  if not hasattr(_per_thread_mode, "stack"):
-    _per_thread_mode.stack = []
-  _per_thread_mode.stack.append(context)
-
-
-def _pop_per_thread_mode():
-  _per_thread_mode.stack.pop(-1)
-
-
-class _DefaultTowerThreadMode(_ThreadMode):
-  """Type of default value returned by `_get_per_thread_mode()`.
-
-  Used when the thread-local stack is empty.
-  """
-
-  def __init__(self):
-    # _default_distribution_strategy and _default_tower_context are
-    # defined at the bottom of this file.
-    _ThreadMode.__init__(
-        self, _default_distribution_strategy, None, _default_tower_context)
-
-
-def _get_per_thread_mode():
-  try:
-    return _per_thread_mode.stack[-1]
-  except (AttributeError, IndexError):
-    # _default_tower_mode is defined at the bottom of this file.
-    return _default_tower_mode
-
-
-# ------------------------------------------------------------------------------
 # Context tracking whether in a distribution.update() or .update_non_slot()
 # call.
 
@@ -127,96 +68,6 @@
 
 
 # ------------------------------------------------------------------------------
-# Public API for accessing the current thread mode
-
-
-def get_tower_context():
-  """Returns the current TowerContext or None if in a cross-tower context.
-
-  Note that execution:
-  1. starts in the default (single-tower) tower context (this function
-     will return the default TowerContext object);
-  2. switches to cross-tower context (in which case this will return
-     None) when entering a `with DistributionStrategy.scope():` block;
-  3. switches to a (non-default) tower context inside
-     `call_for_each_tower(fn, ...)`;
-  4. if `fn` calls `get_tower_context()->merge_call(merge_fn, ...)`, then
-     inside `merge_fn` you are back in the cross-tower context (and again
-     this function will return None).
-
-  Note that you can also go directly from step 1 to 4 to switch to a
-  cross-tower context for the default `DistributionStrategy`. You may
-  also switch from the cross-tower context of 4 to a tower context by
-  calling `call_for_each_tower()`, jumping back to step 3.
-
-  Most `DistributionStrategy` methods may only be executed in
-  a cross-tower context, in a tower context you should use the
-  `TowerContext` API instead.
-
-  Returns:
-    The current `TowerContext` object when in a tower context scope, else None.
-
-    Exactly one of `get_tower_context()` and `get_cross_tower_context()`
-    will return None in a particular block.
-  """
-  return _get_per_thread_mode().tower_context
-
-
-def get_cross_tower_context():
-  """Returns the current DistributionStrategy if in a cross-tower context.
-
-  Note that execution:
-  1. starts in the default (single-tower) tower context;
-  2. switches to cross-tower context when entering a
-     `with DistributionStrategy.scope():` block;
-  3. switches to a (non-default) tower context inside
-     `call_for_each_tower(fn, ...)`;
-  4. if `fn` calls `get_tower_context()->merge_call(merge_fn, ...)`, then
-     inside `merge_fn` you are back in the cross-tower context.
-
-  Note that you can also go directly from step 1 to 4 to switch to a
-  cross-tower context for the default `DistributionStrategy`. You may
-  also switch from the cross-tower context of 4 to a tower context by
-  calling `call_for_each_tower()`, jumping back to step 3.
-
-  Most `DistributionStrategy` methods may only be executed in
-  a cross-tower context.
-
-  Returns:
-    Returns the current `DistributionStrategy` object in a cross-tower
-    context, or None.
-
-    Exactly one of `get_tower_context()` and `get_cross_tower_context()`
-    will return None in a particular block.
-  """
-  return _get_per_thread_mode().cross_tower_context
-
-
-def get_distribution_strategy():
-  """Returns the current `DistributionStrategy` object.
-
-  Prefer to use `get_tower_context()` or `get_cross_tower_context()`
-  instead when possible.
-
-  Returns:
-    A `DistributionStrategy` object. Inside a
-    `with distribution_strategy.scope()` block, it returns
-    `distribution_strategy`, otherwise it returns the default
-    (single-tower) `DistributionStrategy` object.
-  """
-  return _get_per_thread_mode().distribution_strategy
-
-
-def has_distribution_strategy():
-  """Return if there is a current non-default `DistributionStrategy`.
-
-  Returns:
-    True if inside a `with distribution_strategy.scope():`.
-  """
-  return get_distribution_strategy() is not _default_distribution_strategy
-
-
-# ------------------------------------------------------------------------------
 # Public utility functions.
 
 
@@ -238,7 +89,8 @@
   if context.cross_tower_context is distribution_strategy: return
   # We have an error to report, figure out the right message.
   if context.distribution_strategy is not distribution_strategy:
-    if context.distribution_strategy is _default_distribution_strategy:
+    if (context.distribution_strategy is
+        distribution_strategy_context._get_default_distribution_strategy()):  # pylint: disable=protected-access
       raise RuntimeError(
           'Need to be inside "with distribution_strategy.scope()" for %s' %
           (distribution_strategy,))
@@ -271,7 +123,8 @@
   context = _get_per_thread_mode()
   if context.distribution_strategy is distribution_strategy: return
   # We have an error to report, figure out the right message.
-  if context.distribution_strategy is _default_distribution_strategy:
+  if (context.distribution_strategy is
+      distribution_strategy_context._get_default_distribution_strategy()):  # pylint: disable=protected-access
     raise RuntimeError(
         'Need to be inside "with distribution_strategy.scope()" for %s' %
         (distribution_strategy,))
@@ -294,7 +147,8 @@
                var_creator_scope,
                var_scope=None,
                default_device=None):
-    self._context = _CrossTowerThreadMode(distribution_strategy)
+    self._context = distribution_strategy_context._CrossTowerThreadMode(  # pylint: disable=protected-access
+        distribution_strategy)
     self._var_creator_scope = var_creator_scope
     self._var_scope = var_scope
     if default_device:
@@ -587,7 +441,7 @@
     Returns:
       A context manager.
     """
-    if has_distribution_strategy():
+    if distribution_strategy_context.has_distribution_strategy():
       _require_cross_tower_context(self)
       return _SameScopeAgainContext(self)
 
@@ -727,6 +581,90 @@
   def _broadcast(self, tensor, destinations):
     raise NotImplementedError("must be implemented in descendants")
 
+  def initialize(self):
+    """Any initialization to be done before running any computations.
+
+    In eager mode, it executes any initialization as a side effect.
+    In graph mode, it creates the initialization ops and returns them.
+
+    For example, TPU initialize_system ops.
+
+    Returns:
+      In eager mode, returns `None`.
+      In graph mode, a list of ops to execute. Empty list if nothing to be done.
+    """
+    if eager_context.executing_eagerly():
+      return
+    else:
+      return []
+
+  def finalize(self):
+    """Any final actions to be done at the end of all computations.
+
+    In eager mode, it executes any finalize actions as a side effect.
+    In graph mode, it creates the finalize ops and returns them.
+
+    For example, TPU shutdown ops.
+
+    Returns:
+      In eager mode, returns `None`.
+      In graph mode, a list of ops to execute. Empty list if nothing to be done.
+    """
+    if eager_context.executing_eagerly():
+      return
+    else:
+      return []
+
+  def run_steps_on_dataset(self, fn, iterator, iterations=1,
+                           initial_loop_values=None):
+    """Run `fn` with input from `iterator` for `iterations` times.
+
+    This method can be used to run a step function for training a number of
+    times using input from a dataset.
+
+    Args:
+      fn: function to run using this distribution strategy. The function must
+        have the following signature: def fn(context, *inputs).
+        `context` is an instance of `MultiStepContext` that will be passed when
+        `fn` is run. `context` can be used to specify the outputs to be returned
+        from `fn` by calling `context.set_last_step_output`. It can also be used
+        to capture non tensor outputs by `context.set_non_tensor_output`.
+        See `MultiStepContext` documentation for more information.
+        `inputs` will have same type/structure as `iterator.get_next()`. If the
+        `iterator.get_next()` returns a tuple say `return x, y` then whose will
+        be unpacked and passed to the `step_fn`; and step_fn signature would
+        look like `def step_fn(context, x, y)`. If the iterator returns a single
+        value say `return x` then the value is passed as is; the step_fn
+        signature would look like `def step_fn(context, x)`.
+        Typically, `fn` will use `call_for_each_tower` method of the strategy
+        to distribute the computation over multiple towers.
+      iterator: Iterator of a dataset that represents the input for `fn`. The
+        caller is responsible for initializing the iterator as needed.
+      iterations: (Optional) Number of iterations that `fn` should be run.
+        Defaults to 1.
+      initial_loop_values: (Optional) Initial values to be passed into the
+        loop that runs `fn`. Defaults to `None`. # TODO(priyag): Remove
+        initial_loop_values argument when we have a mechanism to infer the
+        outputs of `fn`.
+
+    Returns:
+      Returns the `MultiStepContext` object which has the following properties,
+      among other things:
+        - run_op: An op that runs `fn` `iterations` times.
+        - last_step_outputs: A dictionary containing tensors set using
+        `context.set_last_step_output`. Evaluating this returns the value of
+        the tensors after the last iteration.
+        - non_tensor_outputs: A dictionatry containing anything that was set by
+          `fn` by calling `context.set_non_tensor_output`.
+    """
+    _require_cross_tower_context(self)
+    return self._run_steps_on_dataset(fn, iterator, iterations,
+                                      initial_loop_values)
+
+  def _run_steps_on_dataset(self, fn, iterator, iterations,
+                            initial_loop_values):
+    raise NotImplementedError("must be implemented in descendants")
+
   def call_for_each_tower(self, fn, *args, **kwargs):
     """Run `fn` once per tower.
 
@@ -784,7 +722,7 @@
 
     Args:
       aggregation: Indicates how a variable will be aggregated. Accepted values
-        are @{tf.VariableAggregation.SUM}, @{tf.VariableAggregation.MEAN}.
+        are `tf.VariableAggregation.SUM`, `tf.VariableAggregation.MEAN`.
       value: A per-device value with one value per tower.
       destinations: An optional mirrored variable, a device string,
         list of device strings. The return value will be copied to all
@@ -813,7 +751,7 @@
 
     Args:
       aggregation: Indicates how a variable will be aggregated. Accepted values
-        are @{tf.VariableAggregation.SUM}, @{tf.VariableAggregation.MEAN}.
+        are `tf.VariableAggregation.SUM`, `tf.VariableAggregation.MEAN`.
       value_destination_pairs: A sequence of (value, destinations)
         pairs. See `reduce()` for a description.
 
@@ -997,9 +935,37 @@
   def _worker_device_index(self):
     raise NotImplementedError("must be implemented in descendants")
 
-  def configure(self, session_config=None):
-    """Find the best configuration given a tensorflow session config."""
-    del session_config
+  @property
+  def between_graph(self):
+    """Whether the strategy uses between-graph replication or not.
+
+      This is expected to return a constant value that will not be changed
+      throughout its life cycle.
+    """
+    raise NotImplementedError("must be implemented in descendants")
+
+  def configure(self,
+                session_config=None,
+                cluster_spec=None,
+                task_type=None,
+                task_id=None):
+    """Configures the strategy class."""
+    del session_config, cluster_spec, task_type, task_id
+
+  @property
+  def should_init(self):
+    """Whether initialization is needed."""
+    raise NotImplementedError("must be implemented in descendants")
+
+  @property
+  def should_checkpoint(self):
+    """Whether checkpointing is needed."""
+    raise NotImplementedError("must be implemented in descendants")
+
+  @property
+  def should_save_summary(self):
+    """Whether saving summaries is needed."""
+    raise NotImplementedError("must be implemented in descendants")
 
 
 # A note about the difference between the context managers
@@ -1026,7 +992,8 @@
 
   def __init__(self, distribution_strategy, tower_id):
     self._distribution_strategy = distribution_strategy
-    self._thread_context = _InTowerThreadMode(self)
+    self._thread_context = distribution_strategy_context._InTowerThreadMode(  # pylint: disable=protected-access
+        self)
     self._tower_id = tower_id
 
   def __enter__(self):
@@ -1069,7 +1036,8 @@
   def _merge_call(self, merge_fn, *args, **kwargs):
     """Default implementation for single tower."""
     _push_per_thread_mode(  # thread-local, so not needed with multiple threads
-        _CrossTowerThreadMode(self._distribution_strategy))
+        distribution_strategy_context._CrossTowerThreadMode(  # pylint: disable=protected-access
+            self._distribution_strategy))
     try:
       return merge_fn(self._distribution_strategy, *args, **kwargs)
     finally:
@@ -1116,7 +1084,7 @@
 
   def scope(self):
     """Context manager setting a variable creator and `self` as current."""
-    if has_distribution_strategy():
+    if distribution_strategy_context.has_distribution_strategy():
       raise RuntimeError("Must not nest DistributionStrategy scopes.")
 
     def creator(next_creator, *args, **kwargs):
@@ -1197,6 +1165,7 @@
     raise RuntimeError("worker_device_index() method unsupported by "
                        "_DefaultDistributionStrategy.")
 
+
 # ------------------------------------------------------------------------------
 # Common operations
 
@@ -1212,20 +1181,11 @@
   def merge_fn(dist, vm):
     return dist.group(dist.update(vm, update))
 
-  tower_context = get_tower_context()
+  tower_context = distribution_strategy_context.get_tower_context()
   return tower_context.merge_call(merge_fn, v)
 
 
 # ------------------------------------------------------------------------------
-# Singletons
-
-_default_distribution_strategy = _DefaultDistributionStrategy()
-_default_tower_context = TowerContext(
-    _default_distribution_strategy, tower_id=0)
-_default_tower_mode = _DefaultTowerThreadMode()
-
-
-# ------------------------------------------------------------------------------
 # We haven't yet implemented deserialization for DistributedVariables.
 # So here we catch any attempts to deserialize variables
 # when using distribution strategies.
@@ -1234,7 +1194,7 @@
 
 
 def _from_proto_fn(v, import_scope=None):
-  if has_distribution_strategy():
+  if distribution_strategy_context.has_distribution_strategy():
     raise NotImplementedError(
         "Deserialization of variables is not yet supported when using"
         "distributed strategies.")
@@ -1243,3 +1203,10 @@
 
 resource_variable_ops._from_proto_fn = _from_proto_fn
 # pylint: enable=protected-access
+
+
+#-------------------------------------------------------------------------------
+# Shorthand for some methods from distribution_strategy_context.
+_push_per_thread_mode = distribution_strategy_context._push_per_thread_mode  # pylint: disable=protected-access
+_get_per_thread_mode = distribution_strategy_context._get_per_thread_mode  # pylint: disable=protected-access
+_pop_per_thread_mode = distribution_strategy_context._pop_per_thread_mode  # pylint: disable=protected-access
diff --git a/tensorflow/python/training/distribute_test.py b/tensorflow/python/training/distribute_test.py
index 694145e..f03bd39 100644
--- a/tensorflow/python/training/distribute_test.py
+++ b/tensorflow/python/training/distribute_test.py
@@ -21,6 +21,7 @@
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import test
 from tensorflow.python.training import distribute
+from tensorflow.python.training import distribution_strategy_context
 
 
 class _TestTowerContext(distribute.TowerContext):
@@ -49,12 +50,12 @@
 
 
 def _assert_in_default_state(t):
-  t.assertIs(distribute._default_tower_context,
-             distribute.get_tower_context())
-  t.assertIs(None, distribute.get_cross_tower_context())
-  t.assertIs(distribute._default_distribution_strategy,
-             distribute.get_distribution_strategy())
-  t.assertFalse(distribute.has_distribution_strategy())
+  t.assertIs(distribution_strategy_context._get_default_tower_context(),
+             distribution_strategy_context.get_tower_context())
+  t.assertIs(None, distribution_strategy_context.get_cross_tower_context())
+  t.assertIs(distribution_strategy_context._get_default_distribution_strategy(),
+             distribution_strategy_context.get_distribution_strategy())
+  t.assertFalse(distribution_strategy_context.has_distribution_strategy())
 
 
 class TestStrategyTest(test.TestCase):
@@ -64,11 +65,13 @@
     dist = _TestStrategy()
 
     def run_fn():
-      tower_context = distribute.get_tower_context()
+      tower_context = distribution_strategy_context.get_tower_context()
       self.assertTrue(tower_context is not None)
-      self.assertIs(None, distribute.get_cross_tower_context())
-      self.assertTrue(distribute.has_distribution_strategy())
-      self.assertIs(dist, distribute.get_distribution_strategy())
+      self.assertIs(None,
+                    distribution_strategy_context.get_cross_tower_context())
+      self.assertTrue(distribution_strategy_context.has_distribution_strategy())
+      self.assertIs(dist,
+                    distribution_strategy_context.get_distribution_strategy())
       self.assertEqual("foo", tower_context.merge_call(None, test_arg="foo"))
       expected_value = _get_test_variable(
           "bar", variable_scope.VariableSynchronization.AUTO,
@@ -86,10 +89,12 @@
     _assert_in_default_state(self)
     dist = _TestStrategy()
     with dist.scope():
-      self.assertIs(None, distribute.get_tower_context())
-      self.assertIs(dist, distribute.get_cross_tower_context())
-      self.assertTrue(distribute.has_distribution_strategy())
-      self.assertIs(dist, distribute.get_distribution_strategy())
+      self.assertIs(None, distribution_strategy_context.get_tower_context())
+      self.assertIs(dist,
+                    distribution_strategy_context.get_cross_tower_context())
+      self.assertTrue(distribution_strategy_context.has_distribution_strategy())
+      self.assertIs(dist,
+                    distribution_strategy_context.get_distribution_strategy())
       expected_value = _get_test_variable(
           "baz", variable_scope.VariableSynchronization.AUTO,
           variable_scope.VariableAggregation.NONE)
@@ -120,15 +125,21 @@
     _assert_in_default_state(self)
 
     def merge_fn(dist, s):
-      self.assertIs(distribute._default_distribution_strategy, dist)
-      self.assertIs(None, distribute.get_tower_context())
-      self.assertIs(dist, distribute.get_cross_tower_context())
-      self.assertIs(dist, distribute.get_distribution_strategy())
-      self.assertFalse(distribute.has_distribution_strategy())
+      self.assertIs(
+          distribution_strategy_context._get_default_distribution_strategy(),
+          dist)
+      self.assertIs(None, distribution_strategy_context.get_tower_context())
+      self.assertIs(dist,
+                    distribution_strategy_context.get_cross_tower_context())
+      self.assertIs(dist,
+                    distribution_strategy_context.get_distribution_strategy())
+      self.assertFalse(
+          distribution_strategy_context.has_distribution_strategy())
       return "foo_" + s
 
-    tower_ctx = distribute.get_tower_context()
-    self.assertIs(distribute._default_tower_context, tower_ctx)
+    tower_ctx = distribution_strategy_context.get_tower_context()
+    self.assertIs(distribution_strategy_context._get_default_tower_context(),
+                  tower_ctx)
     self.assertEqual("foo_bar", tower_ctx.merge_call(merge_fn, "bar"))
     _assert_in_default_state(self)
 
diff --git a/tensorflow/python/training/distribution_strategy_context.py b/tensorflow/python/training/distribution_strategy_context.py
new file mode 100644
index 0000000..998b5c3
--- /dev/null
+++ b/tensorflow/python/training/distribution_strategy_context.py
@@ -0,0 +1,203 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utility to get distribution strategy related contexts."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.util.lazy_loader import LazyLoader
+
+
+# There is a circular dependency between this and `distribute` module. So we
+# load it lazily to workaround this.
+distribute_lib = LazyLoader(
+    "distribute_lib", globals(),
+    "tensorflow.python.training.distribute")
+
+# ------------------------------------------------------------------------------
+# Internal API for setting the current thread mode as being either in a
+# tower or cross-tower context for a particular distribution strategy.
+
+
+class _ThreadMode(object):
+
+  def __init__(self, dist, cross, tower):
+    self.distribution_strategy = dist
+    self.cross_tower_context = cross
+    self.tower_context = tower
+
+
+class _CrossTowerThreadMode(_ThreadMode):
+
+  def __init__(self, distribution_strategy):
+    _ThreadMode.__init__(
+        self, distribution_strategy, distribution_strategy, None)
+
+
+class _InTowerThreadMode(_ThreadMode):
+
+  def __init__(self, tower_ctx):
+    _ThreadMode.__init__(
+        self, tower_ctx.distribution_strategy, None, tower_ctx)
+
+
+def _push_per_thread_mode(context):
+  ops.get_default_graph()._distribution_strategy_stack.append(context)  # pylint: disable=protected-access
+
+
+def _pop_per_thread_mode():
+  ops.get_default_graph()._distribution_strategy_stack.pop(-1)  # pylint: disable=protected-access
+
+
+class _DefaultTowerThreadMode(_ThreadMode):
+  """Type of default value returned by `_get_per_thread_mode()`.
+
+  Used when the thread-local stack is empty.
+  """
+
+  def __init__(self):
+    _ThreadMode.__init__(self, _get_default_distribution_strategy(), None,
+                         _get_default_tower_context())
+
+
+def _get_per_thread_mode():
+  try:
+    return ops.get_default_graph()._distribution_strategy_stack[-1]  # pylint: disable=protected-access
+  except (AttributeError, IndexError):
+    return _get_default_tower_mode()
+
+
+# ------------------------------------------------------------------------------
+# Public API for accessing the current thread mode
+
+
+def get_tower_context():
+  """Returns the current TowerContext or None if in a cross-tower context.
+
+  Note that execution:
+  1. starts in the default (single-tower) tower context (this function
+     will return the default TowerContext object);
+  2. switches to cross-tower context (in which case this will return
+     None) when entering a `with DistributionStrategy.scope():` block;
+  3. switches to a (non-default) tower context inside
+     `call_for_each_tower(fn, ...)`;
+  4. if `fn` calls `get_tower_context()->merge_call(merge_fn, ...)`, then
+     inside `merge_fn` you are back in the cross-tower context (and again
+     this function will return None).
+
+  Note that you can also go directly from step 1 to 4 to switch to a
+  cross-tower context for the default `DistributionStrategy`. You may
+  also switch from the cross-tower context of 4 to a tower context by
+  calling `call_for_each_tower()`, jumping back to step 3.
+
+  Most `DistributionStrategy` methods may only be executed in
+  a cross-tower context, in a tower context you should use the
+  `TowerContext` API instead.
+
+  Returns:
+    The current `TowerContext` object when in a tower context scope, else None.
+
+    Exactly one of `get_tower_context()` and `get_cross_tower_context()`
+    will return None in a particular block.
+  """
+  return _get_per_thread_mode().tower_context
+
+
+def get_cross_tower_context():
+  """Returns the current DistributionStrategy if in a cross-tower context.
+
+  Note that execution:
+  1. starts in the default (single-tower) tower context;
+  2. switches to cross-tower context when entering a
+     `with DistributionStrategy.scope():` block;
+  3. switches to a (non-default) tower context inside
+     `call_for_each_tower(fn, ...)`;
+  4. if `fn` calls `get_tower_context()->merge_call(merge_fn, ...)`, then
+     inside `merge_fn` you are back in the cross-tower context.
+
+  Note that you can also go directly from step 1 to 4 to switch to a
+  cross-tower context for the default `DistributionStrategy`. You may
+  also switch from the cross-tower context of 4 to a tower context by
+  calling `call_for_each_tower()`, jumping back to step 3.
+
+  Most `DistributionStrategy` methods may only be executed in
+  a cross-tower context.
+
+  Returns:
+    Returns the current `DistributionStrategy` object in a cross-tower
+    context, or None.
+
+    Exactly one of `get_tower_context()` and `get_cross_tower_context()`
+    will return None in a particular block.
+  """
+  return _get_per_thread_mode().cross_tower_context
+
+
+def get_distribution_strategy():
+  """Returns the current `DistributionStrategy` object.
+
+  Prefer to use `get_tower_context()` or `get_cross_tower_context()`
+  instead when possible.
+
+  Returns:
+    A `DistributionStrategy` object. Inside a
+    `with distribution_strategy.scope()` block, it returns
+    `distribution_strategy`, otherwise it returns the default
+    (single-tower) `DistributionStrategy` object.
+  """
+  return _get_per_thread_mode().distribution_strategy
+
+
+def has_distribution_strategy():
+  """Return if there is a current non-default `DistributionStrategy`.
+
+  Returns:
+    True if inside a `with distribution_strategy.scope():`.
+  """
+  return get_distribution_strategy() is not _get_default_distribution_strategy()
+
+
+# ------------------------------------------------------------------------------
+# Defaults that are used when no distribution strategy is explicitly created.
+# We create them lazily in a function so that we can workaround the circular
+# dependency on distribute_lib. See lazy loader at the top of this file.
+
+_defaults = {
+    "distribution_strategy": None,
+    "tower_context": None,
+    "tower_mode": None
+}
+
+
+def _get_default_distribution_strategy():
+  if _defaults["distribution_strategy"] is None:
+    _defaults["distribution_strategy"] = (
+        distribute_lib._DefaultDistributionStrategy())  # pylint: disable=protected-access
+  return _defaults["distribution_strategy"]
+
+
+def _get_default_tower_context():
+  if _defaults["tower_context"] is None:
+    _defaults["tower_context"] = distribute_lib.TowerContext(
+        _get_default_distribution_strategy(), tower_id=0)
+  return _defaults["tower_context"]
+
+
+def _get_default_tower_mode():
+  if _defaults["tower_mode"] is None:
+    _defaults["tower_mode"] = _DefaultTowerThreadMode()
+  return _defaults["tower_mode"]
diff --git a/tensorflow/python/training/ftrl.py b/tensorflow/python/training/ftrl.py
index 4fa081f..832c10d 100644
--- a/tensorflow/python/training/ftrl.py
+++ b/tensorflow/python/training/ftrl.py
@@ -86,7 +86,7 @@
 
     if initial_accumulator_value < 0.0:
       raise ValueError(
-          "initial_accumulator_value %f needs to be be positive or zero" %
+          "initial_accumulator_value %f needs to be positive or zero" %
           initial_accumulator_value)
     if learning_rate_power > 0.0:
       raise ValueError("learning_rate_power %f needs to be negative or zero" %
diff --git a/tensorflow/python/training/input.py b/tensorflow/python/training/input.py
index caa2658..0d6207f 100644
--- a/tensorflow/python/training/input.py
+++ b/tensorflow/python/training/input.py
@@ -15,7 +15,8 @@
 
 """Input pipeline.
 
-Please see the @{$reading_data$reading data how-to}
+Please see the [reading data
+how-to](https://tensorflow.org/api_guides/python/reading_data)
 for context.
 """
 
diff --git a/tensorflow/python/training/monitored_session.py b/tensorflow/python/training/monitored_session.py
index 7b06bff..c077630 100644
--- a/tensorflow/python/training/monitored_session.py
+++ b/tensorflow/python/training/monitored_session.py
@@ -25,6 +25,7 @@
 import six
 
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.distribute import distribute_coordinator_context
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
@@ -284,6 +285,63 @@
         resources.initialize_resources(resources.local_resources()))
 
 
+def _create_monitored_session_with_worker_context(worker_context,  # pylint: disable=missing-docstring
+                                                  scaffold,
+                                                  checkpoint_dir=None,
+                                                  hooks=None,
+                                                  chief_only_hooks=None,
+                                                  save_checkpoint_secs=None,
+                                                  save_summaries_steps=None,
+                                                  save_summaries_secs=None,
+                                                  config=None,
+                                                  stop_grace_period_secs=120,
+                                                  log_step_count_steps=100,
+                                                  max_wait_secs=7200,
+                                                  save_checkpoint_steps=None,
+                                                  summary_dir=None):
+  all_hooks = []
+  if hooks:
+    all_hooks.extend(hooks)
+  if chief_only_hooks and worker_context.is_chief:
+    all_hooks.extend(chief_only_hooks)
+
+  summary_dir = summary_dir or checkpoint_dir
+  if summary_dir and worker_context.should_save_summary:
+    if log_step_count_steps and log_step_count_steps > 0:
+      all_hooks.append(
+          basic_session_run_hooks.StepCounterHook(
+              output_dir=summary_dir, every_n_steps=log_step_count_steps))
+
+    if (save_summaries_steps and save_summaries_steps > 0) or (
+        save_summaries_secs and save_summaries_secs > 0):
+      all_hooks.append(
+          basic_session_run_hooks.SummarySaverHook(
+              scaffold=scaffold,
+              save_steps=save_summaries_steps,
+              save_secs=save_summaries_secs,
+              output_dir=summary_dir))
+
+  if checkpoint_dir and worker_context.should_checkpoint:
+    if (save_checkpoint_secs and save_checkpoint_secs > 0) or (
+        save_checkpoint_steps and save_checkpoint_steps > 0):
+      all_hooks.append(
+          basic_session_run_hooks.CheckpointSaverHook(
+              checkpoint_dir,
+              save_steps=save_checkpoint_steps,
+              save_secs=save_checkpoint_secs,
+              scaffold=scaffold))
+
+  session_creator = worker_context.session_creator(
+      scaffold,
+      config=config,
+      checkpoint_dir=checkpoint_dir,
+      max_wait_secs=max_wait_secs)
+  return MonitoredSession(
+      session_creator=session_creator,
+      hooks=all_hooks,
+      stop_grace_period_secs=stop_grace_period_secs)
+
+
 @tf_export('train.MonitoredTrainingSession')
 def MonitoredTrainingSession(master='',  # pylint: disable=invalid-name
                              is_chief=True,
@@ -373,14 +431,35 @@
     save_checkpoint_steps = None
 
   scaffold = scaffold or Scaffold()
+  worker_context = distribute_coordinator_context.get_current_worker_context()
+
+  if worker_context:
+    return _create_monitored_session_with_worker_context(
+        worker_context,
+        scaffold,
+        checkpoint_dir=checkpoint_dir,
+        hooks=hooks,
+        chief_only_hooks=chief_only_hooks,
+        save_checkpoint_secs=save_checkpoint_secs,
+        save_summaries_steps=save_summaries_steps,
+        save_summaries_secs=save_summaries_secs,
+        config=config,
+        stop_grace_period_secs=stop_grace_period_secs,
+        log_step_count_steps=log_step_count_steps,
+        max_wait_secs=max_wait_secs,
+        save_checkpoint_steps=save_checkpoint_steps,
+        summary_dir=summary_dir)
+
   if not is_chief:
     session_creator = WorkerSessionCreator(
         scaffold=scaffold,
         master=master,
         config=config,
         max_wait_secs=max_wait_secs)
-    return MonitoredSession(session_creator=session_creator, hooks=hooks or [],
-                            stop_grace_period_secs=stop_grace_period_secs)
+    return MonitoredSession(
+        session_creator=session_creator,
+        hooks=hooks or [],
+        stop_grace_period_secs=stop_grace_period_secs)
 
   all_hooks = []
   if chief_only_hooks:
@@ -400,25 +479,29 @@
 
     if (save_summaries_steps and save_summaries_steps > 0) or (
         save_summaries_secs and save_summaries_secs > 0):
-      all_hooks.append(basic_session_run_hooks.SummarySaverHook(
-          scaffold=scaffold,
-          save_steps=save_summaries_steps,
-          save_secs=save_summaries_secs,
-          output_dir=summary_dir))
+      all_hooks.append(
+          basic_session_run_hooks.SummarySaverHook(
+              scaffold=scaffold,
+              save_steps=save_summaries_steps,
+              save_secs=save_summaries_secs,
+              output_dir=summary_dir))
 
   if checkpoint_dir:
     if (save_checkpoint_secs and save_checkpoint_secs > 0) or (
         save_checkpoint_steps and save_checkpoint_steps > 0):
-      all_hooks.append(basic_session_run_hooks.CheckpointSaverHook(
-          checkpoint_dir,
-          save_steps=save_checkpoint_steps,
-          save_secs=save_checkpoint_secs,
-          scaffold=scaffold))
+      all_hooks.append(
+          basic_session_run_hooks.CheckpointSaverHook(
+              checkpoint_dir,
+              save_steps=save_checkpoint_steps,
+              save_secs=save_checkpoint_secs,
+              scaffold=scaffold))
 
   if hooks:
     all_hooks.extend(hooks)
-  return MonitoredSession(session_creator=session_creator, hooks=all_hooks,
-                          stop_grace_period_secs=stop_grace_period_secs)
+  return MonitoredSession(
+      session_creator=session_creator,
+      hooks=all_hooks,
+      stop_grace_period_secs=stop_grace_period_secs)
 
 
 @tf_export('train.SessionCreator')
@@ -546,6 +629,11 @@
     self._hooks = hooks or []
     for h in self._hooks:
       h.begin()
+
+    worker_context = distribute_coordinator_context.get_current_worker_context()
+    if not session_creator and worker_context:
+      session_creator = worker_context.session_creator()
+
     # Create the session.
     self._coordinated_creator = self._CoordinatedSessionCreator(
         session_creator=session_creator or ChiefSessionCreator(),
diff --git a/tensorflow/python/training/monitored_session_test.py b/tensorflow/python/training/monitored_session_test.py
index 92533ca..ff586b6 100644
--- a/tensorflow/python/training/monitored_session_test.py
+++ b/tensorflow/python/training/monitored_session_test.py
@@ -32,6 +32,7 @@
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import debug_pb2
 from tensorflow.python.client import session as session_lib
+from tensorflow.python.distribute import distribute_coordinator
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
@@ -381,6 +382,119 @@
         self.assertEqual(0, session.run(gstep))
 
 
+class MockStrategy(object):
+
+  def __init__(self,
+               between_graph=False,
+               should_init=True,
+               should_checkpoint=None,
+               should_save_summary=None):
+    self._between_graph = between_graph
+    self._should_init = should_init
+    self._should_checkpoint = should_checkpoint
+    self._should_save_summary = should_save_summary
+
+  @property
+  def between_graph(self):
+    return self._between_graph
+
+  @property
+  def should_init(self):
+    return self._should_init
+
+  @property
+  def should_checkpoint(self):
+    return self._should_checkpoint
+
+  @property
+  def should_save_summary(self):
+    return self._should_save_summary
+
+
+class MonitoredTrainingSessionWithDistributeCoordinatorTest(test.TestCase):
+  """Test distribute coordinator controls summary saving and checkpointing."""
+
+  def test_summary_hook_enabled(self):
+    context = distribute_coordinator._WorkerContext(
+        MockStrategy(should_save_summary=True), None, None, None)
+
+    logdir = _test_dir(self.get_temp_dir(), 'test_summaries_enabled')
+    with ops.Graph().as_default():
+      gstep = variables_lib.get_or_create_global_step()
+      new_gstep = state_ops.assign_add(gstep, 1)
+      summary.scalar('my_summary_tag', new_gstep * 2)
+      with context, monitored_session.MonitoredTrainingSession(
+          checkpoint_dir=logdir,
+          save_summaries_steps=100,
+          log_step_count_steps=10) as session:
+        for _ in range(101):
+          session.run(new_gstep)
+
+    summaries = util_test.latest_summaries(logdir)
+    tags = [s.summary.value[0].tag for s in summaries]
+    self.assertIn('my_summary_tag', tags)
+    self.assertIn('global_step/sec', tags)
+
+  def test_summary_hook_disabled(self):
+    context = distribute_coordinator._WorkerContext(
+        MockStrategy(should_save_summary=False), None, None, None)
+
+    logdir = _test_dir(self.get_temp_dir(), 'test_summaries_disabled')
+    with ops.Graph().as_default():
+      gstep = variables_lib.get_or_create_global_step()
+      new_gstep = state_ops.assign_add(gstep, 1)
+      summary.scalar('my_summary_tag', new_gstep * 2)
+      with context, monitored_session.MonitoredTrainingSession(
+          checkpoint_dir=logdir,
+          save_summaries_steps=100,
+          log_step_count_steps=10) as session:
+        for _ in range(101):
+          session.run(new_gstep)
+
+    # No summary is saved.
+    summaries = util_test.latest_summaries(logdir)
+    self.assertEqual(len(summaries), 0)
+
+  def test_checkpoint_hook_enabled(self):
+    context = distribute_coordinator._WorkerContext(
+        MockStrategy(should_checkpoint=True), None, None, None)
+
+    logdir = _test_dir(self.get_temp_dir(), 'test_save_checkpoint_enabled')
+    with ops.Graph().as_default():
+      gstep = variables_lib.get_or_create_global_step()
+      new_gstep = state_ops.assign_add(gstep, 1)
+      with context, monitored_session.MonitoredTrainingSession(
+          checkpoint_dir=logdir,
+          save_checkpoint_steps=100,
+          log_step_count_steps=10) as session:
+        for _ in range(100):
+          session.run(new_gstep)
+
+      # A restart will find the checkpoint and recover automatically.
+      with monitored_session.MonitoredTrainingSession(
+          is_chief=True, checkpoint_dir=logdir) as session:
+        self.assertEqual(100, session.run(gstep))
+
+  def test_checkpoint_hook_disabled(self):
+    context = distribute_coordinator._WorkerContext(
+        MockStrategy(should_checkpoint=False), None, None, None)
+
+    logdir = _test_dir(self.get_temp_dir(), 'test_save_checkpoint_disabled')
+    with ops.Graph().as_default():
+      gstep = variables_lib.get_or_create_global_step()
+      new_gstep = state_ops.assign_add(gstep, 1)
+      with context, monitored_session.MonitoredTrainingSession(
+          checkpoint_dir=logdir,
+          save_checkpoint_steps=100,
+          log_step_count_steps=10) as session:
+        for _ in range(100):
+          session.run(new_gstep)
+
+    # No checkpoint is saved.
+    checkpoint = checkpoint_management.latest_checkpoint(logdir)
+    self.assertIsNone(checkpoint)
+
+
 class StopAtNSession(monitored_session._WrappedSession):
   """A wrapped session that stops at the N-th call to _check_stop."""
 
@@ -1365,8 +1479,8 @@
       with monitored_session.MonitoredSession(
           session_creator=monitored_session.ChiefSessionCreator(
               scaffold,
-              checkpoint_filename_with_path=
-              checkpoint_management.latest_checkpoint(logdir))) as session:
+              checkpoint_filename_with_path=checkpoint_management.
+              latest_checkpoint(logdir))) as session:
         self.assertEqual(2, session.run(gstep))
 
   def test_retry_initialization_on_aborted_error(self):
diff --git a/tensorflow/python/training/moving_averages.py b/tensorflow/python/training/moving_averages.py
index 60cc54c..4b91d1e 100644
--- a/tensorflow/python/training/moving_averages.py
+++ b/tensorflow/python/training/moving_averages.py
@@ -300,7 +300,7 @@
      for a given variable.
   *  Build a model normally but load the checkpoint files to evaluate by using
      the shadow variable names.  For this use the `average_name()` method.  See
-     the @{tf.train.Saver} for more
+     the `tf.train.Saver` for more
      information on restoring saved variables.
 
   Example of restoring the shadow variable values:
diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py
index f75db08..1b6bce2 100644
--- a/tensorflow/python/training/optimizer.py
+++ b/tensorflow/python/training/optimizer.py
@@ -35,6 +35,7 @@
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.training import slot_creator
 from tensorflow.python.training.checkpointable import base as checkpointable
 from tensorflow.python.util import nest
@@ -51,8 +52,8 @@
   # those variables are accessed in another thread during the gradient
   # computation. To get a consistent set of variables, we filter out
   # those with `None` gradients.
-  def filtered_grad_fn(x=None):
-    return [(g, v) for g, v in grad_fn(x) if g is not None]
+  def filtered_grad_fn(*args, **kwargs):
+    return [(g, v) for g, v in grad_fn(*args, **kwargs) if g is not None]
 
   return filtered_grad_fn
 
@@ -464,7 +465,8 @@
         # TODO(josh11b): Test that we handle weight decay in a reasonable way.
         if (distribute_lib.get_loss_reduction() ==
             variable_scope.VariableAggregation.MEAN):
-          num_towers = distribute_lib.get_distribution_strategy().num_towers
+          num_towers = distribution_strategy_context.get_distribution_strategy(
+          ).num_towers
           if num_towers > 1:
             loss_value *= (1. / num_towers)
 
@@ -482,7 +484,8 @@
     # Scale loss if using a "mean" loss reduction and multiple towers.
     if (distribute_lib.get_loss_reduction() ==
         variable_scope.VariableAggregation.MEAN):
-      num_towers = distribute_lib.get_distribution_strategy().num_towers
+      num_towers = distribution_strategy_context.get_distribution_strategy(
+      ).num_towers
       if num_towers > 1:
         loss *= (1. / num_towers)
 
@@ -548,15 +551,15 @@
     # methods: _create_slots(), _prepare(), _apply_dense(), and _apply_sparse().
 
     # Handle DistributionStrategy case.
-    if distribute_lib.get_cross_tower_context():
+    if distribution_strategy_context.get_cross_tower_context():
       raise RuntimeError("Use `_distributed_apply()` instead of "
                          "`apply_gradients()` in a cross-tower context.")
     # TODO(isaprykin): Get rid of `has_distribution_strategy()` check by
     # always calling _distributed_apply(), using the default distribution
     # as needed.
-    if distribute_lib.has_distribution_strategy():
-      grads_and_vars = get_filtered_grad_fn(lambda _: grads_and_vars)()
-      return distribute_lib.get_tower_context().merge_call(
+    if distribution_strategy_context.has_distribution_strategy():
+      grads_and_vars = get_filtered_grad_fn(lambda: grads_and_vars)()
+      return distribution_strategy_context.get_tower_context().merge_call(
           self._distributed_apply, grads_and_vars, global_step, name)
 
     # No DistributionStrategy case.
@@ -799,7 +802,8 @@
     v = self._non_slot_dict.get(key, None)
     if v is None:
       self._maybe_initialize_checkpointable()
-      distribution_strategy = distribute_lib.get_distribution_strategy()
+      distribution_strategy = (
+          distribution_strategy_context.get_distribution_strategy())
       with distribution_strategy.colocate_vars_with(colocate_with):
         if eager:
           restored_initial_value = self._preload_simple_restoration(
diff --git a/tensorflow/python/training/quantize_training.i b/tensorflow/python/training/quantize_training.i
index 54d6789..41e62e0 100644
--- a/tensorflow/python/training/quantize_training.i
+++ b/tensorflow/python/training/quantize_training.i
@@ -56,7 +56,7 @@
 
 %insert("python") %{
 def do_quantize_training_on_graphdef(input_graph, num_bits):
-  """A general quantization scheme is being developed in @{tf.contrib.quantize}.
+  """A general quantization scheme is being developed in `tf.contrib.quantize`.
 
   Consider using that instead, though since it is in the tf.contrib namespace,
   it is not subject to backward compatibility guarantees.
diff --git a/tensorflow/python/training/saver.py b/tensorflow/python/training/saver.py
index 213c11c..e35ea81 100644
--- a/tensorflow/python/training/saver.py
+++ b/tensorflow/python/training/saver.py
@@ -869,7 +869,7 @@
 class Saver(object):
   """Saves and restores variables.
 
-  See @{$variables$Variables}
+  See [Variables](https://tensorflow.org/guide/variables)
   for an overview of variables, saving and restoring.
 
   The `Saver` class adds ops to save and restore variables to and from
@@ -1529,9 +1529,7 @@
       # 1. The checkpoint would not be loaded successfully as is. Try to parse
       # it as an object-based checkpoint.
       try:
-        reader = pywrap_tensorflow.NewCheckpointReader(save_path)
-        object_graph_string = reader.get_tensor(
-            checkpointable.OBJECT_GRAPH_PROTO_KEY)
+        names_to_keys = object_graph_key_mapping(save_path)
       except errors.NotFoundError:
         # 2. This is not an object-based checkpoint, which likely means there
         # is a graph mismatch. Re-raise the original error with
@@ -1546,42 +1544,19 @@
           "may be somewhat fragile, and will re-build the Saver. Instead, "
           "consider loading object-based checkpoints using "
           "tf.train.Checkpoint().")
-      self._restore_from_object_based_checkpoint(
-          sess=sess, save_path=save_path,
-          object_graph_string=object_graph_string)
+      self._object_restore_saver = saver_from_object_based_checkpoint(
+          checkpoint_path=save_path,
+          var_list=self._var_list,
+          builder=self._builder,
+          names_to_keys=names_to_keys,
+          cached_saver=self._object_restore_saver)
+      self._object_restore_saver.restore(sess=sess, save_path=save_path)
     except errors.InvalidArgumentError as err:
       # There is a mismatch between the graph and the checkpoint being loaded.
       # We add a more reasonable error message here to help users (b/110263146)
       raise _wrap_restore_error_with_msg(
           err, "a mismatch between the current graph and the graph")
 
-  def _restore_from_object_based_checkpoint(self, sess, save_path,
-                                            object_graph_string):
-    """A compatibility mode for reading object-based checkpoints."""
-    object_graph_proto = (
-        checkpointable_object_graph_pb2.CheckpointableObjectGraph())
-    object_graph_proto.ParseFromString(object_graph_string)
-    names_to_keys = {}
-    for node in object_graph_proto.nodes:
-      for attribute in node.attributes:
-        names_to_keys[attribute.full_name] = attribute.checkpoint_key
-    saveables = self._builder._ValidateAndSliceInputs(self._var_list)  # pylint: disable=protected-access
-    for saveable in saveables:
-      for spec in saveable.specs:
-        if spec.name not in names_to_keys:
-          raise errors.NotFoundError(
-              None, None,
-              message=("Attempting to load an object-based checkpoint using "
-                       "variable names, but could not find %s in the "
-                       "checkpoint.") % spec.name)
-        spec.name = names_to_keys[spec.name]
-    if self._object_restore_saver is None:
-      # Cache the Saver so multiple restore() calls don't pollute the graph when
-      # graph building. This assumes keys are consistent (i.e. this is the same
-      # type of object-based checkpoint we saw previously).
-      self._object_restore_saver = Saver(saveables)
-    self._object_restore_saver.restore(sess=sess, save_path=save_path)
-
   @staticmethod
   def _add_collection_def(meta_graph_def, key, export_scope=None):
     """Adds a collection to MetaGraphDef protocol buffer.
@@ -1815,3 +1790,92 @@
     proto_type=saver_pb2.SaverDef,
     to_proto=Saver.to_proto,
     from_proto=Saver.from_proto)
+
+
+def object_graph_key_mapping(checkpoint_path):
+  """Return name to key mappings from the checkpoint.
+
+  Args:
+    checkpoint_path: string, path to object-based checkpoint
+
+  Returns:
+    Dictionary mapping tensor names to checkpoint keys.
+  """
+  reader = pywrap_tensorflow.NewCheckpointReader(checkpoint_path)
+  object_graph_string = reader.get_tensor(
+      checkpointable.OBJECT_GRAPH_PROTO_KEY)
+  object_graph_proto = (
+      checkpointable_object_graph_pb2.CheckpointableObjectGraph())
+  object_graph_proto.ParseFromString(object_graph_string)
+  names_to_keys = {}
+  for node in object_graph_proto.nodes:
+    for attribute in node.attributes:
+      names_to_keys[attribute.full_name] = attribute.checkpoint_key
+  return names_to_keys
+
+
+def saver_from_object_based_checkpoint(
+    checkpoint_path, var_list=None, builder=None, names_to_keys=None,
+    cached_saver=None):
+  """Return a `Saver` which reads from an object-based checkpoint.
+
+  This function validates that all variables in the variables list are remapped
+  in the object-based checkpoint (or `names_to_keys` dict if provided). A
+  saver will be created with the list of remapped variables.
+
+  The `cached_saver` argument allows the user to pass in a previously created
+  saver, so multiple `saver.restore()` calls don't pollute the graph when graph
+  building. This assumes that keys are consistent, meaning that the
+    1) `checkpoint_path` checkpoint, and
+    2) checkpoint used to create the `cached_saver`
+  are the same type of object-based checkpoint. If this argument is set, this
+  function will simply validate that all variables have been remapped by the
+  checkpoint at `checkpoint_path`.
+
+  Note that in general, `tf.train.Checkpoint` should be used to restore/save an
+  object-based checkpoint.
+
+  Args:
+    checkpoint_path: string, path to object-based checkpoint
+    var_list: list of `Variables` that appear in the checkpoint. If `None`,
+      `var_list` will be set to all saveable objects.
+    builder: a `BaseSaverBuilder` instance. If `None`, a new `BulkSaverBuilder`
+      will be created.
+    names_to_keys: dict mapping string tensor names to checkpoint keys. If
+      `None`, this dict will be generated from the checkpoint file.
+    cached_saver: Cached `Saver` object with remapped variables.
+
+  Returns:
+    `Saver` with remapped variables for reading from an object-based checkpoint.
+
+  Raises:
+    ValueError: If the checkpoint provided is not an object-based checkpoint.
+    NotFoundError: If one of the variables in `var_list` cannot be found in the
+      checkpoint. This could mean the checkpoint or `names_to_keys` mapping is
+      missing the variable.
+  """
+  if names_to_keys is None:
+    try:
+      names_to_keys = object_graph_key_mapping(checkpoint_path)
+    except errors.NotFoundError:
+      raise ValueError("Checkpoint in %s not an object-based checkpoint."
+                       % checkpoint_path)
+  if var_list is None:
+    var_list = variables._all_saveable_objects()  # pylint: disable=protected-access
+  if builder is None:
+    builder = BulkSaverBuilder()
+
+  saveables = builder._ValidateAndSliceInputs(var_list)  # pylint: disable=protected-access
+  for saveable in saveables:
+    for spec in saveable.specs:
+      if spec.name not in names_to_keys:
+        raise errors.NotFoundError(
+            None, None,
+            message=("Attempting to load an object-based checkpoint using "
+                     "variable names, but could not find %s in the "
+                     "checkpoint.") % spec.name)
+      spec.name = names_to_keys[spec.name]
+
+  if cached_saver is None:
+    return Saver(saveables)
+  return cached_saver
diff --git a/tensorflow/python/training/saver_test.py b/tensorflow/python/training/saver_test.py
index 941aafc..b55e641 100644
--- a/tensorflow/python/training/saver_test.py
+++ b/tensorflow/python/training/saver_test.py
@@ -784,6 +784,32 @@
       self.assertEqual(20.0, v1.eval())
       save.save(sess, save_path)
 
+  def testSaveRestoreAndValidateVariableDtype(self):
+    for variable_op in [
+        variables.Variable, resource_variable_ops.ResourceVariable
+    ]:
+      save_path = os.path.join(self.get_temp_dir(), "basic_save_restore")
+
+      # Build the first session.
+      with self.test_session(graph=ops_lib.Graph()) as sess:
+        v0 = variable_op(10.0, name="v0", dtype=dtypes.float32)
+
+        if not context.executing_eagerly():
+          self.evaluate([variables.global_variables_initializer()])
+
+        save = saver_module.Saver({"v0": v0})
+        save.save(sess, save_path)
+
+      # Start a second session.
+      with self.test_session(graph=ops_lib.Graph()) as sess:
+        v0_wrong_dtype = variable_op(1, name="v0", dtype=dtypes.int32)
+        # Restore the saved value with different dtype
+        # in the parameter nodes.
+        save = saver_module.Saver({"v0": v0_wrong_dtype})
+        with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                     "original dtype"):
+          save.restore(sess, save_path)
+
   # Test restoring large tensors (triggers a thread pool)
   def testRestoreLargeTensors(self):
     save_dir = self.get_temp_dir()
diff --git a/tensorflow/python/training/server_lib.py b/tensorflow/python/training/server_lib.py
index 58cf5277..4654341 100644
--- a/tensorflow/python/training/server_lib.py
+++ b/tensorflow/python/training/server_lib.py
@@ -98,9 +98,9 @@
   """An in-process TensorFlow server, for use in distributed training.
 
   A `tf.train.Server` instance encapsulates a set of devices and a
-  @{tf.Session} target that
+  `tf.Session` target that
   can participate in distributed training. A server belongs to a
-  cluster (specified by a @{tf.train.ClusterSpec}), and
+  cluster (specified by a `tf.train.ClusterSpec`), and
   corresponds to a particular task in a named job. The server can
   communicate with any other server in the same cluster.
   """
@@ -186,7 +186,7 @@
     """Returns the target for a `tf.Session` to connect to this server.
 
     To create a
-    @{tf.Session} that
+    `tf.Session` that
     connects to this server, use the following snippet:
 
     ```python
@@ -230,7 +230,7 @@
 
   A `tf.train.ClusterSpec` represents the set of processes that
   participate in a distributed TensorFlow computation. Every
-  @{tf.train.Server} is constructed in a particular cluster.
+  `tf.train.Server` is constructed in a particular cluster.
 
   To create a cluster with two jobs and five tasks, you specify the
   mapping from job names to lists of network addresses (typically
@@ -421,7 +421,7 @@
     NOTE: For backwards compatibility, this method returns a list. If
     the given job was defined with a sparse set of task indices, the
     length of this list may not reflect the number of tasks defined in
-    this job. Use the @{tf.train.ClusterSpec.num_tasks} method
+    this job. Use the `tf.train.ClusterSpec.num_tasks` method
     to find the number of tasks defined in a particular job.
 
     Args:
diff --git a/tensorflow/python/training/slot_creator.py b/tensorflow/python/training/slot_creator.py
index 258a6f0..d76b22a 100644
--- a/tensorflow/python/training/slot_creator.py
+++ b/tensorflow/python/training/slot_creator.py
@@ -45,7 +45,7 @@
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
-from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.training import distribution_strategy_context
 
 
 def _create_slot_var(primary, val, scope, validate_shape, shape, dtype):
@@ -112,7 +112,8 @@
     prefix = primary.op.name
   with variable_scope.variable_scope(None, prefix + "/" + name):
     if colocate_with_primary:
-      distribution_strategy = distribute_lib.get_distribution_strategy()
+      distribution_strategy = (
+          distribution_strategy_context.get_distribution_strategy())
       with distribution_strategy.colocate_vars_with(primary):
         return _create_slot_var(primary, val, "", validate_shape, None, None)
     else:
@@ -149,7 +150,8 @@
     prefix = primary.op.name
   with variable_scope.variable_scope(None, prefix + "/" + name):
     if colocate_with_primary:
-      distribution_strategy = distribute_lib.get_distribution_strategy()
+      distribution_strategy = (
+          distribution_strategy_context.get_distribution_strategy())
       with distribution_strategy.colocate_vars_with(primary):
         return _create_slot_var(primary, initializer, "", validate_shape, shape,
                                 dtype)
diff --git a/tensorflow/python/training/supervisor.py b/tensorflow/python/training/supervisor.py
index 372ea41..0755364 100644
--- a/tensorflow/python/training/supervisor.py
+++ b/tensorflow/python/training/supervisor.py
@@ -45,7 +45,7 @@
   """A training helper that checkpoints models and computes summaries.
 
   This class is deprecated. Please use
-  @{tf.train.MonitoredTrainingSession} instead.
+  `tf.train.MonitoredTrainingSession` instead.
 
   The Supervisor is a small wrapper around a `Coordinator`, a `Saver`,
   and a `SessionManager` that takes care of common needs of TensorFlow
@@ -134,7 +134,7 @@
 
   * Specifying `'local'` requests a session that uses the RPC-based
     "Master interface" to run TensorFlow programs. See
-    @{tf.train.Server.create_local_server} for
+    `tf.train.Server.create_local_server` for
     details.
 
   * Specifying `'grpc://hostname:port'` requests a session that uses
diff --git a/tensorflow/python/training/training.py b/tensorflow/python/training/training.py
index 544010a..686c4be 100644
--- a/tensorflow/python/training/training.py
+++ b/tensorflow/python/training/training.py
@@ -15,7 +15,7 @@
 
 """Support for training models.
 
-See the @{$python/train} guide.
+See the [Training](https://tensorflow.org/api_guides/python/train) guide.
 """
 
 # Optimizers.
@@ -53,6 +53,7 @@
 from tensorflow.python.training.input import *  # pylint: disable=redefined-builtin
 # pylint: enable=wildcard-import
 
+from tensorflow.python.training.basic_session_run_hooks import get_or_create_steps_per_run_variable
 from tensorflow.python.training.basic_session_run_hooks import SecondOrStepTimer
 from tensorflow.python.training.basic_session_run_hooks import LoggingTensorHook
 from tensorflow.python.training.basic_session_run_hooks import StopAtStepHook
diff --git a/tensorflow/python/training/warm_starting_util.py b/tensorflow/python/training/warm_starting_util.py
index b1a7cfa..0ba7ba9 100644
--- a/tensorflow/python/training/warm_starting_util.py
+++ b/tensorflow/python/training/warm_starting_util.py
@@ -44,7 +44,7 @@
     ])):
   """Vocabulary information for warm-starting.
 
-  See @{tf.estimator.WarmStartSettings$WarmStartSettings} for examples of using
+  See `tf.estimator.WarmStartSettings` for examples of using
   VocabInfo to warm-start.
 
   Attributes:
diff --git a/tensorflow/python/util/deprecation.py b/tensorflow/python/util/deprecation.py
index 74e1fb2..c43589f 100644
--- a/tensorflow/python/util/deprecation.py
+++ b/tensorflow/python/util/deprecation.py
@@ -393,8 +393,8 @@
     Returns:
       Dictionary from arg_name to DeprecatedArgSpec.
     """
-    arg_name_to_pos = dict(
-        (name, pos) for (pos, name) in enumerate(arg_spec.args))
+    arg_name_to_pos = {
+        name: pos for pos, name in enumerate(arg_spec.args)}
     deprecated_positional_args = {}
     for arg_name, spec in iter(names_to_ok_vals.items()):
       if arg_name in arg_name_to_pos:
diff --git a/tensorflow/python/util/serialization_test.py b/tensorflow/python/util/serialization_test.py
index 9d9cac2..6df7533 100644
--- a/tensorflow/python/util/serialization_test.py
+++ b/tensorflow/python/util/serialization_test.py
@@ -55,11 +55,8 @@
     model(constant_op.constant([[1.]]))
     sequential_round_trip = json.loads(
         json.dumps(model, default=serialization.get_json_type))
-    self.assertEqual(5, sequential_round_trip["config"][1]["config"]["units"])
-    input_round_trip = json.loads(
-        json.dumps(model._input_layers, default=serialization.get_json_type))
-    self.assertAllEqual([1, 1],
-                        input_round_trip[0]["config"]["batch_input_shape"])
+    self.assertEqual(
+        5, sequential_round_trip["config"]["layers"][1]["config"]["units"])
 
   @test_util.run_in_graph_and_eager_modes
   def test_serialize_model(self):
diff --git a/tensorflow/python/util/tf_should_use.py b/tensorflow/python/util/tf_should_use.py
index 28e49af..ca6710b 100644
--- a/tensorflow/python/util/tf_should_use.py
+++ b/tensorflow/python/util/tf_should_use.py
@@ -17,23 +17,124 @@
 from __future__ import division
 from __future__ import print_function
 
-import functools
-import types
+import copy
+import sys
+import traceback
 
 import six  # pylint: disable=unused-import
 
-from tensorflow.python.eager import context
+from tensorflow.python.platform import tf_logging
 from tensorflow.python.util import tf_decorator
 # pylint: enable=g-bad-import-order,g-import-not-at-top
 
 
-# TODO(b/65412899): Re-implement to avoid leaking python objects.
-# This function / class remains since the API is public (mark_used()).
+class _TFShouldUseHelper(object):
+  """Object stored in TFShouldUse-wrapped objects.
+
+  When it is deleted, it will emit a warning or error if its `sate` method
+  has not been called by the time of deletion.
+  """
+
+  def __init__(self, type_, repr_, stack_frame, fatal_error_if_unsated):
+    self._type = type_
+    self._repr = repr_
+    self._stack_frame = stack_frame
+    self._fatal_error_if_unsated = fatal_error_if_unsated
+    self._sated = False
+
+  def sate(self):
+    self._sated = True
+    self._type = None
+    self._repr = None
+    self._stack_frame = None
+    self._logging_module = None
+
+  def __del__(self):
+    if self._sated:
+      return
+    if self._fatal_error_if_unsated:
+      logger = tf_logging.fatal
+    else:
+      logger = tf_logging.error
+    creation_stack = ''.join(
+        [line.rstrip() for line in traceback.format_stack(self._stack_frame)])
+    logger(
+        '==================================\n'
+        'Object was never used (type %s):\n%s\nIf you want to mark it as '
+        'used call its "mark_used()" method.\nIt was originally created '
+        'here:\n%s\n'
+        '==================================' %
+        (self._type, self._repr, creation_stack))
+
+
+def _new__init__(self, true_value, tf_should_use_helper):
+  # pylint: disable=protected-access
+  self._tf_should_use_helper = tf_should_use_helper
+  self._true_value = true_value
+
+
+def _new__setattr__(self, key, value):
+  if key in ('_tf_should_use_helper', '_true_value'):
+    return object.__setattr__(self, key, value)
+  return setattr(
+      object.__getattribute__(self, '_true_value'),
+      key, value)
+
+
+def _new__getattribute__(self, key):
+  if key not in ('_tf_should_use_helper', '_true_value'):
+    object.__getattribute__(self, '_tf_should_use_helper').sate()
+  if key in ('_tf_should_use_helper', 'mark_used', '__setattr__'):
+    return object.__getattribute__(self, key)
+  return getattr(object.__getattribute__(self, '_true_value'), key)
+
+
+def _new_mark_used(self, *args, **kwargs):
+  object.__getattribute__(self, '_tf_should_use_helper').sate()
+  try:
+    mu = object.__getattribute__(
+        object.__getattribute__(self, '_true_value'),
+        'mark_used')
+    return mu(*args, **kwargs)
+  except AttributeError:
+    pass
+
+
+_WRAPPERS = dict()
+
+
+def _get_wrapper(x, tf_should_use_helper):
+  """Create a wrapper for object x, whose class subclasses type(x).
+
+  The wrapper will emit a warning if it is deleted without any of its
+  properties being accessed or methods being called.
+
+  Args:
+    x: The instance to wrap.
+    tf_should_use_helper: The object that tracks usage.
+
+  Returns:
+    An object wrapping `x`, of type `type(x)`.
+  """
+  type_x = type(x)
+  memoized = _WRAPPERS.get(type_x, None)
+  if memoized:
+    return memoized(x, tf_should_use_helper)
+
+  tx = copy.deepcopy(type_x)
+  copy_tx = type(tx.__name__, tx.__bases__, dict(tx.__dict__))
+  copy_tx.__init__ = _new__init__
+  copy_tx.__getattribute__ = _new__getattribute__
+  copy_tx.mark_used = _new_mark_used
+  copy_tx.__setattr__ = _new__setattr__
+  _WRAPPERS[type_x] = copy_tx
+
+  return copy_tx(x, tf_should_use_helper)
+
+
 def _add_should_use_warning(x, fatal_error=False):
   """Wraps object x so that if it is never used, a warning is logged.
 
-  Does nothing when executing eagerly.
-
   Args:
     x: Python object.
     fatal_error: Python bool.  If `True`, tf.logging.fatal is raised
@@ -43,50 +144,22 @@
     An instance of `TFShouldUseWarningWrapper` which subclasses `type(x)`
     and is a very shallow wrapper for `x` which logs access into `x`.
   """
-  del fatal_error
   if x is None or x == []:  # pylint: disable=g-explicit-bool-comparison
     return x
 
-  if context.executing_eagerly():
-    # Typically not needed when executing eagerly (the main use case is for ops
-    # which need to be incorporated into the graph), and even the no-op wrapper
-    # creates reference cycles which require garbage collection.
-    return x
+  # Extract the current frame for later use by traceback printing.
+  try:
+    raise ValueError()
+  except ValueError:
+    stack_frame = sys.exc_info()[2].tb_frame.f_back
 
-  def override_method(method):
-    def fn(self, *args, **kwargs):
-      return method(self, *args, **kwargs)
-    return fn
+  tf_should_use_helper = _TFShouldUseHelper(
+      type_=type(x),
+      repr_=repr(x),
+      stack_frame=stack_frame,
+      fatal_error_if_unsated=fatal_error)
 
-  class TFShouldUseWarningWrapper(type(x)):
-    """Wrapper for objects that keeps track of their use."""
-
-    def __init__(self, true_self):
-      self.__dict__ = true_self.__dict__
-
-    # Not sure why this pylint warning is being used; this is not an
-    # old class form.
-    # pylint: disable=super-on-old-class
-    def __getattribute__(self, name):
-      return super(TFShouldUseWarningWrapper, self).__getattribute__(name)
-
-    def mark_used(self, *args, **kwargs):
-      return
-
-    # pylint: enable=super-on-old-class
-
-  for name in dir(TFShouldUseWarningWrapper):
-    method = getattr(TFShouldUseWarningWrapper, name)
-    if not isinstance(method, types.FunctionType):
-      continue
-    if name in ('__init__', '__getattribute__', '__del__', 'mark_used'):
-      continue
-    setattr(TFShouldUseWarningWrapper, name,
-            functools.wraps(method)(override_method(method)))
-
-  wrapped = TFShouldUseWarningWrapper(x)
-  wrapped.__doc__ = x.__doc__  # functools.wraps fails on some objects.
-  return wrapped
+  return _get_wrapper(x, tf_should_use_helper)
 
 
 def should_use_result(fn):
@@ -106,8 +179,6 @@
   - `t != 0`.  In this case, comparison is done on types / ids.
   - `isinstance(t, tf.Tensor)`.  Similar to above.
 
-  Does nothing when executing eagerly.
-
   Args:
     fn: The function to wrap.
 
@@ -142,8 +213,6 @@
   - `t != 0`.  In this case, comparison is done on types / ids.
   - `isinstance(t, tf.Tensor)`.  Similar to above.
 
-  Does nothing when executing eagerly.
-
   Args:
     fn: The function to wrap.
 
diff --git a/tensorflow/python/util/tf_should_use_test.py b/tensorflow/python/util/tf_should_use_test.py
index 4c6e48b..16fa1f5 100644
--- a/tensorflow/python/util/tf_should_use_test.py
+++ b/tensorflow/python/util/tf_should_use_test.py
@@ -30,48 +30,51 @@
 
 
 @contextlib.contextmanager
-def reroute_error(captured):
+def reroute_error():
   """Temporarily reroute errors written to tf_logging.error into `captured`."""
-  del captured[:]
-  true_logger = tf_logging.error
-  def capture_errors(*args, **unused_kwargs):
-    captured.extend(args)
-  tf_logging.error = capture_errors
-  try:
-    yield
-  finally:
-    tf_logging.error = true_logger
+  with test.mock.patch.object(tf_should_use.tf_logging, 'error') as error:
+    with test.mock.patch.object(tf_should_use.tf_logging, 'fatal') as fatal:
+      yield error, fatal
 
 
 class TfShouldUseTest(test.TestCase):
 
   def testAddShouldUseWarningWhenNotUsed(self):
-    self.skipTest('b/65412899')
     c = constant_op.constant(0, name='blah0')
-    captured = []
-    with reroute_error(captured):
-      def in_this_function():
-        h = tf_should_use._add_should_use_warning(c)
-        del h
+    def in_this_function():
+      h = tf_should_use._add_should_use_warning(c)
+      del h
+    with reroute_error() as (error, _):
       in_this_function()
-    self.assertIn('Object was never used', '\n'.join(captured))
-    self.assertIn('blah0:0', '\n'.join(captured))
-    self.assertIn('in_this_function', '\n'.join(captured))
-    gc.collect()
+    msg = '\n'.join(error.call_args[0])
+    self.assertIn('Object was never used', msg)
+    self.assertIn('blah0:0', msg)
+    self.assertIn('in_this_function', msg)
+    self.assertFalse(gc.garbage)
+
+  def testAddShouldUseFatalWhenNotUsed(self):
+    c = constant_op.constant(0, name='blah0')
+    def in_this_function():
+      h = tf_should_use._add_should_use_warning(c, fatal_error=True)
+      del h
+    with reroute_error() as (_, fatal):
+      in_this_function()
+    msg = '\n'.join(fatal.call_args[0])
+    self.assertIn('Object was never used', msg)
+    self.assertIn('blah0:0', msg)
+    self.assertIn('in_this_function', msg)
     self.assertFalse(gc.garbage)
 
   def _testAddShouldUseWarningWhenUsed(self, fn, name):
     c = constant_op.constant(0, name=name)
-    captured = []
-    with reroute_error(captured):
+    with reroute_error() as (error, fatal):
       h = tf_should_use._add_should_use_warning(c)
       fn(h)
       del h
-    self.assertNotIn('Object was never used', '\n'.join(captured))
-    self.assertNotIn('%s:0' % name, '\n'.join(captured))
+    error.assert_not_called()
+    fatal.assert_not_called()
 
   def testAddShouldUseWarningWhenUsedWithAdd(self):
-    self.skipTest('b/65412899')
     def add(h):
       _ = h + 1
     self._testAddShouldUseWarningWhenUsed(add, name='blah_add')
@@ -79,7 +82,6 @@
     self.assertFalse(gc.garbage)
 
   def testAddShouldUseWarningWhenUsedWithGetName(self):
-    self.skipTest('b/65412899')
     def get_name(h):
       _ = h.name
     self._testAddShouldUseWarningWhenUsed(get_name, name='blah_get_name')
@@ -87,35 +89,33 @@
     self.assertFalse(gc.garbage)
 
   def testShouldUseResult(self):
-    self.skipTest('b/65412899')
     @tf_should_use.should_use_result
     def return_const(value):
       return constant_op.constant(value, name='blah2')
-    captured = []
-    with reroute_error(captured):
+    with reroute_error() as (error, _):
       return_const(0.0)
-    self.assertIn('Object was never used', '\n'.join(captured))
-    self.assertIn('blah2:0', '\n'.join(captured))
-    self.assertIn('return_const', '\n'.join(captured))
+    msg = '\n'.join(error.call_args[0])
+    self.assertIn('Object was never used', msg)
+    self.assertIn('blah2:0', msg)
+    self.assertIn('return_const', msg)
     gc.collect()
     self.assertFalse(gc.garbage)
 
   def testShouldUseResultWhenNotReallyUsed(self):
-    self.skipTest('b/65412899')
     @tf_should_use.should_use_result
     def return_const(value):
       return constant_op.constant(value, name='blah3')
-    captured = []
-    with reroute_error(captured):
+    with reroute_error() as (error, _):
       with self.test_session():
         return_const(0.0)
         # Creating another op and executing it does not mark the
         # unused op as being "used".
         v = constant_op.constant(1.0, name='meh')
         v.eval()
-    self.assertIn('Object was never used', '\n'.join(captured))
-    self.assertIn('blah3:0', '\n'.join(captured))
-    self.assertIn('return_const', '\n'.join(captured))
+    msg = '\n'.join(error.call_args[0])
+    self.assertIn('Object was never used', msg)
+    self.assertIn('blah3:0', msg)
+    self.assertIn('return_const', msg)
     gc.collect()
     self.assertFalse(gc.garbage)
 
diff --git a/tensorflow/stream_executor/BUILD b/tensorflow/stream_executor/BUILD
index e742f8e..d4d9708 100644
--- a/tensorflow/stream_executor/BUILD
+++ b/tensorflow/stream_executor/BUILD
@@ -30,6 +30,7 @@
     hdrs = STREAM_EXECUTOR_HEADERS,
     linkopts = select({
         "//tensorflow:freebsd": [],
+        "//tensorflow:windows": [],
         "//conditions:default": ["-ldl"],
     }),
     visibility = ["//visibility:public"],
@@ -79,6 +80,7 @@
     }),
     linkopts = select({
         "//tensorflow:freebsd": [],
+        "//tensorflow:windows": [],
         "//conditions:default": ["-ldl"],
     }),
     visibility = ["//visibility:public"],
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index 725f6ae..55408ab 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -1986,15 +1986,14 @@
 
 port::StatusOr<DeviceMemory<uint8>> AllocateCudnnConvolutionForwardWorkspace(
     Stream* stream, const CudnnHandle& cudnn,
-    const dnn::AlgorithmDesc& algorithm_desc,
     const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter,
     const CudnnConvolutionDescriptor& conv,
-    const CudnnTensorDescriptor& output_nd,
+    const CudnnTensorDescriptor& output_nd, dnn::AlgorithmDesc* algorithm_desc,
     ScratchAllocator* scratch_allocator) {
   // TODO(csigg): This has side effects on the convolution descriptor. It is
   // functionally correct because the convolution is run with the algorithm of
   // the last call to this function, but should be fixed anyway.
-  conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled());
+  conv.set_use_tensor_op_math(algorithm_desc->tensor_ops_enabled());
 
   // Query the size of the workspace and allocate it.
   size_t size_in_bytes;
@@ -2002,8 +2001,14 @@
       cudnn.handle(),
       /*xDesc=*/input_nd.handle(),
       /*wDesc=*/filter.handle(), /*convDesc=*/conv.handle(),
-      /*yDesc=*/output_nd.handle(), /*algo=*/ToConvForwardAlgo(algorithm_desc),
+      /*yDesc=*/output_nd.handle(), /*algo=*/ToConvForwardAlgo(*algorithm_desc),
       /*sizeInBytes=*/&size_in_bytes));
+
+  if (TF_PREDICT_FALSE(!algorithm_desc)) {
+    return port::Status(port::error::INVALID_ARGUMENT,
+                        "No AlgorithmDesc provided");
+  }
+  algorithm_desc->set_scratch_size(size_in_bytes);
   int64 size_in_bytes_int64 = size_in_bytes;
 
   if (TF_PREDICT_FALSE(size_in_bytes_int64 < 0)) {
@@ -2028,15 +2033,14 @@
 port::StatusOr<DeviceMemory<uint8>>
 AllocateCudnnConvolutionBackwardDataWorkspace(
     Stream* stream, const CudnnHandle& cudnn,
-    const dnn::AlgorithmDesc& algorithm_desc,
     const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter,
     const CudnnConvolutionDescriptor& conv,
-    const CudnnTensorDescriptor& output_nd,
+    const CudnnTensorDescriptor& output_nd, dnn::AlgorithmDesc* algorithm_desc,
     ScratchAllocator* scratch_allocator) {
   // TODO(csigg): This has side effects on the convolution descriptor. It is
   // functionally correct because the convolution is run with the algorithm of
   // the last call to this function, but should be fixed anyway.
-  conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled());
+  conv.set_use_tensor_op_math(algorithm_desc->tensor_ops_enabled());
 
   // Query the size of the workspace and allocate it.
   size_t size_in_bytes;
@@ -2046,8 +2050,14 @@
       /*dyDesc=*/output_nd.handle(),
       /*convDesc=*/conv.handle(),
       /*dxDesc=*/input_nd.handle(),
-      /*algo=*/ToConvBackwardDataAlgo(algorithm_desc),
+      /*algo=*/ToConvBackwardDataAlgo(*algorithm_desc),
       /*sizeInBytes=*/&size_in_bytes));
+
+  if (TF_PREDICT_FALSE(!algorithm_desc)) {
+    return port::Status(port::error::INVALID_ARGUMENT,
+                        "No AlgorithmDesc provided");
+  }
+  algorithm_desc->set_scratch_size(size_in_bytes);
   int64 size_in_bytes_int64 = size_in_bytes;
 
   if (TF_PREDICT_FALSE(size_in_bytes_int64 < 0)) {
@@ -2072,15 +2082,14 @@
 port::StatusOr<DeviceMemory<uint8>>
 AllocateCudnnConvolutionBackwardFilterWorkspace(
     Stream* stream, const CudnnHandle& cudnn,
-    const dnn::AlgorithmDesc& algorithm_desc,
     const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter,
     const CudnnConvolutionDescriptor& conv,
-    const CudnnTensorDescriptor& output_nd,
+    const CudnnTensorDescriptor& output_nd, dnn::AlgorithmDesc* algorithm_desc,
     ScratchAllocator* scratch_allocator) {
   // TODO(csigg): This has side effects on the convolution descriptor. It is
   // functionally correct because the convolution is run with the algorithm of
   // the last call to this function, but should be fixed anyway.
-  conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled());
+  conv.set_use_tensor_op_math(algorithm_desc->tensor_ops_enabled());
 
   // Query the size of the workspace and allocate it.
   size_t size_in_bytes;
@@ -2090,8 +2099,14 @@
       /*dyDesc=*/output_nd.handle(),
       /*convDesc=*/conv.handle(),
       /*gradDesc=*/filter.handle(),
-      /*algo=*/ToConvBackwardFilterAlgo(algorithm_desc),
+      /*algo=*/ToConvBackwardFilterAlgo(*algorithm_desc),
       /*sizeInBytes=*/&size_in_bytes));
+
+  if (TF_PREDICT_FALSE(!algorithm_desc)) {
+    return port::Status(port::error::INVALID_ARGUMENT,
+                        "No AlgorithmDesc provided");
+  }
+  algorithm_desc->set_scratch_size(size_in_bytes);
   int64 size_in_bytes_int64 = size_in_bytes;
 
   if (TF_PREDICT_FALSE(size_in_bytes_int64 < 0)) {
@@ -2138,7 +2153,7 @@
   }
 
   auto scratch_or = AllocateCudnnConvolutionForwardWorkspace(
-      stream, cudnn, algo_desc, input_nd, filter, conv, output_nd,
+      stream, cudnn, input_nd, filter, conv, output_nd, &algo_desc,
       scratch_allocator);
 
   if (scratch_or.ok()) {
@@ -2155,11 +2170,11 @@
         "while a secondary algorithm is not provided.");
   }
 
-  SE_ASSIGN_OR_RETURN(
-      *scratch, AllocateCudnnConvolutionForwardWorkspace(
-                    stream, cudnn, algorithm_config.algorithm_no_scratch(),
-                    input_nd, filter, conv, output_nd, scratch_allocator));
-  return algorithm_config.algorithm_no_scratch();
+  algo_desc = algorithm_config.algorithm_no_scratch();
+  SE_ASSIGN_OR_RETURN(*scratch, AllocateCudnnConvolutionForwardWorkspace(
+                                    stream, cudnn, input_nd, filter, conv,
+                                    output_nd, &algo_desc, scratch_allocator));
+  return algo_desc;
 }
 
 port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardDataAlgorithm(
@@ -2187,7 +2202,7 @@
   }
 
   auto scratch_or = AllocateCudnnConvolutionBackwardDataWorkspace(
-      stream, cudnn, algo_desc, input_nd, filter, conv, output_nd,
+      stream, cudnn, input_nd, filter, conv, output_nd, &algo_desc,
       scratch_allocator);
 
   if (scratch_or.ok()) {
@@ -2204,11 +2219,11 @@
         "while a secondary algorithm is not provided.");
   }
 
-  SE_ASSIGN_OR_RETURN(
-      *scratch, AllocateCudnnConvolutionBackwardDataWorkspace(
-                    stream, cudnn, algorithm_config.algorithm_no_scratch(),
-                    input_nd, filter, conv, output_nd, scratch_allocator));
-  return algorithm_config.algorithm_no_scratch();
+  algo_desc = algorithm_config.algorithm_no_scratch();
+  SE_ASSIGN_OR_RETURN(*scratch, AllocateCudnnConvolutionBackwardDataWorkspace(
+                                    stream, cudnn, input_nd, filter, conv,
+                                    output_nd, &algo_desc, scratch_allocator));
+  return algo_desc;
 }
 
 port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardFilterAlgorithm(
@@ -2236,7 +2251,7 @@
   }
 
   auto scratch_or = AllocateCudnnConvolutionBackwardFilterWorkspace(
-      stream, cudnn, algo_desc, input_nd, filter, conv, output_nd,
+      stream, cudnn, input_nd, filter, conv, output_nd, &algo_desc,
       scratch_allocator);
 
   if (scratch_or.ok()) {
@@ -2253,11 +2268,11 @@
         "while a secondary algorithm is not provided.");
   }
 
-  SE_ASSIGN_OR_RETURN(*scratch,
-                      AllocateCudnnConvolutionBackwardFilterWorkspace(
-                          stream, cudnn, algorithm_config.algorithm(), input_nd,
-                          filter, conv, output_nd, scratch_allocator));
-  return algorithm_config.algorithm_no_scratch();
+  algo_desc = algorithm_config.algorithm_no_scratch();
+  SE_ASSIGN_OR_RETURN(*scratch, AllocateCudnnConvolutionBackwardFilterWorkspace(
+                                    stream, cudnn, input_nd, filter, conv,
+                                    output_nd, &algo_desc, scratch_allocator));
+  return algo_desc;
 }
 
 // A helper class to set env-vars and choose options for cudnn-related
diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h
index a7449c2..9abfa1d 100644
--- a/tensorflow/stream_executor/dnn.h
+++ b/tensorflow/stream_executor/dnn.h
@@ -713,15 +713,23 @@
 class AlgorithmDesc {
  public:
   typedef int64 Index;
-  AlgorithmDesc() : algo_(kDefaultAlgorithm), tensor_ops_enabled_(true) {}
+  AlgorithmDesc()
+      : algo_(kDefaultAlgorithm), tensor_ops_enabled_(true), scratch_size_(0) {}
   AlgorithmDesc(Index a, bool use_tensor_ops)
-      : algo_(a), tensor_ops_enabled_(use_tensor_ops) {}
+      : algo_(a), tensor_ops_enabled_(use_tensor_ops), scratch_size_(0) {}
+  AlgorithmDesc(Index a, bool use_tensor_ops, size_t scratch_size)
+      : algo_(a),
+        tensor_ops_enabled_(use_tensor_ops),
+        scratch_size_(scratch_size) {}
   bool is_default() const { return algo_ == kDefaultAlgorithm; }
   bool tensor_ops_enabled() const { return tensor_ops_enabled_; }
   Index algo_id() const { return algo_; }
+  size_t scratch_size() const { return scratch_size_; }
+  void set_scratch_size(size_t val) { scratch_size_ = val; }
   bool operator==(const AlgorithmDesc& other) const {
     return this->algo_ == other.algo_ &&
-           this->tensor_ops_enabled_ == other.tensor_ops_enabled_;
+           this->tensor_ops_enabled_ == other.tensor_ops_enabled_ &&
+           this->scratch_size_ == other.scratch_size_;
   }
   uint64 hash() const;
 
@@ -729,6 +737,7 @@
   enum { kDefaultAlgorithm = -1 };
   Index algo_;
   bool tensor_ops_enabled_;
+  size_t scratch_size_;
 };
 
 // Describes the result from a perf experiment.
diff --git a/tensorflow/stream_executor/host/host_gpu_executor.h b/tensorflow/stream_executor/host/host_gpu_executor.h
index 858396e..7ba1f18 100644
--- a/tensorflow/stream_executor/host/host_gpu_executor.h
+++ b/tensorflow/stream_executor/host/host_gpu_executor.h
@@ -88,7 +88,7 @@
                 uint64 size) override;
 
   // No "synchronize all activity" implemented for this platform at the moment.
-  bool SynchronizeAllActivity() override { return false; }
+  bool SynchronizeAllActivity() override { return true; }
   bool SynchronousMemZero(DeviceMemoryBase *location, uint64 size) override;
 
   bool SynchronousMemSet(DeviceMemoryBase *location, int value,
diff --git a/tensorflow/stream_executor/host/host_stream.cc b/tensorflow/stream_executor/host/host_stream.cc
index 5a7d3b3..bfbfb56 100644
--- a/tensorflow/stream_executor/host/host_stream.cc
+++ b/tensorflow/stream_executor/host/host_stream.cc
@@ -28,18 +28,28 @@
 HostStream::~HostStream() {}
 
 bool HostStream::EnqueueTask(std::function<void()> task) {
+  struct NotifiedTask {
+    HostStream* stream;
+    std::function<void()> task;
+
+    void operator()() {
+      task();
+      // Destroy the task before unblocking its waiters, as BlockHostUntilDone()
+      // should guarantee that all tasks are destroyed.
+      task = std::function<void()>();
+      {
+        mutex_lock lock(stream->mu_);
+        --stream->pending_tasks_;
+      }
+      stream->completion_condition_.notify_all();
+    }
+  };
+
   {
     mutex_lock lock(mu_);
     ++pending_tasks_;
   }
-  host_executor_->Schedule([this, task]() {
-    task();
-    {
-      mutex_lock lock(mu_);
-      --pending_tasks_;
-    }
-    completion_condition_.notify_all();
-  });
+  host_executor_->Schedule(NotifiedTask{this, std::move(task)});
   return true;
 }
 
diff --git a/tensorflow/stream_executor/stream_executor_internal.h b/tensorflow/stream_executor/stream_executor_internal.h
index 92e5376..59a477b 100644
--- a/tensorflow/stream_executor/stream_executor_internal.h
+++ b/tensorflow/stream_executor/stream_executor_internal.h
@@ -236,7 +236,7 @@
   virtual bool Memcpy(Stream *stream, DeviceMemoryBase *gpu_dst,
                       const void *host_src, uint64 size) = 0;
   virtual bool MemcpyDeviceToDevice(Stream *stream, DeviceMemoryBase *gpu_dst,
-                                    const DeviceMemoryBase &host_src,
+                                    const DeviceMemoryBase &gpu_src,
                                     uint64 size) = 0;
   virtual bool HostCallback(Stream *stream, std::function<void()> callback) = 0;
   virtual bool HostCallback(Stream *stream,
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index 39db840..c7766f3 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -4,12 +4,12 @@
 # Uses the ":optmode" config_setting to pick the options.
 load(
     "//tensorflow/core:platform/default/build_config_root.bzl",
-    "tf_cuda_tests_tags",
-    "tf_sycl_tests_tags",
+    "if_dynamic_kernels",
+    "if_static",
     "tf_additional_grpc_deps_py",
     "tf_additional_xla_deps_py",
-    "if_static",
-    "if_dynamic_kernels",
+    "tf_cuda_tests_tags",
+    "tf_sycl_tests_tags",
 )
 load(
     "@local_config_tensorrt//:build_defs.bzl",
@@ -17,13 +17,15 @@
 )
 load(
     "@local_config_cuda//cuda:build_defs.bzl",
-    "if_cuda",
     "cuda_default_copts",
+    "if_cuda",
 )
 load(
     "//third_party/mkl:build_defs.bzl",
     "if_mkl",
-    "if_mkl_lnx_x64"
+    "if_mkl_lnx_x64",
+    "if_mkl_ml",
+    "mkl_deps",
 )
 load(
     "//third_party/mkl_dnn:build_defs.bzl",
@@ -36,155 +38,154 @@
 # i.e. "common_runtime/direct_session_test.cc" becomes
 #      "common_runtime_direct_session_test"
 def src_to_test_name(src):
-  return src.replace("/", "_").split(".")[0]
+    return src.replace("/", "_").split(".")[0]
 
 def full_path(relative_paths):
-  return [native.package_name() + "/" + relative for relative in relative_paths]
+    return [native.package_name() + "/" + relative for relative in relative_paths]
 
 def _add_tfcore_prefix(src):
-  if src.startswith("//"):
-    return src
-  return "//tensorflow/core:" + src
+    if src.startswith("//"):
+        return src
+    return "//tensorflow/core:" + src
 
 # List of proto files for android builds
 def tf_android_core_proto_sources(core_proto_sources_relative):
-  return [
-      _add_tfcore_prefix(p) for p in core_proto_sources_relative
-  ]
+    return [
+        _add_tfcore_prefix(p)
+        for p in core_proto_sources_relative
+    ]
 
 # Returns the list of pb.h and proto.h headers that are generated for
 # tf_android_core_proto_sources().
 def tf_android_core_proto_headers(core_proto_sources_relative):
-  return ([
-      _add_tfcore_prefix(p).replace(":", "/").replace(".proto", ".pb.h")
-      for p in core_proto_sources_relative
-  ] + [
-      _add_tfcore_prefix(p).replace(":", "/").replace(".proto", ".proto.h")
-      for p in core_proto_sources_relative
-  ])
+    return ([
+        _add_tfcore_prefix(p).replace(":", "/").replace(".proto", ".pb.h")
+        for p in core_proto_sources_relative
+    ] + [
+        _add_tfcore_prefix(p).replace(":", "/").replace(".proto", ".proto.h")
+        for p in core_proto_sources_relative
+    ])
 
 # Sanitize a dependency so that it works correctly from code that includes
 # TensorFlow as a submodule.
 def clean_dep(dep):
-  return str(Label(dep))
+    return str(Label(dep))
 
 def if_android_x86(a):
-  return select({
-      clean_dep("//tensorflow:android_x86"): a,
-      clean_dep("//tensorflow:android_x86_64"): a,
-      "//conditions:default": [],
-  })
+    return select({
+        clean_dep("//tensorflow:android_x86"): a,
+        clean_dep("//tensorflow:android_x86_64"): a,
+        "//conditions:default": [],
+    })
 
 def if_android_arm(a):
-  return select({
-      clean_dep("//tensorflow:android_arm"): a,
-      "//conditions:default": [],
-  })
+    return select({
+        clean_dep("//tensorflow:android_arm"): a,
+        "//conditions:default": [],
+    })
 
 def if_android_arm64(a):
-  return select({
-      clean_dep("//tensorflow:android_arm64"): a,
-      "//conditions:default": [],
-  })
+    return select({
+        clean_dep("//tensorflow:android_arm64"): a,
+        "//conditions:default": [],
+    })
 
 def if_android_mips(a):
-  return select({
-      clean_dep("//tensorflow:android_mips"): a,
-      "//conditions:default": [],
-  })
+    return select({
+        clean_dep("//tensorflow:android_mips"): a,
+        "//conditions:default": [],
+    })
 
 def if_not_android(a):
-  return select({
-      clean_dep("//tensorflow:android"): [],
-      "//conditions:default": a,
-  })
+    return select({
+        clean_dep("//tensorflow:android"): [],
+        "//conditions:default": a,
+    })
 
 def if_not_android_mips_and_mips64(a):
-  return select({
-      clean_dep("//tensorflow:android_mips"): [],
-      clean_dep("//tensorflow:android_mips64"): [],
-      "//conditions:default": a,
-  })
+    return select({
+        clean_dep("//tensorflow:android_mips"): [],
+        clean_dep("//tensorflow:android_mips64"): [],
+        "//conditions:default": a,
+    })
 
 def if_android(a):
-  return select({
-      clean_dep("//tensorflow:android"): a,
-      "//conditions:default": [],
-  })
+    return select({
+        clean_dep("//tensorflow:android"): a,
+        "//conditions:default": [],
+    })
 
 def if_ios(a):
-  return select({
-      clean_dep("//tensorflow:ios"): a,
-      "//conditions:default": [],
-  })
+    return select({
+        clean_dep("//tensorflow:ios"): a,
+        "//conditions:default": [],
+    })
 
 def if_ios_x86_64(a):
-  return select({
-      clean_dep("//tensorflow:ios_x86_64"): a,
-      "//conditions:default": [],
-  })
+    return select({
+        clean_dep("//tensorflow:ios_x86_64"): a,
+        "//conditions:default": [],
+    })
 
 def if_mobile(a):
-  return select({
-      clean_dep("//tensorflow:android"): a,
-      clean_dep("//tensorflow:ios"): a,
-      "//conditions:default": [],
-  })
+    return select({
+        clean_dep("//tensorflow:android"): a,
+        clean_dep("//tensorflow:ios"): a,
+        "//conditions:default": [],
+    })
 
 def if_not_mobile(a):
-  return select({
-      clean_dep("//tensorflow:android"): [],
-      clean_dep("//tensorflow:ios"): [],
-      "//conditions:default": a,
-  })
+    return select({
+        clean_dep("//tensorflow:android"): [],
+        clean_dep("//tensorflow:ios"): [],
+        "//conditions:default": a,
+    })
 
 # Config setting selector used when building for products
 # which requires restricted licenses to be avoided.
 def if_not_lgpl_restricted(a):
-  _ = (a,)
-  return select({
-      "//conditions:default": [],
-  })
+    _ = (a,)
+    return select({
+        "//conditions:default": [],
+    })
 
 def if_not_windows(a):
-  return select({
-      clean_dep("//tensorflow:windows"): [],
-      clean_dep("//tensorflow:windows_msvc"): [],
-      "//conditions:default": a,
-  })
+    return select({
+        clean_dep("//tensorflow:windows"): [],
+        "//conditions:default": a,
+    })
 
 def if_windows(a):
-  return select({
-      clean_dep("//tensorflow:windows"): a,
-      clean_dep("//tensorflow:windows_msvc"): a,
-      "//conditions:default": [],
-  })
+    return select({
+        clean_dep("//tensorflow:windows"): a,
+        "//conditions:default": [],
+    })
 
 def if_not_windows_cuda(a):
-  return select({
-      clean_dep("//tensorflow:with_cuda_support_windows_override"): [],
-      "//conditions:default": a,
-  })
+    return select({
+        clean_dep("//tensorflow:with_cuda_support_windows_override"): [],
+        "//conditions:default": a,
+    })
 
 def if_linux_x86_64(a):
-  return select({
-      clean_dep("//tensorflow:linux_x86_64"): a,
-      "//conditions:default": [],
-  })
+    return select({
+        clean_dep("//tensorflow:linux_x86_64"): a,
+        "//conditions:default": [],
+    })
 
 def if_darwin(a):
-  return select({
-      clean_dep("//tensorflow:darwin"): a,
-      "//conditions:default": [],
-  })
+    return select({
+        clean_dep("//tensorflow:darwin"): a,
+        "//conditions:default": [],
+    })
 
 def if_override_eigen_strong_inline(a):
-  return select({
-      clean_dep("//tensorflow:override_eigen_strong_inline"): a,
-      "//conditions:default": [],
-  })
+    return select({
+        clean_dep("//tensorflow:override_eigen_strong_inline"): a,
+        "//conditions:default": [],
+    })
 
-def get_win_copts(is_external=False):
+def get_win_copts(is_external = False):
     WINDOWS_COPTS = [
         "/DPLATFORM_WINDOWS",
         "/DEIGEN_HAS_C99_MATH",
@@ -202,163 +203,169 @@
         "/DNOGDI",
     ]
     if is_external:
-      return WINDOWS_COPTS + ["/UTF_COMPILE_LIBRARY"]
+        return WINDOWS_COPTS + ["/UTF_COMPILE_LIBRARY"]
     else:
-      return WINDOWS_COPTS + ["/DTF_COMPILE_LIBRARY"]
+        return WINDOWS_COPTS + ["/DTF_COMPILE_LIBRARY"]
 
 # LINT.IfChange
-def tf_copts(android_optimization_level_override="-O2", is_external=False):
-  # For compatibility reasons, android_optimization_level_override
-  # is currently only being set for Android.
-  # To clear this value, and allow the CROSSTOOL default
-  # to be used, pass android_optimization_level_override=None
-  android_copts = [
-      "-std=c++11",
-      "-DTF_LEAN_BINARY",
-      "-Wno-narrowing",
-      "-fomit-frame-pointer",
-  ]
-  if android_optimization_level_override:
-    android_copts.append(android_optimization_level_override)
-  return (
-      if_not_windows([
-          "-DEIGEN_AVOID_STL_ARRAY",
-          "-Iexternal/gemmlowp",
-          "-Wno-sign-compare",
-          "-fno-exceptions",
-          "-ftemplate-depth=900"])
-      + if_cuda(["-DGOOGLE_CUDA=1"])
-      + if_tensorrt(["-DGOOGLE_TENSORRT=1"])
-      + if_mkl(["-DINTEL_MKL=1", "-DEIGEN_USE_VML"])
-      + if_mkl_open_source_only(["-DDO_NOT_USE_ML"])
-      + if_mkl_lnx_x64(["-fopenmp"])
-      + if_android_arm(["-mfpu=neon"])
-      + if_linux_x86_64(["-msse3"])
-      + if_ios_x86_64(["-msse4.1"])
-      + select({
+def tf_copts(android_optimization_level_override = "-O2", is_external = False):
+    # For compatibility reasons, android_optimization_level_override
+    # is currently only being set for Android.
+    # To clear this value, and allow the CROSSTOOL default
+    # to be used, pass android_optimization_level_override=None
+    android_copts = [
+        "-std=c++11",
+        "-DTF_LEAN_BINARY",
+        "-Wno-narrowing",
+        "-fomit-frame-pointer",
+    ]
+    if android_optimization_level_override:
+        android_copts.append(android_optimization_level_override)
+    return (
+        if_not_windows([
+            "-DEIGEN_AVOID_STL_ARRAY",
+            "-Iexternal/gemmlowp",
+            "-Wno-sign-compare",
+            "-fno-exceptions",
+            "-ftemplate-depth=900",
+        ]) +
+        if_cuda(["-DGOOGLE_CUDA=1"]) +
+        if_tensorrt(["-DGOOGLE_TENSORRT=1"]) +
+        if_mkl(["-DINTEL_MKL=1", "-DEIGEN_USE_VML"]) +
+        if_mkl_open_source_only(["-DINTEL_MKL_DNN_ONLY"]) +
+        if_mkl_lnx_x64(["-fopenmp"]) +
+        if_android_arm(["-mfpu=neon"]) +
+        if_linux_x86_64(["-msse3"]) +
+        if_ios_x86_64(["-msse4.1"]) +
+        select({
             clean_dep("//tensorflow:framework_shared_object"): [],
             "//conditions:default": ["-DTENSORFLOW_MONOLITHIC_BUILD"],
-      })
-      + select({
+        }) +
+        select({
             clean_dep("//tensorflow:android"): android_copts,
             clean_dep("//tensorflow:darwin"): [],
             clean_dep("//tensorflow:windows"): get_win_copts(is_external),
-            clean_dep("//tensorflow:windows_msvc"): get_win_copts(is_external),
             clean_dep("//tensorflow:ios"): ["-std=c++11"],
             clean_dep("//tensorflow:no_lgpl_deps"): ["-D__TENSORFLOW_NO_LGPL_DEPS__", "-pthread"],
-            "//conditions:default": ["-pthread"]
-      }))
-
+            "//conditions:default": ["-pthread"],
+        })
+    )
 
 def tfe_xla_copts():
-  return select({
-      "//tensorflow:with_xla_support": ["-DTENSORFLOW_EAGER_USE_XLA"],
-      "//conditions:default": [],
-  })
+    return select({
+        "//tensorflow:with_xla_support": ["-DTENSORFLOW_EAGER_USE_XLA"],
+        "//conditions:default": [],
+    })
 
 def tf_opts_nortti_if_android():
-  return if_android([
-      "-fno-rtti",
-      "-DGOOGLE_PROTOBUF_NO_RTTI",
-      "-DGOOGLE_PROTOBUF_NO_STATIC_INITIALIZER",
-  ])
+    return if_android([
+        "-fno-rtti",
+        "-DGOOGLE_PROTOBUF_NO_RTTI",
+        "-DGOOGLE_PROTOBUF_NO_STATIC_INITIALIZER",
+    ])
 
 # LINT.ThenChange(//tensorflow/contrib/android/cmake/CMakeLists.txt)
 
 def tf_features_nomodules_if_android():
-  return if_android(["-use_header_modules"])
+    return if_android(["-use_header_modules"])
 
 # Given a list of "op_lib_names" (a list of files in the ops directory
 # without their .cc extensions), generate a library for that file.
-def tf_gen_op_libs(op_lib_names, deps=None, is_external=True):
-  # Make library out of each op so it can also be used to generate wrappers
-  # for various languages.
-  if not deps:
-    deps = []
-  for n in op_lib_names:
-    native.cc_library(
-        name=n + "_op_lib",
-        copts=tf_copts(is_external=is_external),
-        srcs=["ops/" + n + ".cc"],
-        deps=deps + [clean_dep("//tensorflow/core:framework")],
-        visibility=["//visibility:public"],
-        alwayslink=1,
-        linkstatic=1,)
+def tf_gen_op_libs(op_lib_names, deps = None, is_external = True):
+    # Make library out of each op so it can also be used to generate wrappers
+    # for various languages.
+    if not deps:
+        deps = []
+    for n in op_lib_names:
+        native.cc_library(
+            name = n + "_op_lib",
+            copts = tf_copts(is_external = is_external),
+            srcs = ["ops/" + n + ".cc"],
+            deps = deps + [clean_dep("//tensorflow/core:framework")],
+            visibility = ["//visibility:public"],
+            alwayslink = 1,
+            linkstatic = 1,
+        )
 
 def _make_search_paths(prefix, levels_to_root):
-  return ",".join(
-      ["-rpath,%s/%s" % (prefix, "/".join([".."] * search_level))
-       for search_level in range(levels_to_root + 1)])
+    return ",".join(
+        [
+            "-rpath,%s/%s" % (prefix, "/".join([".."] * search_level))
+            for search_level in range(levels_to_root + 1)
+        ],
+    )
 
 def _rpath_linkopts(name):
-  # Search parent directories up to the TensorFlow root directory for shared
-  # object dependencies, even if this op shared object is deeply nested
-  # (e.g. tensorflow/contrib/package:python/ops/_op_lib.so). tensorflow/ is then
-  # the root and tensorflow/libtensorflow_framework.so should exist when
-  # deployed. Other shared object dependencies (e.g. shared between contrib/
-  # ops) are picked up as long as they are in either the same or a parent
-  # directory in the tensorflow/ tree.
-  levels_to_root = native.package_name().count("/") + name.count("/")
-  return select({
-      clean_dep("//tensorflow:darwin"): [
-          "-Wl,%s" % (_make_search_paths("@loader_path", levels_to_root),),
-      ],
-      clean_dep("//tensorflow:windows"): [],
-      clean_dep("//tensorflow:windows_msvc"): [],
-      "//conditions:default": [
-          "-Wl,%s" % (_make_search_paths("$$ORIGIN", levels_to_root),),
-      ],
-  })
+    # Search parent directories up to the TensorFlow root directory for shared
+    # object dependencies, even if this op shared object is deeply nested
+    # (e.g. tensorflow/contrib/package:python/ops/_op_lib.so). tensorflow/ is then
+    # the root and tensorflow/libtensorflow_framework.so should exist when
+    # deployed. Other shared object dependencies (e.g. shared between contrib/
+    # ops) are picked up as long as they are in either the same or a parent
+    # directory in the tensorflow/ tree.
+    levels_to_root = native.package_name().count("/") + name.count("/")
+    return select({
+        clean_dep("//tensorflow:darwin"): [
+            "-Wl,%s" % (_make_search_paths("@loader_path", levels_to_root),),
+        ],
+        clean_dep("//tensorflow:windows"): [],
+        "//conditions:default": [
+            "-Wl,%s" % (_make_search_paths("$$ORIGIN", levels_to_root),),
+        ],
+    })
 
 # Bazel-generated shared objects which must be linked into TensorFlow binaries
 # to define symbols from //tensorflow/core:framework and //tensorflow/core:lib.
 def tf_binary_additional_srcs():
-  return if_static(
-      extra_deps=[],
-      otherwise=[
-          clean_dep("//tensorflow:libtensorflow_framework.so"),
-      ])
-
+    return if_static(
+        extra_deps = [],
+        otherwise = [
+            clean_dep("//tensorflow:libtensorflow_framework.so"),
+        ],
+    )
 
 # Helper functions to add kernel dependencies to tf binaries when using dynamic
 # kernel linking.
 def tf_binary_dynamic_kernel_dsos(kernels):
-  return if_dynamic_kernels(
-      extra_deps=["libtfkernel_%s.so" % clean_dep(k) for k in kernels],
-      otherwise=[])
+    return if_dynamic_kernels(
+        extra_deps = ["libtfkernel_%s.so" % clean_dep(k) for k in kernels],
+        otherwise = [],
+    )
 
 # Helper functions to add kernel dependencies to tf binaries when using static
 # kernel linking.
 def tf_binary_dynamic_kernel_deps(kernels):
-  return if_dynamic_kernels(
-      extra_deps=[],
-      otherwise=kernels)
+    return if_dynamic_kernels(
+        extra_deps = [],
+        otherwise = kernels,
+    )
 
 def tf_cc_shared_object(
-    name,
-    srcs=[],
-    deps=[],
-    data=[],
-    linkopts=[],
-    framework_so=tf_binary_additional_srcs(),
-    kernels=[],
-    **kwargs):
-  native.cc_binary(
-      name=name,
-      srcs=srcs + framework_so,
-      deps=deps + tf_binary_dynamic_kernel_deps(kernels),
-      linkshared = 1,
-      data = data + tf_binary_dynamic_kernel_dsos(kernels),
-      linkopts=linkopts + _rpath_linkopts(name) + select({
-          clean_dep("//tensorflow:darwin"): [
-              "-Wl,-install_name,@rpath/" + name.split("/")[-1],
-          ],
-          clean_dep("//tensorflow:windows"): [],
-          "//conditions:default": [
-              "-Wl,-soname," + name.split("/")[-1],
-          ],
-      }),
-      **kwargs)
+        name,
+        srcs = [],
+        deps = [],
+        data = [],
+        linkopts = [],
+        framework_so = tf_binary_additional_srcs(),
+        kernels = [],
+        **kwargs):
+    native.cc_binary(
+        name = name,
+        srcs = srcs + framework_so,
+        deps = deps + tf_binary_dynamic_kernel_deps(kernels),
+        linkshared = 1,
+        data = data + tf_binary_dynamic_kernel_dsos(kernels),
+        linkopts = linkopts + _rpath_linkopts(name) + select({
+            clean_dep("//tensorflow:darwin"): [
+                "-Wl,-install_name,@rpath/" + name.split("/")[-1],
+            ],
+            clean_dep("//tensorflow:windows"): [],
+            "//conditions:default": [
+                "-Wl,-soname," + name.split("/")[-1],
+            ],
+        }),
+        **kwargs
+    )
 
 register_extension_info(
     extension_name = "tf_cc_shared_object",
@@ -369,26 +376,28 @@
 # (//third_party/tensorflow:libtensorflow_framework.so) when not building
 # statically. Also adds linker options (rpaths) so that the framework shared
 # object can be found.
-def tf_cc_binary(name,
-                 srcs=[],
-                 deps=[],
-                 data=[],
-                 linkopts=[],
-                 copts=tf_copts(),
-                 kernels=[],
-                 **kwargs):
-  native.cc_binary(
-      name=name,
-      copts=copts,
-      srcs=srcs + tf_binary_additional_srcs(),
-      deps=deps + tf_binary_dynamic_kernel_deps(kernels) + if_mkl(
-          [
-              "//third_party/mkl:intel_binary_blob",
-          ],
-      ),
-      data=data +  tf_binary_dynamic_kernel_dsos(kernels),
-      linkopts=linkopts + _rpath_linkopts(name),
-      **kwargs)
+def tf_cc_binary(
+        name,
+        srcs = [],
+        deps = [],
+        data = [],
+        linkopts = [],
+        copts = tf_copts(),
+        kernels = [],
+        **kwargs):
+    native.cc_binary(
+        name = name,
+        copts = copts,
+        srcs = srcs + tf_binary_additional_srcs(),
+        deps = deps + tf_binary_dynamic_kernel_deps(kernels) + if_mkl_ml(
+            [
+                "//third_party/mkl:intel_binary_blob",
+            ],
+        ),
+        data = data + tf_binary_dynamic_kernel_dsos(kernels),
+        linkopts = linkopts + _rpath_linkopts(name),
+        **kwargs
+    )
 
 register_extension_info(
     extension_name = "tf_cc_binary",
@@ -398,64 +407,72 @@
 # A simple wrap around native.cc_binary rule.
 # When using this rule, you should realize it doesn't link to any tensorflow
 # dependencies by default.
-def tf_native_cc_binary(name,
-                        copts=tf_copts(),
-                        **kwargs):
-  native.cc_binary(
-      name=name,
-      copts=copts,
-      **kwargs)
+def tf_native_cc_binary(
+        name,
+        copts = tf_copts(),
+        **kwargs):
+    native.cc_binary(
+        name = name,
+        copts = copts,
+        **kwargs
+    )
 
 register_extension_info(
     extension_name = "tf_native_cc_binary",
     label_regex_for_dep = "{extension_name}.*",
 )
 
-def tf_gen_op_wrapper_cc(name,
-                         out_ops_file,
-                         pkg="",
-                         op_gen=clean_dep("//tensorflow/cc:cc_op_gen_main"),
-                         deps=None,
-                         include_internal_ops=0,
-                         # ApiDefs will be loaded in the order specified in this list.
-                         api_def_srcs=[]):
-  # Construct an op generator binary for these ops.
-  tool = out_ops_file + "_gen_cc"
-  if deps == None:
-    deps = [pkg + ":" + name + "_op_lib"]
-  tf_cc_binary(
-      name=tool,
-      copts=tf_copts(),
-      linkopts=if_not_windows(["-lm"]),
-      linkstatic=1,  # Faster to link this one-time-use binary dynamically
-      deps=[op_gen] + deps)
+def tf_gen_op_wrapper_cc(
+        name,
+        out_ops_file,
+        pkg = "",
+        op_gen = clean_dep("//tensorflow/cc:cc_op_gen_main"),
+        deps = None,
+        include_internal_ops = 0,
+        # ApiDefs will be loaded in the order specified in this list.
+        api_def_srcs = []):
+    # Construct an op generator binary for these ops.
+    tool = out_ops_file + "_gen_cc"
+    if deps == None:
+        deps = [pkg + ":" + name + "_op_lib"]
+    tf_cc_binary(
+        name = tool,
+        copts = tf_copts(),
+        linkopts = if_not_windows(["-lm"]),
+        linkstatic = 1,  # Faster to link this one-time-use binary dynamically
+        deps = [op_gen] + deps,
+    )
 
-  srcs = api_def_srcs[:]
+    srcs = api_def_srcs[:]
 
-  if not api_def_srcs:
-    api_def_args_str = ","
-  else:
-    api_def_args = []
-    for api_def_src in api_def_srcs:
-      # Add directory of the first ApiDef source to args.
-      # We are assuming all ApiDefs in a single api_def_src are in the
-      # same directory.
-      api_def_args.append(
-          " $$(dirname $$(echo $(locations " + api_def_src +
-          ") | cut -d\" \" -f1))")
-    api_def_args_str = ",".join(api_def_args)
+    if not api_def_srcs:
+        api_def_args_str = ","
+    else:
+        api_def_args = []
+        for api_def_src in api_def_srcs:
+            # Add directory of the first ApiDef source to args.
+            # We are assuming all ApiDefs in a single api_def_src are in the
+            # same directory.
+            api_def_args.append(
+                " $$(dirname $$(echo $(locations " + api_def_src +
+                ") | cut -d\" \" -f1))",
+            )
+        api_def_args_str = ",".join(api_def_args)
 
-  native.genrule(
-      name=name + "_genrule",
-      outs=[
-          out_ops_file + ".h", out_ops_file + ".cc",
-          out_ops_file + "_internal.h", out_ops_file + "_internal.cc"
-      ],
-      srcs=srcs,
-      tools=[":" + tool] + tf_binary_additional_srcs(),
-      cmd=("$(location :" + tool + ") $(location :" + out_ops_file + ".h) " +
-           "$(location :" + out_ops_file + ".cc) " +
-           str(include_internal_ops) + " " + api_def_args_str))
+    native.genrule(
+        name = name + "_genrule",
+        outs = [
+            out_ops_file + ".h",
+            out_ops_file + ".cc",
+            out_ops_file + "_internal.h",
+            out_ops_file + "_internal.cc",
+        ],
+        srcs = srcs,
+        tools = [":" + tool] + tf_binary_additional_srcs(),
+        cmd = ("$(location :" + tool + ") $(location :" + out_ops_file + ".h) " +
+               "$(location :" + out_ops_file + ".cc) " +
+               str(include_internal_ops) + " " + api_def_args_str),
+    )
 
 # Given a list of "op_lib_names" (a list of files in the ops directory
 # without their .cc extensions), generate individual C++ .cc and .h
@@ -484,68 +501,72 @@
 #                     "ops/math_ops_internal.h" ],
 #            deps = [ ... ])
 # TODO(joshl): Cleaner approach for hidden ops.
-def tf_gen_op_wrappers_cc(name,
-                          op_lib_names=[],
-                          other_srcs=[],
-                          other_hdrs=[],
-                          pkg="",
-                          deps=[
-                              clean_dep("//tensorflow/cc:ops"),
-                              clean_dep("//tensorflow/cc:scope"),
-                              clean_dep("//tensorflow/cc:const_op"),
-                          ],
-                          op_gen=clean_dep("//tensorflow/cc:cc_op_gen_main"),
-                          include_internal_ops=0,
-                          visibility=None,
-                          # ApiDefs will be loaded in the order apecified in this list.
-                          api_def_srcs=[]):
-  subsrcs = other_srcs[:]
-  subhdrs = other_hdrs[:]
-  internalsrcs = []
-  internalhdrs = []
-  for n in op_lib_names:
-    tf_gen_op_wrapper_cc(
-        n,
-        "ops/" + n,
-        pkg=pkg,
-        op_gen=op_gen,
-        include_internal_ops=include_internal_ops,
-        api_def_srcs=api_def_srcs)
-    subsrcs += ["ops/" + n + ".cc"]
-    subhdrs += ["ops/" + n + ".h"]
-    internalsrcs += ["ops/" + n + "_internal.cc"]
-    internalhdrs += ["ops/" + n + "_internal.h"]
+def tf_gen_op_wrappers_cc(
+        name,
+        op_lib_names = [],
+        other_srcs = [],
+        other_hdrs = [],
+        pkg = "",
+        deps = [
+            clean_dep("//tensorflow/cc:ops"),
+            clean_dep("//tensorflow/cc:scope"),
+            clean_dep("//tensorflow/cc:const_op"),
+        ],
+        op_gen = clean_dep("//tensorflow/cc:cc_op_gen_main"),
+        include_internal_ops = 0,
+        visibility = None,
+        # ApiDefs will be loaded in the order apecified in this list.
+        api_def_srcs = []):
+    subsrcs = other_srcs[:]
+    subhdrs = other_hdrs[:]
+    internalsrcs = []
+    internalhdrs = []
+    for n in op_lib_names:
+        tf_gen_op_wrapper_cc(
+            n,
+            "ops/" + n,
+            pkg = pkg,
+            op_gen = op_gen,
+            include_internal_ops = include_internal_ops,
+            api_def_srcs = api_def_srcs,
+        )
+        subsrcs += ["ops/" + n + ".cc"]
+        subhdrs += ["ops/" + n + ".h"]
+        internalsrcs += ["ops/" + n + "_internal.cc"]
+        internalhdrs += ["ops/" + n + "_internal.h"]
 
-  native.cc_library(
-      name=name,
-      srcs=subsrcs,
-      hdrs=subhdrs,
-      deps=deps + if_not_android([
-          clean_dep("//tensorflow/core:core_cpu"),
-          clean_dep("//tensorflow/core:framework"),
-          clean_dep("//tensorflow/core:lib"),
-          clean_dep("//tensorflow/core:protos_all_cc"),
-      ]) + if_android([
-          clean_dep("//tensorflow/core:android_tensorflow_lib"),
-      ]),
-      copts=tf_copts(),
-      alwayslink=1,
-      visibility=visibility)
-  native.cc_library(
-      name=name + "_internal",
-      srcs=internalsrcs,
-      hdrs=internalhdrs,
-      deps=deps + if_not_android([
-          clean_dep("//tensorflow/core:core_cpu"),
-          clean_dep("//tensorflow/core:framework"),
-          clean_dep("//tensorflow/core:lib"),
-          clean_dep("//tensorflow/core:protos_all_cc"),
-      ]) + if_android([
-          clean_dep("//tensorflow/core:android_tensorflow_lib"),
-      ]),
-      copts=tf_copts(),
-      alwayslink=1,
-      visibility=[clean_dep("//tensorflow:internal")])
+    native.cc_library(
+        name = name,
+        srcs = subsrcs,
+        hdrs = subhdrs,
+        deps = deps + if_not_android([
+            clean_dep("//tensorflow/core:core_cpu"),
+            clean_dep("//tensorflow/core:framework"),
+            clean_dep("//tensorflow/core:lib"),
+            clean_dep("//tensorflow/core:protos_all_cc"),
+        ]) + if_android([
+            clean_dep("//tensorflow/core:android_tensorflow_lib"),
+        ]),
+        copts = tf_copts(),
+        alwayslink = 1,
+        visibility = visibility,
+    )
+    native.cc_library(
+        name = name + "_internal",
+        srcs = internalsrcs,
+        hdrs = internalhdrs,
+        deps = deps + if_not_android([
+            clean_dep("//tensorflow/core:core_cpu"),
+            clean_dep("//tensorflow/core:framework"),
+            clean_dep("//tensorflow/core:lib"),
+            clean_dep("//tensorflow/core:protos_all_cc"),
+        ]) + if_android([
+            clean_dep("//tensorflow/core:android_tensorflow_lib"),
+        ]),
+        copts = tf_copts(),
+        alwayslink = 1,
+        visibility = [clean_dep("//tensorflow:internal")],
+    )
 
 # Generates a Python library target wrapping the ops registered in "deps".
 #
@@ -571,96 +592,102 @@
 #     is invalid to specify both "hidden" and "op_whitelist".
 #   cc_linkopts: Optional linkopts to be added to tf_cc_binary that contains the
 #     specified ops.
-def tf_gen_op_wrapper_py(name,
-                         out=None,
-                         hidden=None,
-                         visibility=None,
-                         deps=[],
-                         require_shape_functions=False,
-                         hidden_file=None,
-                         generated_target_name=None,
-                         op_whitelist=[],
-                         cc_linkopts=[],
-                         api_def_srcs=[]):
-  if (hidden or hidden_file) and op_whitelist:
-    fail('Cannot pass specify both hidden and op_whitelist.')
+def tf_gen_op_wrapper_py(
+        name,
+        out = None,
+        hidden = None,
+        visibility = None,
+        deps = [],
+        require_shape_functions = False,
+        hidden_file = None,
+        generated_target_name = None,
+        op_whitelist = [],
+        cc_linkopts = [],
+        api_def_srcs = []):
+    if (hidden or hidden_file) and op_whitelist:
+        fail("Cannot pass specify both hidden and op_whitelist.")
 
-  # Construct a cc_binary containing the specified ops.
-  tool_name = "gen_" + name + "_py_wrappers_cc"
-  if not deps:
-    deps = [str(Label("//tensorflow/core:" + name + "_op_lib"))]
-  tf_cc_binary(
-      name=tool_name,
-      linkopts=if_not_windows(["-lm"]) + cc_linkopts,
-      copts=tf_copts(),
-      linkstatic=1,  # Faster to link this one-time-use binary dynamically
-      deps=([
-          clean_dep("//tensorflow/core:framework"),
-          clean_dep("//tensorflow/python:python_op_gen_main")
-      ] + deps),
-      visibility=[clean_dep("//tensorflow:internal")],)
+    # Construct a cc_binary containing the specified ops.
+    tool_name = "gen_" + name + "_py_wrappers_cc"
+    if not deps:
+        deps = [str(Label("//tensorflow/core:" + name + "_op_lib"))]
+    tf_cc_binary(
+        name = tool_name,
+        linkopts = if_not_windows(["-lm"]) + cc_linkopts,
+        copts = tf_copts(),
+        linkstatic = 1,  # Faster to link this one-time-use binary dynamically
+        deps = ([
+            clean_dep("//tensorflow/core:framework"),
+            clean_dep("//tensorflow/python:python_op_gen_main"),
+        ] + deps),
+        visibility = [clean_dep("//tensorflow:internal")],
+    )
 
-  # Invoke the previous cc_binary to generate a python file.
-  if not out:
-    out = "ops/gen_" + name + ".py"
+    # Invoke the previous cc_binary to generate a python file.
+    if not out:
+        out = "ops/gen_" + name + ".py"
 
-  if hidden:
-    op_list_arg = ",".join(hidden)
-    op_list_is_whitelist = False
-  elif op_whitelist:
-    op_list_arg = ",".join(op_whitelist)
-    op_list_is_whitelist = True
-  else:
-    op_list_arg = "''"
-    op_list_is_whitelist = False
+    if hidden:
+        op_list_arg = ",".join(hidden)
+        op_list_is_whitelist = False
+    elif op_whitelist:
+        op_list_arg = ",".join(op_whitelist)
+        op_list_is_whitelist = True
+    else:
+        op_list_arg = "''"
+        op_list_is_whitelist = False
 
-  # Prepare ApiDef directories to pass to the genrule.
-  if not api_def_srcs:
-    api_def_args_str = ","
-  else:
-    api_def_args = []
-    for api_def_src in api_def_srcs:
-      # Add directory of the first ApiDef source to args.
-      # We are assuming all ApiDefs in a single api_def_src are in the
-      # same directory.
-      api_def_args.append(
-          "$$(dirname $$(echo $(locations " + api_def_src +
-          ") | cut -d\" \" -f1))")
-    api_def_args_str = ",".join(api_def_args)
+    # Prepare ApiDef directories to pass to the genrule.
+    if not api_def_srcs:
+        api_def_args_str = ","
+    else:
+        api_def_args = []
+        for api_def_src in api_def_srcs:
+            # Add directory of the first ApiDef source to args.
+            # We are assuming all ApiDefs in a single api_def_src are in the
+            # same directory.
+            api_def_args.append(
+                "$$(dirname $$(echo $(locations " + api_def_src +
+                ") | cut -d\" \" -f1))",
+            )
+        api_def_args_str = ",".join(api_def_args)
 
-  if hidden_file:
-    # `hidden_file` is file containing a list of op names to be hidden in the
-    # generated module.
-    native.genrule(
-        name=name + "_pygenrule",
-        outs=[out],
-        srcs=api_def_srcs + [hidden_file],
-        tools=[tool_name] + tf_binary_additional_srcs(),
-        cmd=("$(location " + tool_name + ") " + api_def_args_str +
-             " @$(location " + hidden_file + ") " +
-             ("1" if require_shape_functions else "0") + " > $@"))
-  else:
-    native.genrule(
-        name=name + "_pygenrule",
-        outs=[out],
-        srcs=api_def_srcs,
-        tools=[tool_name] + tf_binary_additional_srcs(),
-        cmd=("$(location " + tool_name + ") " + api_def_args_str + " " +
-             op_list_arg + " " +
-             ("1" if require_shape_functions else "0") + " " +
-             ("1" if op_list_is_whitelist else "0") + " > $@"))
+    if hidden_file:
+        # `hidden_file` is file containing a list of op names to be hidden in the
+        # generated module.
+        native.genrule(
+            name = name + "_pygenrule",
+            outs = [out],
+            srcs = api_def_srcs + [hidden_file],
+            tools = [tool_name] + tf_binary_additional_srcs(),
+            cmd = ("$(location " + tool_name + ") " + api_def_args_str +
+                   " @$(location " + hidden_file + ") " +
+                   ("1" if require_shape_functions else "0") + " > $@"),
+        )
+    else:
+        native.genrule(
+            name = name + "_pygenrule",
+            outs = [out],
+            srcs = api_def_srcs,
+            tools = [tool_name] + tf_binary_additional_srcs(),
+            cmd = ("$(location " + tool_name + ") " + api_def_args_str + " " +
+                   op_list_arg + " " +
+                   ("1" if require_shape_functions else "0") + " " +
+                   ("1" if op_list_is_whitelist else "0") + " > $@"),
+        )
 
-  # Make a py_library out of the generated python file.
-  if not generated_target_name:
-    generated_target_name = name
-  native.py_library(
-      name=generated_target_name,
-      srcs=[out],
-      srcs_version="PY2AND3",
-      visibility=visibility,
-      deps=[
-          clean_dep("//tensorflow/python:framework_for_generated_wrappers_v2"),
-      ],)
+    # Make a py_library out of the generated python file.
+    if not generated_target_name:
+        generated_target_name = name
+    native.py_library(
+        name = generated_target_name,
+        srcs = [out],
+        srcs_version = "PY2AND3",
+        visibility = visibility,
+        deps = [
+            clean_dep("//tensorflow/python:framework_for_generated_wrappers_v2"),
+        ],
+    )
 
 # Define a bazel macro that creates cc_test for tensorflow.
 #
@@ -671,53 +698,54 @@
 #
 # TODO(opensource): we need to enable this to work around the hidden symbol
 # __cudaRegisterFatBinary error. Need more investigations.
-def tf_cc_test(name,
-               srcs,
-               deps,
-               data=[],
-               linkstatic=0,
-               extra_copts=[],
-               suffix="",
-               linkopts=[],
-               nocopts=None,
-               kernels=[],
-               **kwargs):
-  native.cc_test(
-      name="%s%s" % (name, suffix),
-      srcs=srcs + tf_binary_additional_srcs(),
-      copts=tf_copts() + extra_copts,
-      linkopts=select({
-        clean_dep("//tensorflow:android"): [
-            "-pie",
-        ],
-        clean_dep("//tensorflow:windows"): [],
-        clean_dep("//tensorflow:windows_msvc"): [],
-        clean_dep("//tensorflow:darwin"): [
-            "-lm",
-        ],
-        "//conditions:default": [
-            "-lpthread",
-            "-lm"
-        ],
-      }) + linkopts + _rpath_linkopts(name),
-      deps=deps + tf_binary_dynamic_kernel_deps(kernels) + if_mkl(
-          [
-              "//third_party/mkl:intel_binary_blob",
-          ],
-      ),
-      data=data + tf_binary_dynamic_kernel_dsos(kernels),
-      # Nested select() statements seem not to be supported when passed to
-      # linkstatic, and we already have a cuda select() passed in to this
-      # function.
-      linkstatic=linkstatic or select({
-          # cc_tests with ".so"s in srcs incorrectly link on Darwin unless
-          # linkstatic=1 (https://github.com/bazelbuild/bazel/issues/3450).
-          # TODO(allenl): Remove Mac static linking when Bazel 0.6 is out.
-          clean_dep("//tensorflow:darwin"): 1,
-          "//conditions:default": 0,
-      }),
-      nocopts=nocopts,
-      **kwargs)
+def tf_cc_test(
+        name,
+        srcs,
+        deps,
+        data = [],
+        linkstatic = 0,
+        extra_copts = [],
+        suffix = "",
+        linkopts = [],
+        nocopts = None,
+        kernels = [],
+        **kwargs):
+    native.cc_test(
+        name = "%s%s" % (name, suffix),
+        srcs = srcs + tf_binary_additional_srcs(),
+        copts = tf_copts() + extra_copts,
+        linkopts = select({
+            clean_dep("//tensorflow:android"): [
+                "-pie",
+            ],
+            clean_dep("//tensorflow:windows"): [],
+            clean_dep("//tensorflow:darwin"): [
+                "-lm",
+            ],
+            "//conditions:default": [
+                "-lpthread",
+                "-lm",
+            ],
+        }) + linkopts + _rpath_linkopts(name),
+        deps = deps + tf_binary_dynamic_kernel_deps(kernels) + if_mkl_ml(
+            [
+                "//third_party/mkl:intel_binary_blob",
+            ],
+        ),
+        data = data + tf_binary_dynamic_kernel_dsos(kernels),
+        # Nested select() statements seem not to be supported when passed to
+        # linkstatic, and we already have a cuda select() passed in to this
+        # function.
+        linkstatic = linkstatic or select({
+            # cc_tests with ".so"s in srcs incorrectly link on Darwin unless
+            # linkstatic=1 (https://github.com/bazelbuild/bazel/issues/3450).
+            # TODO(allenl): Remove Mac static linking when Bazel 0.6 is out.
+            clean_dep("//tensorflow:darwin"): 1,
+            "//conditions:default": 0,
+        }),
+        nocopts = nocopts,
+        **kwargs
+    )
 
 register_extension_info(
     extension_name = "tf_cc_test",
@@ -726,107 +754,115 @@
 
 # Part of the testing workflow requires a distinguishable name for the build
 # rules that involve a GPU, even if otherwise identical to the base rule.
-def tf_cc_test_gpu(name,
-                   srcs,
-                   deps,
-                   linkstatic=0,
-                   tags=[],
-                   data=[],
-                   size="medium",
-                   suffix="",
-                   args=None):
-  tf_cc_test(
-      name,
-      srcs,
-      deps,
-      linkstatic=linkstatic,
-      tags=tags,
-      data=data,
-      size=size,
-      suffix=suffix,
-      args=args)
+def tf_cc_test_gpu(
+        name,
+        srcs,
+        deps,
+        linkstatic = 0,
+        tags = [],
+        data = [],
+        size = "medium",
+        suffix = "",
+        args = None):
+    tf_cc_test(
+        name,
+        srcs,
+        deps,
+        linkstatic = linkstatic,
+        tags = tags,
+        data = data,
+        size = size,
+        suffix = suffix,
+        args = args,
+    )
 
 register_extension_info(
     extension_name = "tf_cc_test_gpu",
     label_regex_for_dep = "{extension_name}",
 )
 
-def tf_cuda_cc_test(name,
-                    srcs=[],
-                    deps=[],
-                    tags=[],
-                    data=[],
-                    size="medium",
-                    extra_copts=[],
-                    linkstatic=0,
-                    args=[],
-                    linkopts=[]):
-  tf_cc_test(
-      name=name,
-      srcs=srcs,
-      deps=deps,
-      tags=tags + ["manual"],
-      data=data,
-      size=size,
-      extra_copts=extra_copts,
-      linkstatic=linkstatic,
-      linkopts=linkopts,
-      args=args)
-  tf_cc_test(
-      name=name,
-      srcs=srcs,
-      suffix="_gpu",
-      deps=deps + if_cuda([
-          clean_dep("//tensorflow/core:gpu_runtime"),
-      ]),
-      linkstatic=select({
-          # TODO(allenl): Remove Mac static linking when Bazel 0.6 is out.
-          clean_dep("//tensorflow:darwin"): 1,
-          "@local_config_cuda//cuda:using_nvcc": 1,
-          "@local_config_cuda//cuda:using_clang": 1,
-          "//conditions:default": 0,
-      }),
-      tags=tags + tf_cuda_tests_tags(),
-      data=data,
-      size=size,
-      extra_copts=extra_copts,
-      linkopts=linkopts,
-      args=args)
+def tf_cuda_cc_test(
+        name,
+        srcs = [],
+        deps = [],
+        tags = [],
+        data = [],
+        size = "medium",
+        extra_copts = [],
+        linkstatic = 0,
+        args = [],
+        linkopts = []):
+    tf_cc_test(
+        name = name,
+        srcs = srcs,
+        deps = deps,
+        tags = tags + ["manual"],
+        data = data,
+        size = size,
+        extra_copts = extra_copts,
+        linkstatic = linkstatic,
+        linkopts = linkopts,
+        args = args,
+    )
+    tf_cc_test(
+        name = name,
+        srcs = srcs,
+        suffix = "_gpu",
+        deps = deps + if_cuda([
+            clean_dep("//tensorflow/core:gpu_runtime"),
+        ]),
+        linkstatic = select({
+            # TODO(allenl): Remove Mac static linking when Bazel 0.6 is out.
+            clean_dep("//tensorflow:darwin"): 1,
+            "@local_config_cuda//cuda:using_nvcc": 1,
+            "@local_config_cuda//cuda:using_clang": 1,
+            "//conditions:default": 0,
+        }),
+        tags = tags + tf_cuda_tests_tags(),
+        data = data,
+        size = size,
+        extra_copts = extra_copts,
+        linkopts = linkopts,
+        args = args,
+    )
 
 register_extension_info(
     extension_name = "tf_cuda_cc_test",
     label_regex_for_dep = "{extension_name}",
 )
 
-def tf_cuda_only_cc_test(name,
-                    srcs=[],
-                    deps=[],
-                    tags=[],
-                    data=[],
-                    size="medium",
-                    linkstatic=0,
-                    args=[],
-                    kernels=[],
-                    linkopts=[]):
-  native.cc_test(
-      name="%s%s" % (name, "_gpu"),
-      srcs=srcs + tf_binary_additional_srcs(),
-      size=size,
-      args=args,
-      copts= _cuda_copts() + tf_copts(),
-      data=data + tf_binary_dynamic_kernel_dsos(kernels),
-      deps=deps + tf_binary_dynamic_kernel_deps(kernels) + if_cuda([
-          clean_dep("//tensorflow/core:cuda"),
-          clean_dep("//tensorflow/core:gpu_lib")]),
-      linkopts=if_not_windows(["-lpthread", "-lm"]) + linkopts + _rpath_linkopts(name),
-      linkstatic=linkstatic or select({
-          # cc_tests with ".so"s in srcs incorrectly link on Darwin
-          # unless linkstatic=1.
-          # TODO(allenl): Remove Mac static linking when Bazel 0.6 is out.
-          clean_dep("//tensorflow:darwin"): 1,
-          "//conditions:default": 0,
-      }),
-      tags=tags + tf_cuda_tests_tags())
+def tf_cuda_only_cc_test(
+        name,
+        srcs = [],
+        deps = [],
+        tags = [],
+        data = [],
+        size = "medium",
+        linkstatic = 0,
+        args = [],
+        kernels = [],
+        linkopts = []):
+    native.cc_test(
+        name = "%s%s" % (name, "_gpu"),
+        srcs = srcs + tf_binary_additional_srcs(),
+        size = size,
+        args = args,
+        copts = _cuda_copts() + tf_copts(),
+        data = data + tf_binary_dynamic_kernel_dsos(kernels),
+        deps = deps + tf_binary_dynamic_kernel_deps(kernels) + if_cuda([
+            clean_dep("//tensorflow/core:cuda"),
+            clean_dep("//tensorflow/core:gpu_lib"),
+        ]),
+        linkopts = if_not_windows(["-lpthread", "-lm"]) + linkopts + _rpath_linkopts(name),
+        linkstatic = linkstatic or select({
+            # cc_tests with ".so"s in srcs incorrectly link on Darwin
+            # unless linkstatic=1.
+            # TODO(allenl): Remove Mac static linking when Bazel 0.6 is out.
+            clean_dep("//tensorflow:darwin"): 1,
+            "//conditions:default": 0,
+        }),
+        tags = tags + tf_cuda_tests_tags(),
+    )
 
 register_extension_info(
     extension_name = "tf_cuda_only_cc_test",
@@ -834,109 +870,112 @@
 )
 
 # Create a cc_test for each of the tensorflow tests listed in "tests"
-def tf_cc_tests(srcs,
-                deps,
-                name="",
-                linkstatic=0,
-                tags=[],
-                size="medium",
-                args=None,
-                linkopts=[],
-                nocopts=None):
-  for src in srcs:
-    tf_cc_test(
-        name=src_to_test_name(src),
-        srcs=[src],
-        deps=deps,
-        linkstatic=linkstatic,
-        tags=tags,
-        size=size,
-        args=args,
-        linkopts=linkopts,
-        nocopts=nocopts)
+def tf_cc_tests(
+        srcs,
+        deps,
+        name = "",
+        linkstatic = 0,
+        tags = [],
+        size = "medium",
+        args = None,
+        linkopts = [],
+        nocopts = None):
+    for src in srcs:
+        tf_cc_test(
+            name = src_to_test_name(src),
+            srcs = [src],
+            deps = deps,
+            linkstatic = linkstatic,
+            tags = tags,
+            size = size,
+            args = args,
+            linkopts = linkopts,
+            nocopts = nocopts,
+        )
 
-def tf_cc_test_mkl(srcs,
-                   deps,
-                   name="",
-                   data=[],
-                   linkstatic=0,
-                   tags=[],
-                   size="medium",
-                   kernels=[],
-                   args=None):
-  # -fno-exceptions in nocopts breaks compilation if header modules are enabled.
-  disable_header_modules = ["-use_header_modules"]
+def tf_cc_test_mkl(
+        srcs,
+        deps,
+        name = "",
+        data = [],
+        linkstatic = 0,
+        tags = [],
+        size = "medium",
+        kernels = [],
+        args = None):
+    # -fno-exceptions in nocopts breaks compilation if header modules are enabled.
+    disable_header_modules = ["-use_header_modules"]
 
-  for src in srcs:
-    native.cc_test(
-      name=src_to_test_name(src),
-      srcs=if_mkl([src]) + tf_binary_additional_srcs(),
-      copts=tf_copts(),
-      linkopts=select({
-        clean_dep("//tensorflow:android"): [
-            "-pie",
-          ],
-        clean_dep("//tensorflow:windows"): [],
-        clean_dep("//tensorflow:windows_msvc"): [],
-        "//conditions:default": [
-            "-lpthread",
-            "-lm"
-        ],
-      }) + _rpath_linkopts(src_to_test_name(src)),
-      deps=deps + tf_binary_dynamic_kernel_deps(kernels) + if_mkl(
-          [
-              "//third_party/mkl:intel_binary_blob",
-          ],
-      ),
-      data=data + tf_binary_dynamic_kernel_dsos(kernels),
-      linkstatic=linkstatic,
-      tags=tags,
-      size=size,
-      args=args,
-      features=disable_header_modules,
-      nocopts="-fno-exceptions")
+    for src in srcs:
+        native.cc_test(
+            name = src_to_test_name(src),
+            srcs = if_mkl([src]) + tf_binary_additional_srcs(),
+            copts = tf_copts(),
+            linkopts = select({
+                clean_dep("//tensorflow:android"): [
+                    "-pie",
+                ],
+                clean_dep("//tensorflow:windows"): [],
+                "//conditions:default": [
+                    "-lpthread",
+                    "-lm",
+                ],
+            }) + _rpath_linkopts(src_to_test_name(src)),
+            deps = deps + tf_binary_dynamic_kernel_deps(kernels) + mkl_deps(),
+            data = data + tf_binary_dynamic_kernel_dsos(kernels),
+            linkstatic = linkstatic,
+            tags = tags,
+            size = size,
+            args = args,
+            features = disable_header_modules,
+            nocopts = "-fno-exceptions",
+        )
 
+def tf_cc_tests_gpu(
+        srcs,
+        deps,
+        name = "",
+        linkstatic = 0,
+        tags = [],
+        size = "medium",
+        args = None):
+    tf_cc_tests(srcs, deps, linkstatic, tags = tags, size = size, args = args)
 
-def tf_cc_tests_gpu(srcs,
-                    deps,
-                    name="",
-                    linkstatic=0,
-                    tags=[],
-                    size="medium",
-                    args=None):
-  tf_cc_tests(srcs, deps, linkstatic, tags=tags, size=size, args=args)
+def tf_cuda_cc_tests(
+        srcs,
+        deps,
+        name = "",
+        tags = [],
+        size = "medium",
+        linkstatic = 0,
+        args = None,
+        linkopts = []):
+    for src in srcs:
+        tf_cuda_cc_test(
+            name = src_to_test_name(src),
+            srcs = [src],
+            deps = deps,
+            tags = tags,
+            size = size,
+            linkstatic = linkstatic,
+            args = args,
+            linkopts = linkopts,
+        )
 
-def tf_cuda_cc_tests(srcs,
-                     deps,
-                     name="",
-                     tags=[],
-                     size="medium",
-                     linkstatic=0,
-                     args=None,
-                     linkopts=[]):
-  for src in srcs:
-    tf_cuda_cc_test(
-        name=src_to_test_name(src),
-        srcs=[src],
-        deps=deps,
-        tags=tags,
-        size=size,
-        linkstatic=linkstatic,
-        args=args,
-        linkopts=linkopts)
-
-def tf_java_test(name,
-                 srcs=[],
-                 deps=[],
-                 kernels=[],
-                 *args,
-                 **kwargs):
-  native.java_test(
-      name=name,
-      srcs=srcs,
-      deps=deps + tf_binary_additional_srcs() + tf_binary_dynamic_kernel_dsos(kernels) + tf_binary_dynamic_kernel_deps(kernels),
-      *args,
-      **kwargs)
+def tf_java_test(
+        name,
+        srcs = [],
+        deps = [],
+        kernels = [],
+        *args,
+        **kwargs):
+    native.java_test(
+        name = name,
+        srcs = srcs,
+        deps = deps + tf_binary_additional_srcs() + tf_binary_dynamic_kernel_dsos(kernels) + tf_binary_dynamic_kernel_deps(kernels),
+        *args,
+        **kwargs
+    )
 
 register_extension_info(
     extension_name = "tf_java_test",
@@ -944,85 +983,89 @@
 )
 
 def _cuda_copts():
-  """Gets the appropriate set of copts for (maybe) CUDA compilation.
+    """Gets the appropriate set of copts for (maybe) CUDA compilation.
 
-    If we're doing CUDA compilation, returns copts for our particular CUDA
-    compiler.  If we're not doing CUDA compilation, returns an empty list.
+      If we're doing CUDA compilation, returns copts for our particular CUDA
+      compiler.  If we're not doing CUDA compilation, returns an empty list.
 
-    """
-  return cuda_default_copts() + select({
-      "//conditions:default": [],
-      "@local_config_cuda//cuda:using_nvcc": ([
-          "-nvcc_options=relaxed-constexpr",
-          "-nvcc_options=ftz=true",
-      ]),
-      "@local_config_cuda//cuda:using_clang": ([
-          "-fcuda-flush-denormals-to-zero",
-      ]),
-  })
+      """
+    return cuda_default_copts() + select({
+        "//conditions:default": [],
+        "@local_config_cuda//cuda:using_nvcc": ([
+            "-nvcc_options=relaxed-constexpr",
+            "-nvcc_options=ftz=true",
+        ]),
+        "@local_config_cuda//cuda:using_clang": ([
+            "-fcuda-flush-denormals-to-zero",
+        ]),
+    })
 
 # Build defs for TensorFlow kernels
 
 # When this target is built using --config=cuda, a cc_library is built
 # that passes -DGOOGLE_CUDA=1 and '-x cuda', linking in additional
 # libraries needed by GPU kernels.
-def tf_gpu_kernel_library(srcs,
-                          copts=[],
-                          cuda_copts=[],
-                          deps=[],
-                          hdrs=[],
-                          **kwargs):
-  copts = copts + _cuda_copts() + if_cuda(cuda_copts) + tf_copts()
-  kwargs["features"] = kwargs.get("features", []) + ["-use_header_modules"]
+def tf_gpu_kernel_library(
+        srcs,
+        copts = [],
+        cuda_copts = [],
+        deps = [],
+        hdrs = [],
+        **kwargs):
+    copts = copts + _cuda_copts() + if_cuda(cuda_copts) + tf_copts()
+    kwargs["features"] = kwargs.get("features", []) + ["-use_header_modules"]
 
-  native.cc_library(
-      srcs=srcs,
-      hdrs=hdrs,
-      copts=copts,
-      deps=deps + if_cuda([
-          clean_dep("//tensorflow/core:cuda"),
-          clean_dep("//tensorflow/core:gpu_lib"),
-      ]),
-      alwayslink=1,
-      **kwargs)
+    native.cc_library(
+        srcs = srcs,
+        hdrs = hdrs,
+        copts = copts,
+        deps = deps + if_cuda([
+            clean_dep("//tensorflow/core:cuda"),
+            clean_dep("//tensorflow/core:gpu_lib"),
+        ]),
+        alwayslink = 1,
+        **kwargs
+    )
 
 register_extension_info(
     extension_name = "tf_gpu_kernel_library",
     label_regex_for_dep = "{extension_name}",
 )
 
-def tf_cuda_library(deps=None, cuda_deps=None, copts=tf_copts(), **kwargs):
-  """Generate a cc_library with a conditional set of CUDA dependencies.
+def tf_cuda_library(deps = None, cuda_deps = None, copts = tf_copts(), **kwargs):
+    """Generate a cc_library with a conditional set of CUDA dependencies.
 
-  When the library is built with --config=cuda:
+    When the library is built with --config=cuda:
 
-  - Both deps and cuda_deps are used as dependencies.
-  - The cuda runtime is added as a dependency (if necessary).
-  - The library additionally passes -DGOOGLE_CUDA=1 to the list of copts.
-  - In addition, when the library is also built with TensorRT enabled, it
-      additionally passes -DGOOGLE_TENSORRT=1 to the list of copts.
+    - Both deps and cuda_deps are used as dependencies.
+    - The cuda runtime is added as a dependency (if necessary).
+    - The library additionally passes -DGOOGLE_CUDA=1 to the list of copts.
+    - In addition, when the library is also built with TensorRT enabled, it
+        additionally passes -DGOOGLE_TENSORRT=1 to the list of copts.
 
-  Args:
-  - cuda_deps: BUILD dependencies which will be linked if and only if:
-      '--config=cuda' is passed to the bazel command line.
-  - deps: dependencies which will always be linked.
-  - copts: copts always passed to the cc_library.
-  - kwargs: Any other argument to cc_library.
-  """
-  if not deps:
-    deps = []
-  if not cuda_deps:
-    cuda_deps = []
+    Args:
+    - cuda_deps: BUILD dependencies which will be linked if and only if:
+        '--config=cuda' is passed to the bazel command line.
+    - deps: dependencies which will always be linked.
+    - copts: copts always passed to the cc_library.
+    - kwargs: Any other argument to cc_library.
+    """
+    if not deps:
+        deps = []
+    if not cuda_deps:
+        cuda_deps = []
 
-  kwargs["features"] = kwargs.get("features", []) + ["-use_header_modules"]
-  native.cc_library(
-      deps=deps + if_cuda(cuda_deps + [
-          clean_dep("//tensorflow/core:cuda"),
-          "@local_config_cuda//cuda:cuda_headers"
-      ]),
-      copts=(copts + if_cuda(["-DGOOGLE_CUDA=1"]) + if_mkl(["-DINTEL_MKL=1"]) +
-             if_tensorrt(["-DGOOGLE_TENSORRT=1"])),
-      **kwargs)
+    kwargs["features"] = kwargs.get("features", []) + ["-use_header_modules"]
+    native.cc_library(
+        deps = deps + if_cuda(cuda_deps + [
+            clean_dep("//tensorflow/core:cuda"),
+            "@local_config_cuda//cuda:cuda_headers",
+        ]),
+        copts = (copts + if_cuda(["-DGOOGLE_CUDA=1"]) + if_mkl(["-DINTEL_MKL=1"]) +
+                 if_mkl_open_source_only(["-DINTEL_MKL_DNN_ONLY"]) +
+                 if_tensorrt(["-DGOOGLE_TENSORRT=1"])),
+        **kwargs
+    )
 
 register_extension_info(
     extension_name = "tf_cuda_library",
@@ -1040,126 +1083,138 @@
         copts = None,
         is_external = False,
         **kwargs):
-  """A rule to build a TensorFlow OpKernel.
+    """A rule to build a TensorFlow OpKernel.
 
-  May either specify srcs/hdrs or prefix.  Similar to tf_cuda_library,
-  but with alwayslink=1 by default.  If prefix is specified:
-    * prefix*.cc (except *.cu.cc) is added to srcs
-    * prefix*.h (except *.cu.h) is added to hdrs
-    * prefix*.cu.cc and prefix*.h (including *.cu.h) are added to gpu_srcs.
-  With the exception that test files are excluded.
-  For example, with prefix = "cast_op",
-    * srcs = ["cast_op.cc"]
-    * hdrs = ["cast_op.h"]
-    * gpu_srcs = ["cast_op_gpu.cu.cc", "cast_op.h"]
-    * "cast_op_test.cc" is excluded
-  With prefix = "cwise_op"
-    * srcs = ["cwise_op_abs.cc", ..., "cwise_op_tanh.cc"],
-    * hdrs = ["cwise_ops.h", "cwise_ops_common.h"],
-    * gpu_srcs = ["cwise_op_gpu_abs.cu.cc", ..., "cwise_op_gpu_tanh.cu.cc",
-                  "cwise_ops.h", "cwise_ops_common.h",
-                  "cwise_ops_gpu_common.cu.h"]
-    * "cwise_ops_test.cc" is excluded
-  """
-  if not srcs:
-    srcs = []
-  if not hdrs:
-    hdrs = []
-  if not deps:
-    deps = []
-  if not copts:
-    copts = []
-  textual_hdrs = []
-  copts = copts + tf_copts(is_external=is_external)
-  if prefix:
-    if native.glob([prefix + "*.cu.cc"], exclude=["*test*"]):
-      if not gpu_srcs:
-        gpu_srcs = []
-      gpu_srcs = gpu_srcs + native.glob(
-          [prefix + "*.cu.cc", prefix + "*.h"], exclude=[prefix + "*test*"])
-    srcs = srcs + native.glob(
-        [prefix + "*.cc"], exclude=[prefix + "*test*", prefix + "*.cu.cc"])
-    hdrs = hdrs + native.glob(
+    May either specify srcs/hdrs or prefix.  Similar to tf_cuda_library,
+    but with alwayslink=1 by default.  If prefix is specified:
+      * prefix*.cc (except *.cu.cc) is added to srcs
+      * prefix*.h (except *.cu.h) is added to hdrs
+      * prefix*.cu.cc and prefix*.h (including *.cu.h) are added to gpu_srcs.
+    With the exception that test files are excluded.
+    For example, with prefix = "cast_op",
+      * srcs = ["cast_op.cc"]
+      * hdrs = ["cast_op.h"]
+      * gpu_srcs = ["cast_op_gpu.cu.cc", "cast_op.h"]
+      * "cast_op_test.cc" is excluded
+    With prefix = "cwise_op"
+      * srcs = ["cwise_op_abs.cc", ..., "cwise_op_tanh.cc"],
+      * hdrs = ["cwise_ops.h", "cwise_ops_common.h"],
+      * gpu_srcs = ["cwise_op_gpu_abs.cu.cc", ..., "cwise_op_gpu_tanh.cu.cc",
+                    "cwise_ops.h", "cwise_ops_common.h",
+                    "cwise_ops_gpu_common.cu.h"]
+      * "cwise_ops_test.cc" is excluded
+    """
+    if not srcs:
+        srcs = []
+    if not hdrs:
+        hdrs = []
+    if not deps:
+        deps = []
+    if not copts:
+        copts = []
+    textual_hdrs = []
+    copts = copts + tf_copts(is_external = is_external)
+    if prefix:
+        if native.glob([prefix + "*.cu.cc"], exclude = ["*test*"]):
+            if not gpu_srcs:
+                gpu_srcs = []
+            gpu_srcs = gpu_srcs + native.glob(
+                [prefix + "*.cu.cc", prefix + "*.h"],
+                exclude = [prefix + "*test*"],
+            )
+        srcs = srcs + native.glob(
+            [prefix + "*.cc"],
+            exclude = [prefix + "*test*", prefix + "*.cu.cc"],
+        )
+        hdrs = hdrs + native.glob(
             [prefix + "*.h"],
             exclude = [prefix + "*test*", prefix + "*.cu.h", prefix + "*impl.h"],
         )
-    textual_hdrs = native.glob(
+        textual_hdrs = native.glob(
             [prefix + "*impl.h"],
             exclude = [prefix + "*test*", prefix + "*.cu.h"],
         )
-  cuda_deps = [clean_dep("//tensorflow/core:gpu_lib")]
-  if gpu_srcs:
-    for gpu_src in gpu_srcs:
-      if gpu_src.endswith(".cc") and not gpu_src.endswith(".cu.cc"):
-        fail("{} not allowed in gpu_srcs. .cc sources must end with .cu.cc".
-             format(gpu_src))
-    tf_gpu_kernel_library(
-        name=name + "_gpu", srcs=gpu_srcs, deps=deps, **kwargs)
-    cuda_deps.extend([":" + name + "_gpu"])
-  kwargs["tags"] = kwargs.get("tags", []) + [
-      "req_dep=%s" % clean_dep("//tensorflow/core:gpu_lib"),
-      "req_dep=@local_config_cuda//cuda:cuda_headers",
-  ]
-  tf_cuda_library(
-      name=name,
-      srcs=srcs,
-      hdrs=hdrs,
-      textual_hdrs = textual_hdrs,
-      copts=copts,
-      cuda_deps=cuda_deps,
-      linkstatic=1,  # Needed since alwayslink is broken in bazel b/27630669
-      alwayslink=alwayslink,
-      deps=deps,
-      **kwargs)
+    cuda_deps = [clean_dep("//tensorflow/core:gpu_lib")]
+    if gpu_srcs:
+        for gpu_src in gpu_srcs:
+            if gpu_src.endswith(".cc") and not gpu_src.endswith(".cu.cc"):
+                fail("{} not allowed in gpu_srcs. .cc sources must end with .cu.cc"
+                    .format(gpu_src))
+        tf_gpu_kernel_library(
+            name = name + "_gpu",
+            srcs = gpu_srcs,
+            deps = deps,
+            **kwargs
+        )
+        cuda_deps.extend([":" + name + "_gpu"])
+    kwargs["tags"] = kwargs.get("tags", []) + [
+        "req_dep=%s" % clean_dep("//tensorflow/core:gpu_lib"),
+        "req_dep=@local_config_cuda//cuda:cuda_headers",
+    ]
+    tf_cuda_library(
+        name = name,
+        srcs = srcs,
+        hdrs = hdrs,
+        textual_hdrs = textual_hdrs,
+        copts = copts,
+        cuda_deps = cuda_deps,
+        linkstatic = 1,  # Needed since alwayslink is broken in bazel b/27630669
+        alwayslink = alwayslink,
+        deps = deps,
+        **kwargs
+    )
 
-  # TODO(gunan): CUDA dependency not clear here. Fix it.
-  tf_cc_shared_object(
-      name="libtfkernel_%s.so" % name,
-      srcs=srcs + hdrs,
-      copts=copts,
-      deps=deps,
-      tags=["manual", "notap"])
-
+    # TODO(gunan): CUDA dependency not clear here. Fix it.
+    tf_cc_shared_object(
+        name = "libtfkernel_%s.so" % name,
+        srcs = srcs + hdrs,
+        copts = copts,
+        deps = deps,
+        tags = ["manual", "notap"],
+    )
 
 register_extension_info(
     extension_name = "tf_kernel_library",
     label_regex_for_dep = "{extension_name}(_gpu)?",
 )
 
-def tf_mkl_kernel_library(name,
-                          prefix=None,
-                          srcs=None,
-                          hdrs=None,
-                          deps=None,
-                          alwayslink=1,
-                          copts=tf_copts(),
-                          nocopts="-fno-exceptions"):
-  """A rule to build MKL-based TensorFlow kernel libraries."""
+def tf_mkl_kernel_library(
+        name,
+        prefix = None,
+        srcs = None,
+        hdrs = None,
+        deps = None,
+        alwayslink = 1,
+        copts = tf_copts(),
+        nocopts = "-fno-exceptions"):
+    """A rule to build MKL-based TensorFlow kernel libraries."""
 
-  if not bool(srcs):
-    srcs = []
-  if not bool(hdrs):
-    hdrs = []
+    if not bool(srcs):
+        srcs = []
+    if not bool(hdrs):
+        hdrs = []
 
-  if prefix:
-    srcs = srcs + native.glob(
-        [prefix + "*.cc"])
-    hdrs = hdrs + native.glob(
-        [prefix + "*.h"])
+    if prefix:
+        srcs = srcs + native.glob(
+            [prefix + "*.cc"],
+        )
+        hdrs = hdrs + native.glob(
+            [prefix + "*.h"],
+        )
 
-  # -fno-exceptions in nocopts breaks compilation if header modules are enabled.
-  disable_header_modules = ["-use_header_modules"]
+    # -fno-exceptions in nocopts breaks compilation if header modules are enabled.
+    disable_header_modules = ["-use_header_modules"]
 
-  native.cc_library(
-      name=name,
-      srcs=if_mkl(srcs),
-      hdrs=hdrs,
-      deps=deps,
-      alwayslink=alwayslink,
-      copts=copts,
-      nocopts=nocopts,
-      features = disable_header_modules
-  )
+    native.cc_library(
+        name = name,
+        srcs = if_mkl(srcs),
+        hdrs = hdrs,
+        deps = deps,
+        alwayslink = alwayslink,
+        copts = copts,
+        nocopts = nocopts,
+        features = disable_header_modules,
+    )
 
 register_extension_info(
     extension_name = "tf_mkl_kernel_library",
@@ -1168,35 +1223,42 @@
 
 # Bazel rules for building swig files.
 def _py_wrap_cc_impl(ctx):
-  srcs = ctx.files.srcs
-  if len(srcs) != 1:
-    fail("Exactly one SWIG source file label must be specified.", "srcs")
-  module_name = ctx.attr.module_name
-  src = ctx.files.srcs[0]
-  inputs = depset([src])
-  inputs += ctx.files.swig_includes
-  for dep in ctx.attr.deps:
-    inputs += dep.cc.transitive_headers
-  inputs += ctx.files._swiglib
-  inputs += ctx.files.toolchain_deps
-  swig_include_dirs = depset(_get_repository_roots(ctx, inputs))
-  swig_include_dirs += sorted([f.dirname for f in ctx.files._swiglib])
-  args = [
-      "-c++", "-python", "-module", module_name, "-o", ctx.outputs.cc_out.path,
-      "-outdir", ctx.outputs.py_out.dirname
-  ]
-  args += ["-l" + f.path for f in ctx.files.swig_includes]
-  args += ["-I" + i for i in swig_include_dirs]
-  args += [src.path]
-  outputs = [ctx.outputs.cc_out, ctx.outputs.py_out]
-  ctx.action(
-      executable=ctx.executable._swig,
-      arguments=args,
-      inputs=list(inputs),
-      outputs=outputs,
-      mnemonic="PythonSwig",
-      progress_message="SWIGing " + src.path)
-  return struct(files=depset(outputs))
+    srcs = ctx.files.srcs
+    if len(srcs) != 1:
+        fail("Exactly one SWIG source file label must be specified.", "srcs")
+    module_name = ctx.attr.module_name
+    src = ctx.files.srcs[0]
+    inputs = depset([src])
+    inputs += ctx.files.swig_includes
+    for dep in ctx.attr.deps:
+        inputs += dep.cc.transitive_headers
+    inputs += ctx.files._swiglib
+    inputs += ctx.files.toolchain_deps
+    swig_include_dirs = depset(_get_repository_roots(ctx, inputs))
+    swig_include_dirs += sorted([f.dirname for f in ctx.files._swiglib])
+    args = [
+        "-c++",
+        "-python",
+        "-module",
+        module_name,
+        "-o",
+        ctx.outputs.cc_out.path,
+        "-outdir",
+        ctx.outputs.py_out.dirname,
+    ]
+    args += ["-l" + f.path for f in ctx.files.swig_includes]
+    args += ["-I" + i for i in swig_include_dirs]
+    args += [src.path]
+    outputs = [ctx.outputs.cc_out, ctx.outputs.py_out]
+    ctx.action(
+        executable = ctx.executable._swig,
+        arguments = args,
+        inputs = list(inputs),
+        outputs = outputs,
+        mnemonic = "PythonSwig",
+        progress_message = "SWIGing " + src.path,
+    )
+    return struct(files = depset(outputs))
 
 _py_wrap_cc = rule(
     attrs = {
@@ -1234,40 +1296,40 @@
 )
 
 def _get_repository_roots(ctx, files):
-  """Returns abnormal root directories under which files reside.
+    """Returns abnormal root directories under which files reside.
 
-  When running a ctx.action, source files within the main repository are all
-  relative to the current directory; however, files that are generated or exist
-  in remote repositories will have their root directory be a subdirectory,
-  e.g. bazel-out/local-fastbuild/genfiles/external/jpeg_archive. This function
-  returns the set of these devious directories, ranked and sorted by popularity
-  in order to hopefully minimize the number of I/O system calls within the
-  compiler, because includes have quadratic complexity.
-  """
-  result = {}
-  for f in files:
-    root = f.root.path
-    if root:
-      if root not in result:
-        result[root] = 0
-      result[root] -= 1
-    work = f.owner.workspace_root
-    if work:
-      if root:
-        root += "/"
-      root += work
-    if root:
-      if root not in result:
-        result[root] = 0
-      result[root] -= 1
-  return [k for v, k in sorted([(v, k) for k, v in result.items()])]
+    When running a ctx.action, source files within the main repository are all
+    relative to the current directory; however, files that are generated or exist
+    in remote repositories will have their root directory be a subdirectory,
+    e.g. bazel-out/local-fastbuild/genfiles/external/jpeg_archive. This function
+    returns the set of these devious directories, ranked and sorted by popularity
+    in order to hopefully minimize the number of I/O system calls within the
+    compiler, because includes have quadratic complexity.
+    """
+    result = {}
+    for f in files:
+        root = f.root.path
+        if root:
+            if root not in result:
+                result[root] = 0
+            result[root] -= 1
+        work = f.owner.workspace_root
+        if work:
+            if root:
+                root += "/"
+            root += work
+        if root:
+            if root not in result:
+                result[root] = 0
+            result[root] -= 1
+    return [k for v, k in sorted([(v, k) for k, v in result.items()])]
 
 # Bazel rule for collecting the header files that a target depends on.
 def _transitive_hdrs_impl(ctx):
-  outputs = depset()
-  for dep in ctx.attr.deps:
-    outputs += dep.cc.transitive_headers
-  return struct(files=outputs)
+    outputs = depset()
+    for dep in ctx.attr.deps:
+        outputs += dep.cc.transitive_headers
+    return struct(files = outputs)
 
 _transitive_hdrs = rule(
     attrs = {
@@ -1279,52 +1341,54 @@
     implementation = _transitive_hdrs_impl,
 )
 
-def transitive_hdrs(name, deps=[], **kwargs):
-  _transitive_hdrs(name=name + "_gather", deps=deps)
-  native.filegroup(name=name, srcs=[":" + name + "_gather"])
+def transitive_hdrs(name, deps = [], **kwargs):
+    _transitive_hdrs(name = name + "_gather", deps = deps)
+    native.filegroup(name = name, srcs = [":" + name + "_gather"])
 
 # Create a header only library that includes all the headers exported by
 # the libraries in deps.
-def cc_header_only_library(name, deps=[], includes=[], **kwargs):
-  _transitive_hdrs(name=name + "_gather", deps=deps)
-  native.cc_library(name=name,
-                    hdrs=[":" + name + "_gather"],
-                    includes=includes,
-                    **kwargs)
+def cc_header_only_library(name, deps = [], includes = [], **kwargs):
+    _transitive_hdrs(name = name + "_gather", deps = deps)
+    native.cc_library(
+        name = name,
+        hdrs = [":" + name + "_gather"],
+        includes = includes,
+        **kwargs
+    )
 
 def tf_custom_op_library_additional_deps():
-  return [
+    return [
       "@protobuf_archive//:protobuf_headers",
-      clean_dep("//third_party/eigen3"),
-      clean_dep("//tensorflow/core:framework_headers_lib"),
-  ] + if_windows(["//tensorflow/python:pywrap_tensorflow_import_lib"])
+        clean_dep("//third_party/eigen3"),
+        clean_dep("//tensorflow/core:framework_headers_lib"),
+    ] + if_windows(["//tensorflow/python:pywrap_tensorflow_import_lib"])
 
 # A list of targets that contains the implemenation of
 # tf_custom_op_library_additional_deps. It's used to generate a DEF file for
 # exporting symbols from _pywrap_tensorflow.dll on Windows.
 def tf_custom_op_library_additional_deps_impl():
-  return [
+    return [
       "@protobuf_archive//:protobuf",
       "@nsync//:nsync_cpp",
-      # for //third_party/eigen3
-      clean_dep("//third_party/eigen3"),
-      # for //tensorflow/core:framework_headers_lib
-      clean_dep("//tensorflow/core:framework"),
-      clean_dep("//tensorflow/core:reader_base"),
-  ]
+        # for //third_party/eigen3
+        clean_dep("//third_party/eigen3"),
+        # for //tensorflow/core:framework_headers_lib
+        clean_dep("//tensorflow/core:framework"),
+        clean_dep("//tensorflow/core:reader_base"),
+    ]
 
 # Traverse the dependency graph along the "deps" attribute of the
 # target and return a struct with one field called 'tf_collected_deps'.
 # tf_collected_deps will be the union of the deps of the current target
 # and the tf_collected_deps of the dependencies of this target.
 def _collect_deps_aspect_impl(target, ctx):
-  alldeps = depset()
-  if hasattr(ctx.rule.attr, "deps"):
-    for dep in ctx.rule.attr.deps:
-      alldeps = alldeps | depset([dep.label])
-      if hasattr(dep, "tf_collected_deps"):
-        alldeps = alldeps | dep.tf_collected_deps
-  return struct(tf_collected_deps=alldeps)
+    alldeps = depset()
+    if hasattr(ctx.rule.attr, "deps"):
+        for dep in ctx.rule.attr.deps:
+            alldeps = alldeps | depset([dep.label])
+            if hasattr(dep, "tf_collected_deps"):
+                alldeps = alldeps | dep.tf_collected_deps
+    return struct(tf_collected_deps = alldeps)
 
 collect_deps_aspect = aspect(
     attr_aspects = ["deps"],
@@ -1332,24 +1396,26 @@
 )
 
 def _dep_label(dep):
-  label = dep.label
-  return label.package + ":" + label.name
+    label = dep.label
+    return label.package + ":" + label.name
 
 # This rule checks that the transitive dependencies of targets listed
 # in the 'deps' attribute don't depend on the targets listed in
 # the 'disallowed_deps' attribute.
 def _check_deps_impl(ctx):
-  disallowed_deps = ctx.attr.disallowed_deps
-  for input_dep in ctx.attr.deps:
-    if not hasattr(input_dep, "tf_collected_deps"):
-      continue
-    for dep in input_dep.tf_collected_deps:
-      for disallowed_dep in disallowed_deps:
-        if dep == disallowed_dep.label:
-          fail(
-              _dep_label(input_dep) + " cannot depend on " + _dep_label(
-                  disallowed_dep))
-  return struct()
+    disallowed_deps = ctx.attr.disallowed_deps
+    for input_dep in ctx.attr.deps:
+        if not hasattr(input_dep, "tf_collected_deps"):
+            continue
+        for dep in input_dep.tf_collected_deps:
+            for disallowed_dep in disallowed_deps:
+                if dep == disallowed_dep.label:
+                    fail(
+                        _dep_label(input_dep) + " cannot depend on " + _dep_label(
+                            disallowed_dep,
+                        ),
+                    )
+    return struct()
 
 check_deps = rule(
     _check_deps_impl,
@@ -1368,66 +1434,70 @@
 
 # Helper to build a dynamic library (.so) from the sources containing
 # implementations of custom ops and kernels.
-def tf_custom_op_library(name, srcs=[], gpu_srcs=[], deps=[], linkopts=[]):
-  cuda_deps = [
-      clean_dep("//tensorflow/core:stream_executor_headers_lib"),
-      "@local_config_cuda//cuda:cuda_headers",
-      "@local_config_cuda//cuda:cudart_static",
-  ]
-  deps = deps + tf_custom_op_library_additional_deps()
-  if gpu_srcs:
-    basename = name.split(".")[0]
-    native.cc_library(
-        name=basename + "_gpu",
-        srcs=gpu_srcs,
-        copts=_cuda_copts() + if_tensorrt(["-DGOOGLE_TENSORRT=1"]),
-        features = if_cuda(["-use_header_modules"]),
-        deps=deps + if_cuda(cuda_deps))
-    cuda_deps.extend([":" + basename + "_gpu"])
+def tf_custom_op_library(name, srcs = [], gpu_srcs = [], deps = [], linkopts = []):
+    cuda_deps = [
+        clean_dep("//tensorflow/core:stream_executor_headers_lib"),
+        "@local_config_cuda//cuda:cuda_headers",
+        "@local_config_cuda//cuda:cudart_static",
+    ]
+    deps = deps + tf_custom_op_library_additional_deps()
+    if gpu_srcs:
+        basename = name.split(".")[0]
+        native.cc_library(
+            name = basename + "_gpu",
+            srcs = gpu_srcs,
+            copts = _cuda_copts() + if_tensorrt(["-DGOOGLE_TENSORRT=1"]),
+            features = if_cuda(["-use_header_modules"]),
+            deps = deps + if_cuda(cuda_deps),
+        )
+        cuda_deps.extend([":" + basename + "_gpu"])
 
-  check_deps(
-      name=name + "_check_deps",
-      deps=deps + if_cuda(cuda_deps),
-      disallowed_deps=[
-          clean_dep("//tensorflow/core:framework"),
-          clean_dep("//tensorflow/core:lib")
-      ])
-  tf_cc_shared_object(
-      name=name,
-      srcs=srcs,
-      deps=deps + if_cuda(cuda_deps),
-      data=if_static([name + "_check_deps"]),
-      copts=tf_copts(is_external=True),
-      features = ["windows_export_all_symbols"],
-      linkopts=linkopts + select({
-          "//conditions:default": [
-              "-lm",
-          ],
-          clean_dep("//tensorflow:windows"): [],
-          clean_dep("//tensorflow:windows_msvc"): [],
-          clean_dep("//tensorflow:darwin"): [],
-      }),)
+    check_deps(
+        name = name + "_check_deps",
+        deps = deps + if_cuda(cuda_deps),
+        disallowed_deps = [
+            clean_dep("//tensorflow/core:framework"),
+            clean_dep("//tensorflow/core:lib"),
+        ],
+    )
+    tf_cc_shared_object(
+        name = name,
+        srcs = srcs,
+        deps = deps + if_cuda(cuda_deps),
+        data = if_static([name + "_check_deps"]),
+        copts = tf_copts(is_external = True),
+        features = ["windows_export_all_symbols"],
+        linkopts = linkopts + select({
+            "//conditions:default": [
+                "-lm",
+            ],
+            clean_dep("//tensorflow:windows"): [],
+            clean_dep("//tensorflow:darwin"): [],
+        }),
+    )
 
 register_extension_info(
     extension_name = "tf_custom_op_library",
     label_regex_for_dep = "{extension_name}",
 )
 
-def tf_custom_op_py_library(name,
-                            srcs=[],
-                            dso=[],
-                            kernels=[],
-                            srcs_version="PY2AND3",
-                            visibility=None,
-                            deps=[]):
-  kernels = kernels  # unused argument
-  native.py_library(
-      name=name,
-      data=dso,
-      srcs=srcs,
-      srcs_version=srcs_version,
-      visibility=visibility,
-      deps=deps,)
+def tf_custom_op_py_library(
+        name,
+        srcs = [],
+        dso = [],
+        kernels = [],
+        srcs_version = "PY2AND3",
+        visibility = None,
+        deps = []):
+    kernels = kernels  # unused argument
+    native.py_library(
+        name = name,
+        data = dso,
+        srcs = srcs,
+        srcs_version = srcs_version,
+        visibility = visibility,
+        deps = deps,
+    )
 
 register_extension_info(
     extension_name = "tf_custom_op_py_library",
@@ -1441,119 +1511,127 @@
 # This function attempts to append init_module_name to list of
 # exported functions in version script
 def _append_init_to_versionscript_impl(ctx):
-  mod_name = ctx.attr.module_name
-  if ctx.attr.is_version_script:
-    ctx.actions.expand_template(
-      template=ctx.file.template_file,
-      output=ctx.outputs.versionscript,
-      substitutions={
-        "global:":"global:\n     init_%s;\n     PyInit_*;"%(mod_name),
-      },
-      is_executable=False,
-    )
-  else:
-    ctx.actions.expand_template(
-      template=ctx.file.template_file,
-      output=ctx.outputs.versionscript,
-      substitutions={
-        "*tensorflow*":"*tensorflow*\ninit_%s\nPyInit_*\n"%(mod_name),
-      },
-      is_executable=False,
-    )
+    mod_name = ctx.attr.module_name
+    if ctx.attr.is_version_script:
+        ctx.actions.expand_template(
+            template = ctx.file.template_file,
+            output = ctx.outputs.versionscript,
+            substitutions = {
+                "global:": "global:\n     init_%s;\n     PyInit_*;" % (mod_name),
+            },
+            is_executable = False,
+        )
+    else:
+        ctx.actions.expand_template(
+            template = ctx.file.template_file,
+            output = ctx.outputs.versionscript,
+            substitutions = {
+                "*tensorflow*": "*tensorflow*\ninit_%s\nPyInit_*\n" % (mod_name),
+            },
+            is_executable = False,
+        )
 
-
-_append_init_to_versionscript= rule(
-  implementation=_append_init_to_versionscript_impl,
-  attrs={
-    "module_name":attr.string(mandatory=True),
-    "template_file":attr.label(allow_files=True,single_file=True,mandatory=True),
-    "is_version_script":attr.bool(default=True,
-      doc='whether target is a ld version script or exported symbol list',
-      mandatory=False),
-  },
-  outputs={"versionscript":"%{name}.lds"},
+_append_init_to_versionscript = rule(
+    implementation = _append_init_to_versionscript_impl,
+    attrs = {
+        "module_name": attr.string(mandatory = True),
+        "template_file": attr.label(allow_files = True, single_file = True, mandatory = True),
+        "is_version_script": attr.bool(
+            default = True,
+            doc = "whether target is a ld version script or exported symbol list",
+            mandatory = False,
+        ),
+    },
+    outputs = {"versionscript": "%{name}.lds"},
 )
 
-def tf_py_wrap_cc(name,
-                             srcs,
-                             swig_includes=[],
-                             deps=[],
-                             copts=[],
-                             **kwargs):
-  module_name = name.split("/")[-1]
-  # Convert a rule name such as foo/bar/baz to foo/bar/_baz.so
-  # and use that as the name for the rule producing the .so file.
-  cc_library_name = "/".join(name.split("/")[:-1] + ["_" + module_name + ".so"])
-  cc_library_pyd_name = "/".join(
-      name.split("/")[:-1] + ["_" + module_name + ".pyd"])
-  extra_deps = []
-  _py_wrap_cc(
-      name=name + "_py_wrap",
-      srcs=srcs,
-      swig_includes=swig_includes,
-      deps=deps + extra_deps,
-      toolchain_deps=["@bazel_tools//tools/cpp:current_cc_toolchain"],
-      module_name=module_name,
-      py_module_name=name)
-  vscriptname=name+"_versionscript"
-  _append_init_to_versionscript(
-      name=vscriptname,
-      module_name=module_name,
-      is_version_script=select({
-          "@local_config_cuda//cuda:darwin":False,
-          "//conditions:default":True,
-          }),
-      template_file=select({
-          "@local_config_cuda//cuda:darwin":clean_dep("//tensorflow:tf_exported_symbols.lds"),
-          "//conditions:default":clean_dep("//tensorflow:tf_version_script.lds")
-      })
-  )
-  extra_linkopts = select({
-      "@local_config_cuda//cuda:darwin": [
-          "-Wl,-exported_symbols_list",
-          "$(location %s.lds)"%vscriptname,
-      ],
-      clean_dep("//tensorflow:windows"): [],
-      clean_dep("//tensorflow:windows_msvc"): [],
-      "//conditions:default": [
-          "-Wl,--version-script",
-          "$(location %s.lds)"%vscriptname,
-      ]
-  })
-  extra_deps += select({
-      "@local_config_cuda//cuda:darwin": [
-          "%s.lds"%vscriptname,
-      ],
-      clean_dep("//tensorflow:windows"): [],
-      clean_dep("//tensorflow:windows_msvc"): [],
-      "//conditions:default": [
-          "%s.lds"%vscriptname,
-      ]
-  })
+def tf_py_wrap_cc(
+        name,
+        srcs,
+        swig_includes = [],
+        deps = [],
+        copts = [],
+        **kwargs):
+    module_name = name.split("/")[-1]
 
-  tf_cc_shared_object(
-      name=cc_library_name,
-      srcs=[module_name + ".cc"],
-      copts=copts + if_not_windows([
-          "-Wno-self-assign", "-Wno-sign-compare", "-Wno-write-strings"
-      ]),
-      linkopts=extra_linkopts,
-      linkstatic=1,
-      deps=deps + extra_deps,
-      **kwargs)
-  native.genrule(
-      name="gen_" + cc_library_pyd_name,
-      srcs=[":" + cc_library_name],
-      outs=[cc_library_pyd_name],
-      cmd="cp $< $@",)
-  native.py_library(
-      name=name,
-      srcs=[":" + name + ".py"],
-      srcs_version="PY2AND3",
-      data=select({
-          clean_dep("//tensorflow:windows"): [":" + cc_library_pyd_name],
-          "//conditions:default": [":" + cc_library_name],
-      }))
+    # Convert a rule name such as foo/bar/baz to foo/bar/_baz.so
+    # and use that as the name for the rule producing the .so file.
+    cc_library_name = "/".join(name.split("/")[:-1] + ["_" + module_name + ".so"])
+    cc_library_pyd_name = "/".join(
+        name.split("/")[:-1] + ["_" + module_name + ".pyd"],
+    )
+    extra_deps = []
+    _py_wrap_cc(
+        name = name + "_py_wrap",
+        srcs = srcs,
+        swig_includes = swig_includes,
+        deps = deps + extra_deps,
+        toolchain_deps = ["@bazel_tools//tools/cpp:current_cc_toolchain"],
+        module_name = module_name,
+        py_module_name = name,
+    )
+    vscriptname = name + "_versionscript"
+    _append_init_to_versionscript(
+        name = vscriptname,
+        module_name = module_name,
+        is_version_script = select({
+            "@local_config_cuda//cuda:darwin": False,
+            "//conditions:default": True,
+        }),
+        template_file = select({
+            "@local_config_cuda//cuda:darwin": clean_dep("//tensorflow:tf_exported_symbols.lds"),
+            "//conditions:default": clean_dep("//tensorflow:tf_version_script.lds"),
+        }),
+    )
+    extra_linkopts = select({
+        "@local_config_cuda//cuda:darwin": [
+            "-Wl,-exported_symbols_list",
+            "$(location %s.lds)" % vscriptname,
+        ],
+        clean_dep("//tensorflow:windows"): [],
+        "//conditions:default": [
+            "-Wl,--version-script",
+            "$(location %s.lds)" % vscriptname,
+        ],
+    })
+    extra_deps += select({
+        "@local_config_cuda//cuda:darwin": [
+            "%s.lds" % vscriptname,
+        ],
+        clean_dep("//tensorflow:windows"): [],
+        "//conditions:default": [
+            "%s.lds" % vscriptname,
+        ],
+    })
+
+    tf_cc_shared_object(
+        name = cc_library_name,
+        srcs = [module_name + ".cc"],
+        copts = copts + if_not_windows([
+            "-Wno-self-assign",
+            "-Wno-sign-compare",
+            "-Wno-write-strings",
+        ]),
+        linkopts = extra_linkopts,
+        linkstatic = 1,
+        deps = deps + extra_deps,
+        **kwargs
+    )
+    native.genrule(
+        name = "gen_" + cc_library_pyd_name,
+        srcs = [":" + cc_library_name],
+        outs = [cc_library_pyd_name],
+        cmd = "cp $< $@",
+    )
+    native.py_library(
+        name = name,
+        srcs = [":" + name + ".py"],
+        srcs_version = "PY2AND3",
+        data = select({
+            clean_dep("//tensorflow:windows"): [":" + cc_library_pyd_name],
+            "//conditions:default": [":" + cc_library_name],
+        }),
+    )
 
 # This macro is for running python tests against system installed pip package
 # on Windows.
@@ -1571,246 +1649,263 @@
 #    Note that this only works on Windows. See the definition of
 #    //third_party/tensorflow/tools/pip_package:win_pip_package_marker for specific reasons.
 # 2. When --define=no_tensorflow_py_deps=false (by default), it's a normal py_test.
-def py_test(deps=[], data=[], **kwargs):
-  native.py_test(
-      # TODO(jlebar): Ideally we'd use tcmalloc here.,
-      deps=select({
-          "//conditions:default": deps,
-          clean_dep("//tensorflow:no_tensorflow_py_deps"): [],
-      }),
-      data = data + select({
-          "//conditions:default": [],
-          clean_dep("//tensorflow:no_tensorflow_py_deps"):
-          ["//tensorflow/tools/pip_package:win_pip_package_marker"],
-      }),
-      **kwargs)
+def py_test(deps = [], data = [], **kwargs):
+    native.py_test(
+        # TODO(jlebar): Ideally we'd use tcmalloc here.,
+        deps = select({
+            "//conditions:default": deps,
+            clean_dep("//tensorflow:no_tensorflow_py_deps"): [],
+        }),
+        data = data + select({
+            "//conditions:default": [],
+            clean_dep("//tensorflow:no_tensorflow_py_deps"): ["//tensorflow/tools/pip_package:win_pip_package_marker"],
+        }),
+        **kwargs
+    )
 
 register_extension_info(
     extension_name = "py_test",
     label_regex_for_dep = "{extension_name}",
 )
 
-def tf_py_test(name,
-               srcs,
-               size="medium",
-               data=[],
-               main=None,
-               args=[],
-               tags=[],
-               shard_count=1,
-               additional_deps=[],
-               flaky=0,
-               xla_enabled=False,
-               grpc_enabled=False):
-  if xla_enabled:
-    additional_deps = additional_deps + tf_additional_xla_deps_py()
-  if grpc_enabled:
-    additional_deps = additional_deps + tf_additional_grpc_deps_py()
-  py_test(
-      name=name,
-      size=size,
-      srcs=srcs,
-      main=main,
-      args=args,
-      tags=tags,
-      visibility=[clean_dep("//tensorflow:internal")],
-      shard_count=shard_count,
-      data=data,
-      deps=[
+def tf_py_test(
+        name,
+        srcs,
+        size = "medium",
+        data = [],
+        main = None,
+        args = [],
+        tags = [],
+        shard_count = 1,
+        additional_deps = [],
+        flaky = 0,
+        xla_enabled = False,
+        grpc_enabled = False):
+    if xla_enabled:
+        additional_deps = additional_deps + tf_additional_xla_deps_py()
+    if grpc_enabled:
+        additional_deps = additional_deps + tf_additional_grpc_deps_py()
+    py_test(
+        name = name,
+        size = size,
+        srcs = srcs,
+        main = main,
+        args = args,
+        tags = tags,
+        visibility = [clean_dep("//tensorflow:internal")],
+        shard_count = shard_count,
+        data = data,
+        deps = [
             clean_dep("//tensorflow/python:extra_py_tests_deps"),
             clean_dep("//tensorflow/python:gradient_checker"),
-          ] + additional_deps,
-      flaky=flaky,
-      srcs_version="PY2AND3")
+        ] + additional_deps,
+        flaky = flaky,
+        srcs_version = "PY2AND3",
+    )
 
 register_extension_info(
     extension_name = "tf_py_test",
     label_regex_map = {"additional_deps": "deps:{extension_name}"},
 )
 
-def cuda_py_test(name,
-                 srcs,
-                 size="medium",
-                 data=[],
-                 main=None,
-                 args=[],
-                 shard_count=1,
-                 additional_deps=[],
-                 tags=[],
-                 flaky=0,
-                 xla_enabled=False,
-                 grpc_enabled=False):
-  test_tags = tags + tf_cuda_tests_tags()
-  tf_py_test(
-      name=name,
-      size=size,
-      srcs=srcs,
-      data=data,
-      main=main,
-      args=args,
-      tags=test_tags,
-      shard_count=shard_count,
-      additional_deps=additional_deps,
-      flaky=flaky,
-      xla_enabled=xla_enabled,
-      grpc_enabled=grpc_enabled)
+def cuda_py_test(
+        name,
+        srcs,
+        size = "medium",
+        data = [],
+        main = None,
+        args = [],
+        shard_count = 1,
+        additional_deps = [],
+        tags = [],
+        flaky = 0,
+        xla_enabled = False,
+        grpc_enabled = False):
+    test_tags = tags + tf_cuda_tests_tags()
+    tf_py_test(
+        name = name,
+        size = size,
+        srcs = srcs,
+        data = data,
+        main = main,
+        args = args,
+        tags = test_tags,
+        shard_count = shard_count,
+        additional_deps = additional_deps,
+        flaky = flaky,
+        xla_enabled = xla_enabled,
+        grpc_enabled = grpc_enabled,
+    )
 
 register_extension_info(
     extension_name = "cuda_py_test",
     label_regex_map = {"additional_deps": "additional_deps:{extension_name}"},
 )
 
-def sycl_py_test(name,
-                 srcs,
-                 size="medium",
-                 data=[],
-                 main=None,
-                 args=[],
-                 shard_count=1,
-                 additional_deps=[],
-                 tags=[],
-                 flaky=0,
-                 xla_enabled=False,
-                 grpc_enabled=False):
-  test_tags = tags + tf_sycl_tests_tags()
-  tf_py_test(
-      name=name,
-      size=size,
-      srcs=srcs,
-      data=data,
-      main=main,
-      args=args,
-      tags=test_tags,
-      shard_count=shard_count,
-      additional_deps=additional_deps,
-      flaky=flaky,
-      xla_enabled=xla_enabled,
-      grpc_enabled=grpc_enabled)
+def sycl_py_test(
+        name,
+        srcs,
+        size = "medium",
+        data = [],
+        main = None,
+        args = [],
+        shard_count = 1,
+        additional_deps = [],
+        tags = [],
+        flaky = 0,
+        xla_enabled = False,
+        grpc_enabled = False):
+    test_tags = tags + tf_sycl_tests_tags()
+    tf_py_test(
+        name = name,
+        size = size,
+        srcs = srcs,
+        data = data,
+        main = main,
+        args = args,
+        tags = test_tags,
+        shard_count = shard_count,
+        additional_deps = additional_deps,
+        flaky = flaky,
+        xla_enabled = xla_enabled,
+        grpc_enabled = grpc_enabled,
+    )
 
 register_extension_info(
     extension_name = "sycl_py_test",
     label_regex_map = {"additional_deps": "additional_deps:{extension_name}"},
 )
 
-def py_tests(name,
-             srcs,
-             size="medium",
-             additional_deps=[],
-             data=[],
-             tags=[],
-             shard_count=1,
-             prefix="",
-             xla_enabled=False,
-             grpc_enabled=False):
-  for src in srcs:
-    test_name = src.split("/")[-1].split(".")[0]
-    if prefix:
-      test_name = "%s_%s" % (prefix, test_name)
-    tf_py_test(
-        name=test_name,
-        size=size,
-        srcs=[src],
-        main=src,
-        tags=tags,
-        shard_count=shard_count,
-        data=data,
-        additional_deps=additional_deps,
-        xla_enabled=xla_enabled,
-        grpc_enabled=grpc_enabled)
+def py_tests(
+        name,
+        srcs,
+        size = "medium",
+        additional_deps = [],
+        data = [],
+        tags = [],
+        shard_count = 1,
+        prefix = "",
+        xla_enabled = False,
+        grpc_enabled = False):
+    for src in srcs:
+        test_name = src.split("/")[-1].split(".")[0]
+        if prefix:
+            test_name = "%s_%s" % (prefix, test_name)
+        tf_py_test(
+            name = test_name,
+            size = size,
+            srcs = [src],
+            main = src,
+            tags = tags,
+            shard_count = shard_count,
+            data = data,
+            additional_deps = additional_deps,
+            xla_enabled = xla_enabled,
+            grpc_enabled = grpc_enabled,
+        )
 
-def cuda_py_tests(name,
-                  srcs,
-                  size="medium",
-                  additional_deps=[],
-                  data=[],
-                  shard_count=1,
-                  tags=[],
-                  prefix="",
-                  xla_enabled=False,
-                  grpc_enabled=False):
-  test_tags = tags + tf_cuda_tests_tags()
-  py_tests(
-      name=name,
-      size=size,
-      srcs=srcs,
-      additional_deps=additional_deps,
-      data=data,
-      tags=test_tags,
-      shard_count=shard_count,
-      prefix=prefix,
-      xla_enabled=xla_enabled,
-      grpc_enabled=grpc_enabled)
+def cuda_py_tests(
+        name,
+        srcs,
+        size = "medium",
+        additional_deps = [],
+        data = [],
+        shard_count = 1,
+        tags = [],
+        prefix = "",
+        xla_enabled = False,
+        grpc_enabled = False):
+    test_tags = tags + tf_cuda_tests_tags()
+    py_tests(
+        name = name,
+        size = size,
+        srcs = srcs,
+        additional_deps = additional_deps,
+        data = data,
+        tags = test_tags,
+        shard_count = shard_count,
+        prefix = prefix,
+        xla_enabled = xla_enabled,
+        grpc_enabled = grpc_enabled,
+    )
 
 # Creates a genrule named <name> for running tools/proto_text's generator to
 # make the proto_text functions, for the protos passed in <srcs>.
 #
 # Return a struct with fields (hdrs, srcs) containing the names of the
 # generated files.
-def tf_generate_proto_text_sources(name, srcs_relative_dir, srcs, protodeps=[], deps=[], visibility=None):
-  out_hdrs = (
-      [p.replace(".proto", ".pb_text.h")
-       for p in srcs] + [p.replace(".proto", ".pb_text-impl.h") for p in srcs])
-  out_srcs = [p.replace(".proto", ".pb_text.cc") for p in srcs]
-  native.genrule(
-      name=name + "_srcs",
-      srcs=srcs + protodeps + [clean_dep("//tensorflow/tools/proto_text:placeholder.txt")],
-      outs=out_hdrs + out_srcs,
-      visibility=visibility,
-      cmd=
-      "$(location //tensorflow/tools/proto_text:gen_proto_text_functions) "
-      + "$(@D) " + srcs_relative_dir + " $(SRCS)",
-      tools=[
-          clean_dep("//tensorflow/tools/proto_text:gen_proto_text_functions")
-      ],)
+def tf_generate_proto_text_sources(name, srcs_relative_dir, srcs, protodeps = [], deps = [], visibility = None):
+    out_hdrs = (
+        [
+            p.replace(".proto", ".pb_text.h")
+            for p in srcs
+        ] + [p.replace(".proto", ".pb_text-impl.h") for p in srcs]
+    )
+    out_srcs = [p.replace(".proto", ".pb_text.cc") for p in srcs]
+    native.genrule(
+        name = name + "_srcs",
+        srcs = srcs + protodeps + [clean_dep("//tensorflow/tools/proto_text:placeholder.txt")],
+        outs = out_hdrs + out_srcs,
+        visibility = visibility,
+        cmd =
+            "$(location //tensorflow/tools/proto_text:gen_proto_text_functions) " +
+            "$(@D) " + srcs_relative_dir + " $(SRCS)",
+        tools = [
+            clean_dep("//tensorflow/tools/proto_text:gen_proto_text_functions"),
+        ],
+    )
 
-  native.filegroup(
-      name=name + "_hdrs",
-      srcs=out_hdrs,
-      visibility=visibility,
-  )
+    native.filegroup(
+        name = name + "_hdrs",
+        srcs = out_hdrs,
+        visibility = visibility,
+    )
 
-  native.cc_library(
-      name=name,
-      srcs=out_srcs,
-      hdrs=out_hdrs,
-      visibility=visibility,
-      deps = deps,
-  )
+    native.cc_library(
+        name = name,
+        srcs = out_srcs,
+        hdrs = out_hdrs,
+        visibility = visibility,
+        deps = deps,
+    )
 
 def tf_genrule_cmd_append_to_srcs(to_append):
-  return ("cat $(SRCS) > $(@) && " + "echo >> $(@) && " + "echo " + to_append +
-          " >> $(@)")
+    return ("cat $(SRCS) > $(@) && " + "echo >> $(@) && " + "echo " + to_append +
+            " >> $(@)")
 
 def tf_version_info_genrule():
-  native.genrule(
-      name="version_info_gen",
-      srcs=[
-          clean_dep("@local_config_git//:gen/spec.json"),
-          clean_dep("@local_config_git//:gen/head"),
-          clean_dep("@local_config_git//:gen/branch_ref"),
-      ],
-      outs=["util/version_info.cc"],
-      cmd=
-      "$(location //tensorflow/tools/git:gen_git_source.py) --generate $(SRCS) \"$@\" --git_tag_override=$${GIT_TAG_OVERRIDE:-}",
-      local=1,
-      tools=[clean_dep("//tensorflow/tools/git:gen_git_source.py")],)
+    native.genrule(
+        name = "version_info_gen",
+        srcs = [
+            clean_dep("@local_config_git//:gen/spec.json"),
+            clean_dep("@local_config_git//:gen/head"),
+            clean_dep("@local_config_git//:gen/branch_ref"),
+        ],
+        outs = ["util/version_info.cc"],
+        cmd =
+            "$(location //tensorflow/tools/git:gen_git_source.py) --generate $(SRCS) \"$@\" --git_tag_override=$${GIT_TAG_OVERRIDE:-}",
+        local = 1,
+        tools = [clean_dep("//tensorflow/tools/git:gen_git_source.py")],
+    )
 
 def tf_py_build_info_genrule():
-  native.genrule(
-      name="py_build_info_gen",
-      outs=["platform/build_info.py"],
-      cmd=
-     "$(location //tensorflow/tools/build_info:gen_build_info.py) --raw_generate \"$@\" --build_config " + if_cuda("cuda", "cpu"),
-      local=1,
-      tools=[clean_dep("//tensorflow/tools/build_info:gen_build_info.py")],)
+    native.genrule(
+        name = "py_build_info_gen",
+        outs = ["platform/build_info.py"],
+        cmd =
+            "$(location //tensorflow/tools/build_info:gen_build_info.py) --raw_generate \"$@\" --build_config " + if_cuda("cuda", "cpu"),
+        local = 1,
+        tools = [clean_dep("//tensorflow/tools/build_info:gen_build_info.py")],
+    )
 
-def cc_library_with_android_deps(deps,
-                                 android_deps=[],
-                                 common_deps=[],
-                                 copts=tf_copts(),
-                                 **kwargs):
-  deps = if_not_android(deps) + if_android(android_deps) + common_deps
-  native.cc_library(deps=deps, copts=copts, **kwargs)
+def cc_library_with_android_deps(
+        deps,
+        android_deps = [],
+        common_deps = [],
+        copts = tf_copts(),
+        **kwargs):
+    deps = if_not_android(deps) + if_android(android_deps) + common_deps
+    native.cc_library(deps = deps, copts = copts, **kwargs)
 
 register_extension_info(
     extension_name = "cc_library_with_android_deps",
diff --git a/tensorflow/tools/api/golden/BUILD b/tensorflow/tools/api/golden/BUILD
index ebdf42d..4389a99 100644
--- a/tensorflow/tools/api/golden/BUILD
+++ b/tensorflow/tools/api/golden/BUILD
@@ -7,6 +7,11 @@
 licenses(["notice"])  # Apache 2.0
 
 filegroup(
-    name = "api_golden",
-    srcs = glob(["*.pbtxt"]),
+    name = "api_golden_v1",
+    srcs = glob(["v1/*.pbtxt"]),
+)
+
+filegroup(
+    name = "api_golden_v2",
+    srcs = glob(["v2/*.pbtxt"]),
 )
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.applications.densenet.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.applications.densenet.pbtxt
deleted file mode 100644
index 42cb914..0000000
--- a/tensorflow/tools/api/golden/tensorflow.keras.applications.densenet.pbtxt
+++ /dev/null
@@ -1,23 +0,0 @@
-path: "tensorflow.keras.applications.densenet"
-tf_module {
-  member_method {
-    name: "DenseNet121"
-    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
-  }
-  member_method {
-    name: "DenseNet169"
-    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
-  }
-  member_method {
-    name: "DenseNet201"
-    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
-  }
-  member_method {
-    name: "decode_predictions"
-    argspec: "args=[\'preds\', \'top\'], varargs=None, keywords=None, defaults=[\'5\'], "
-  }
-  member_method {
-    name: "preprocess_input"
-    argspec: "args=[\'x\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.applications.inception_resnet_v2.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.applications.inception_resnet_v2.pbtxt
deleted file mode 100644
index 211080c..0000000
--- a/tensorflow/tools/api/golden/tensorflow.keras.applications.inception_resnet_v2.pbtxt
+++ /dev/null
@@ -1,15 +0,0 @@
-path: "tensorflow.keras.applications.inception_resnet_v2"
-tf_module {
-  member_method {
-    name: "InceptionResNetV2"
-    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
-  }
-  member_method {
-    name: "decode_predictions"
-    argspec: "args=[\'preds\', \'top\'], varargs=None, keywords=None, defaults=[\'5\'], "
-  }
-  member_method {
-    name: "preprocess_input"
-    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.applications.inception_v3.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.applications.inception_v3.pbtxt
deleted file mode 100644
index b67cee8..0000000
--- a/tensorflow/tools/api/golden/tensorflow.keras.applications.inception_v3.pbtxt
+++ /dev/null
@@ -1,15 +0,0 @@
-path: "tensorflow.keras.applications.inception_v3"
-tf_module {
-  member_method {
-    name: "InceptionV3"
-    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
-  }
-  member_method {
-    name: "decode_predictions"
-    argspec: "args=[\'preds\', \'top\'], varargs=None, keywords=None, defaults=[\'5\'], "
-  }
-  member_method {
-    name: "preprocess_input"
-    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.applications.mobilenet.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.applications.mobilenet.pbtxt
deleted file mode 100644
index ef774e1..0000000
--- a/tensorflow/tools/api/golden/tensorflow.keras.applications.mobilenet.pbtxt
+++ /dev/null
@@ -1,15 +0,0 @@
-path: "tensorflow.keras.applications.mobilenet"
-tf_module {
-  member_method {
-    name: "MobileNet"
-    argspec: "args=[\'input_shape\', \'alpha\', \'depth_multiplier\', \'dropout\', \'include_top\', \'weights\', \'input_tensor\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'None\', \'1.0\', \'1\', \'0.001\', \'True\', \'imagenet\', \'None\', \'None\', \'1000\'], "
-  }
-  member_method {
-    name: "decode_predictions"
-    argspec: "args=[\'preds\', \'top\'], varargs=None, keywords=None, defaults=[\'5\'], "
-  }
-  member_method {
-    name: "preprocess_input"
-    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.applications.nasnet.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.applications.nasnet.pbtxt
deleted file mode 100644
index cd75b87..0000000
--- a/tensorflow/tools/api/golden/tensorflow.keras.applications.nasnet.pbtxt
+++ /dev/null
@@ -1,19 +0,0 @@
-path: "tensorflow.keras.applications.nasnet"
-tf_module {
-  member_method {
-    name: "NASNetLarge"
-    argspec: "args=[\'input_shape\', \'include_top\', \'weights\', \'input_tensor\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'imagenet\', \'None\', \'None\', \'1000\'], "
-  }
-  member_method {
-    name: "NASNetMobile"
-    argspec: "args=[\'input_shape\', \'include_top\', \'weights\', \'input_tensor\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'imagenet\', \'None\', \'None\', \'1000\'], "
-  }
-  member_method {
-    name: "decode_predictions"
-    argspec: "args=[\'preds\', \'top\'], varargs=None, keywords=None, defaults=[\'5\'], "
-  }
-  member_method {
-    name: "preprocess_input"
-    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.applications.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.applications.pbtxt
deleted file mode 100644
index 9fc086e..0000000
--- a/tensorflow/tools/api/golden/tensorflow.keras.applications.pbtxt
+++ /dev/null
@@ -1,87 +0,0 @@
-path: "tensorflow.keras.applications"
-tf_module {
-  member {
-    name: "densenet"
-    mtype: "<type \'module\'>"
-  }
-  member {
-    name: "inception_resnet_v2"
-    mtype: "<type \'module\'>"
-  }
-  member {
-    name: "inception_v3"
-    mtype: "<type \'module\'>"
-  }
-  member {
-    name: "mobilenet"
-    mtype: "<type \'module\'>"
-  }
-  member {
-    name: "nasnet"
-    mtype: "<type \'module\'>"
-  }
-  member {
-    name: "resnet50"
-    mtype: "<type \'module\'>"
-  }
-  member {
-    name: "vgg16"
-    mtype: "<type \'module\'>"
-  }
-  member {
-    name: "vgg19"
-    mtype: "<type \'module\'>"
-  }
-  member {
-    name: "xception"
-    mtype: "<type \'module\'>"
-  }
-  member_method {
-    name: "DenseNet121"
-    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
-  }
-  member_method {
-    name: "DenseNet169"
-    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
-  }
-  member_method {
-    name: "DenseNet201"
-    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
-  }
-  member_method {
-    name: "InceptionResNetV2"
-    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
-  }
-  member_method {
-    name: "InceptionV3"
-    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
-  }
-  member_method {
-    name: "MobileNet"
-    argspec: "args=[\'input_shape\', \'alpha\', \'depth_multiplier\', \'dropout\', \'include_top\', \'weights\', \'input_tensor\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'None\', \'1.0\', \'1\', \'0.001\', \'True\', \'imagenet\', \'None\', \'None\', \'1000\'], "
-  }
-  member_method {
-    name: "NASNetLarge"
-    argspec: "args=[\'input_shape\', \'include_top\', \'weights\', \'input_tensor\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'imagenet\', \'None\', \'None\', \'1000\'], "
-  }
-  member_method {
-    name: "NASNetMobile"
-    argspec: "args=[\'input_shape\', \'include_top\', \'weights\', \'input_tensor\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'imagenet\', \'None\', \'None\', \'1000\'], "
-  }
-  member_method {
-    name: "ResNet50"
-    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
-  }
-  member_method {
-    name: "VGG16"
-    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
-  }
-  member_method {
-    name: "VGG19"
-    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
-  }
-  member_method {
-    name: "Xception"
-    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.applications.resnet50.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.applications.resnet50.pbtxt
deleted file mode 100644
index 7385af0..0000000
--- a/tensorflow/tools/api/golden/tensorflow.keras.applications.resnet50.pbtxt
+++ /dev/null
@@ -1,15 +0,0 @@
-path: "tensorflow.keras.applications.resnet50"
-tf_module {
-  member_method {
-    name: "ResNet50"
-    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
-  }
-  member_method {
-    name: "decode_predictions"
-    argspec: "args=[\'preds\', \'top\'], varargs=None, keywords=None, defaults=[\'5\'], "
-  }
-  member_method {
-    name: "preprocess_input"
-    argspec: "args=[\'x\', \'data_format\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'caffe\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.applications.vgg16.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.applications.vgg16.pbtxt
deleted file mode 100644
index ba66fba..0000000
--- a/tensorflow/tools/api/golden/tensorflow.keras.applications.vgg16.pbtxt
+++ /dev/null
@@ -1,15 +0,0 @@
-path: "tensorflow.keras.applications.vgg16"
-tf_module {
-  member_method {
-    name: "VGG16"
-    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
-  }
-  member_method {
-    name: "decode_predictions"
-    argspec: "args=[\'preds\', \'top\'], varargs=None, keywords=None, defaults=[\'5\'], "
-  }
-  member_method {
-    name: "preprocess_input"
-    argspec: "args=[\'x\', \'data_format\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'caffe\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.applications.vgg19.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.applications.vgg19.pbtxt
deleted file mode 100644
index e55a134..0000000
--- a/tensorflow/tools/api/golden/tensorflow.keras.applications.vgg19.pbtxt
+++ /dev/null
@@ -1,15 +0,0 @@
-path: "tensorflow.keras.applications.vgg19"
-tf_module {
-  member_method {
-    name: "VGG19"
-    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
-  }
-  member_method {
-    name: "decode_predictions"
-    argspec: "args=[\'preds\', \'top\'], varargs=None, keywords=None, defaults=[\'5\'], "
-  }
-  member_method {
-    name: "preprocess_input"
-    argspec: "args=[\'x\', \'data_format\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'caffe\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.applications.xception.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.applications.xception.pbtxt
deleted file mode 100644
index 59dd210..0000000
--- a/tensorflow/tools/api/golden/tensorflow.keras.applications.xception.pbtxt
+++ /dev/null
@@ -1,15 +0,0 @@
-path: "tensorflow.keras.applications.xception"
-tf_module {
-  member_method {
-    name: "Xception"
-    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
-  }
-  member_method {
-    name: "decode_predictions"
-    argspec: "args=[\'preds\', \'top\'], varargs=None, keywords=None, defaults=[\'5\'], "
-  }
-  member_method {
-    name: "preprocess_input"
-    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-directory-iterator.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-directory-iterator.pbtxt
deleted file mode 100644
index dddace8..0000000
--- a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-directory-iterator.pbtxt
+++ /dev/null
@@ -1,23 +0,0 @@
-path: "tensorflow.keras.preprocessing.image.DirectoryIterator"
-tf_class {
-  is_instance: "<class \'tensorflow.python.keras.preprocessing.image.DirectoryIterator\'>"
-  is_instance: "<class \'tensorflow.python.keras.preprocessing.image.Iterator\'>"
-  is_instance: "<class \'tensorflow.python.keras.utils.data_utils.Sequence\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'directory\', \'image_data_generator\', \'target_size\', \'color_mode\', \'classes\', \'class_mode\', \'batch_size\', \'shuffle\', \'seed\', \'data_format\', \'save_to_dir\', \'save_prefix\', \'save_format\', \'follow_links\', \'subset\', \'interpolation\'], varargs=None, keywords=None, defaults=[\'(256, 256)\', \'rgb\', \'None\', \'categorical\', \'32\', \'True\', \'None\', \'None\', \'None\', \'\', \'png\', \'False\', \'None\', \'nearest\'], "
-  }
-  member_method {
-    name: "next"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "on_epoch_end"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "reset"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-image-data-generator.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-image-data-generator.pbtxt
deleted file mode 100644
index c1e2e94..0000000
--- a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-image-data-generator.pbtxt
+++ /dev/null
@@ -1,29 +0,0 @@
-path: "tensorflow.keras.preprocessing.image.ImageDataGenerator"
-tf_class {
-  is_instance: "<class \'tensorflow.python.keras.preprocessing.image.ImageDataGenerator\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'featurewise_center\', \'samplewise_center\', \'featurewise_std_normalization\', \'samplewise_std_normalization\', \'zca_whitening\', \'zca_epsilon\', \'rotation_range\', \'width_shift_range\', \'height_shift_range\', \'brightness_range\', \'shear_range\', \'zoom_range\', \'channel_shift_range\', \'fill_mode\', \'cval\', \'horizontal_flip\', \'vertical_flip\', \'rescale\', \'preprocessing_function\', \'data_format\', \'validation_split\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'False\', \'False\', \'1e-06\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'0.0\', \'0.0\', \'0.0\', \'nearest\', \'0.0\', \'False\', \'False\', \'None\', \'None\', \'None\', \'0.0\'], "
-  }
-  member_method {
-    name: "fit"
-    argspec: "args=[\'self\', \'x\', \'augment\', \'rounds\', \'seed\'], varargs=None, keywords=None, defaults=[\'False\', \'1\', \'None\'], "
-  }
-  member_method {
-    name: "flow"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'shuffle\', \'seed\', \'save_to_dir\', \'save_prefix\', \'save_format\', \'subset\'], varargs=None, keywords=None, defaults=[\'None\', \'32\', \'True\', \'None\', \'None\', \'\', \'png\', \'None\'], "
-  }
-  member_method {
-    name: "flow_from_directory"
-    argspec: "args=[\'self\', \'directory\', \'target_size\', \'color_mode\', \'classes\', \'class_mode\', \'batch_size\', \'shuffle\', \'seed\', \'save_to_dir\', \'save_prefix\', \'save_format\', \'follow_links\', \'subset\', \'interpolation\'], varargs=None, keywords=None, defaults=[\'(256, 256)\', \'rgb\', \'None\', \'categorical\', \'32\', \'True\', \'None\', \'None\', \'\', \'png\', \'False\', \'None\', \'nearest\'], "
-  }
-  member_method {
-    name: "random_transform"
-    argspec: "args=[\'self\', \'x\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "standardize"
-    argspec: "args=[\'self\', \'x\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-iterator.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-iterator.pbtxt
deleted file mode 100644
index 825d9f1..0000000
--- a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-iterator.pbtxt
+++ /dev/null
@@ -1,18 +0,0 @@
-path: "tensorflow.keras.preprocessing.image.Iterator"
-tf_class {
-  is_instance: "<class \'tensorflow.python.keras.preprocessing.image.Iterator\'>"
-  is_instance: "<class \'tensorflow.python.keras.utils.data_utils.Sequence\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'n\', \'batch_size\', \'shuffle\', \'seed\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "on_epoch_end"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "reset"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-numpy-array-iterator.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-numpy-array-iterator.pbtxt
deleted file mode 100644
index 75924a2..0000000
--- a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-numpy-array-iterator.pbtxt
+++ /dev/null
@@ -1,23 +0,0 @@
-path: "tensorflow.keras.preprocessing.image.NumpyArrayIterator"
-tf_class {
-  is_instance: "<class \'tensorflow.python.keras.preprocessing.image.NumpyArrayIterator\'>"
-  is_instance: "<class \'tensorflow.python.keras.preprocessing.image.Iterator\'>"
-  is_instance: "<class \'tensorflow.python.keras.utils.data_utils.Sequence\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'x\', \'y\', \'image_data_generator\', \'batch_size\', \'shuffle\', \'seed\', \'data_format\', \'save_to_dir\', \'save_prefix\', \'save_format\', \'subset\'], varargs=None, keywords=None, defaults=[\'32\', \'False\', \'None\', \'None\', \'None\', \'\', \'png\', \'None\'], "
-  }
-  member_method {
-    name: "next"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "on_epoch_end"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "reset"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.pbtxt
deleted file mode 100644
index 6b850dd..0000000
--- a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.pbtxt
+++ /dev/null
@@ -1,63 +0,0 @@
-path: "tensorflow.keras.preprocessing.image"
-tf_module {
-  member {
-    name: "DirectoryIterator"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "ImageDataGenerator"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "Iterator"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "NumpyArrayIterator"
-    mtype: "<type \'type\'>"
-  }
-  member_method {
-    name: "apply_transform"
-    argspec: "args=[\'x\', \'transform_matrix\', \'channel_axis\', \'fill_mode\', \'cval\'], varargs=None, keywords=None, defaults=[\'0\', \'nearest\', \'0.0\'], "
-  }
-  member_method {
-    name: "array_to_img"
-    argspec: "args=[\'x\', \'data_format\', \'scale\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], "
-  }
-  member_method {
-    name: "flip_axis"
-    argspec: "args=[\'x\', \'axis\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "img_to_array"
-    argspec: "args=[\'img\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "load_img"
-    argspec: "args=[\'path\', \'grayscale\', \'target_size\', \'interpolation\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'nearest\'], "
-  }
-  member_method {
-    name: "random_brightness"
-    argspec: "args=[\'x\', \'brightness_range\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "random_channel_shift"
-    argspec: "args=[\'x\', \'intensity\', \'channel_axis\'], varargs=None, keywords=None, defaults=[\'0\'], "
-  }
-  member_method {
-    name: "random_rotation"
-    argspec: "args=[\'x\', \'rg\', \'row_axis\', \'col_axis\', \'channel_axis\', \'fill_mode\', \'cval\'], varargs=None, keywords=None, defaults=[\'1\', \'2\', \'0\', \'nearest\', \'0.0\'], "
-  }
-  member_method {
-    name: "random_shear"
-    argspec: "args=[\'x\', \'intensity\', \'row_axis\', \'col_axis\', \'channel_axis\', \'fill_mode\', \'cval\'], varargs=None, keywords=None, defaults=[\'1\', \'2\', \'0\', \'nearest\', \'0.0\'], "
-  }
-  member_method {
-    name: "random_shift"
-    argspec: "args=[\'x\', \'wrg\', \'hrg\', \'row_axis\', \'col_axis\', \'channel_axis\', \'fill_mode\', \'cval\'], varargs=None, keywords=None, defaults=[\'1\', \'2\', \'0\', \'nearest\', \'0.0\'], "
-  }
-  member_method {
-    name: "random_zoom"
-    argspec: "args=[\'x\', \'zoom_range\', \'row_axis\', \'col_axis\', \'channel_axis\', \'fill_mode\', \'cval\'], varargs=None, keywords=None, defaults=[\'1\', \'2\', \'0\', \'nearest\', \'0.0\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.pbtxt
deleted file mode 100644
index 5a78581..0000000
--- a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.pbtxt
+++ /dev/null
@@ -1,15 +0,0 @@
-path: "tensorflow.keras.preprocessing"
-tf_module {
-  member {
-    name: "image"
-    mtype: "<type \'module\'>"
-  }
-  member {
-    name: "sequence"
-    mtype: "<type \'module\'>"
-  }
-  member {
-    name: "text"
-    mtype: "<type \'module\'>"
-  }
-}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.sequence.-timeseries-generator.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.sequence.-timeseries-generator.pbtxt
deleted file mode 100644
index 326b1fa..0000000
--- a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.sequence.-timeseries-generator.pbtxt
+++ /dev/null
@@ -1,14 +0,0 @@
-path: "tensorflow.keras.preprocessing.sequence.TimeseriesGenerator"
-tf_class {
-  is_instance: "<class \'tensorflow.python.keras.preprocessing.sequence.TimeseriesGenerator\'>"
-  is_instance: "<class \'tensorflow.python.keras.utils.data_utils.Sequence\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'data\', \'targets\', \'length\', \'sampling_rate\', \'stride\', \'start_index\', \'end_index\', \'shuffle\', \'reverse\', \'batch_size\'], varargs=None, keywords=None, defaults=[\'1\', \'1\', \'0\', \'None\', \'False\', \'False\', \'128\'], "
-  }
-  member_method {
-    name: "on_epoch_end"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.sequence.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.sequence.pbtxt
deleted file mode 100644
index cf59f8a..0000000
--- a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.sequence.pbtxt
+++ /dev/null
@@ -1,19 +0,0 @@
-path: "tensorflow.keras.preprocessing.sequence"
-tf_module {
-  member {
-    name: "TimeseriesGenerator"
-    mtype: "<type \'type\'>"
-  }
-  member_method {
-    name: "make_sampling_table"
-    argspec: "args=[\'size\', \'sampling_factor\'], varargs=None, keywords=None, defaults=[\'1e-05\'], "
-  }
-  member_method {
-    name: "pad_sequences"
-    argspec: "args=[\'sequences\', \'maxlen\', \'dtype\', \'padding\', \'truncating\', \'value\'], varargs=None, keywords=None, defaults=[\'None\', \'int32\', \'pre\', \'pre\', \'0.0\'], "
-  }
-  member_method {
-    name: "skipgrams"
-    argspec: "args=[\'sequence\', \'vocabulary_size\', \'window_size\', \'negative_samples\', \'shuffle\', \'categorical\', \'sampling_table\', \'seed\'], varargs=None, keywords=None, defaults=[\'4\', \'1.0\', \'True\', \'False\', \'None\', \'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.text.-tokenizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.text.-tokenizer.pbtxt
deleted file mode 100644
index b42b12b..0000000
--- a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.text.-tokenizer.pbtxt
+++ /dev/null
@@ -1,33 +0,0 @@
-path: "tensorflow.keras.preprocessing.text.Tokenizer"
-tf_class {
-  is_instance: "<class \'tensorflow.python.keras.preprocessing.text.Tokenizer\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'num_words\', \'filters\', \'lower\', \'split\', \'char_level\', \'oov_token\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'!\"#$%&()*+,-./:;<=>?@[\\\\]^_`{|}~\\t\\n\', \'True\', \' \', \'False\', \'None\'], "
-  }
-  member_method {
-    name: "fit_on_sequences"
-    argspec: "args=[\'self\', \'sequences\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "fit_on_texts"
-    argspec: "args=[\'self\', \'texts\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "sequences_to_matrix"
-    argspec: "args=[\'self\', \'sequences\', \'mode\'], varargs=None, keywords=None, defaults=[\'binary\'], "
-  }
-  member_method {
-    name: "texts_to_matrix"
-    argspec: "args=[\'self\', \'texts\', \'mode\'], varargs=None, keywords=None, defaults=[\'binary\'], "
-  }
-  member_method {
-    name: "texts_to_sequences"
-    argspec: "args=[\'self\', \'texts\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "texts_to_sequences_generator"
-    argspec: "args=[\'self\', \'texts\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.text.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.text.pbtxt
deleted file mode 100644
index 50b54fc..0000000
--- a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.text.pbtxt
+++ /dev/null
@@ -1,19 +0,0 @@
-path: "tensorflow.keras.preprocessing.text"
-tf_module {
-  member {
-    name: "Tokenizer"
-    mtype: "<type \'type\'>"
-  }
-  member_method {
-    name: "hashing_trick"
-    argspec: "args=[\'text\', \'n\', \'hash_function\', \'filters\', \'lower\', \'split\'], varargs=None, keywords=None, defaults=[\'None\', \'!\"#$%&()*+,-./:;<=>?@[\\\\]^_`{|}~\\t\\n\', \'True\', \' \'], "
-  }
-  member_method {
-    name: "one_hot"
-    argspec: "args=[\'text\', \'n\', \'filters\', \'lower\', \'split\'], varargs=None, keywords=None, defaults=[\'!\"#$%&()*+,-./:;<=>?@[\\\\]^_`{|}~\\t\\n\', \'True\', \' \'], "
-  }
-  member_method {
-    name: "text_to_word_sequence"
-    argspec: "args=[\'text\', \'filters\', \'lower\', \'split\'], varargs=None, keywords=None, defaults=[\'!\"#$%&()*+,-./:;<=>?@[\\\\]^_`{|}~\\t\\n\', \'True\', \' \'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/tensorflow.-aggregation-method.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-aggregation-method.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-aggregation-method.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-aggregation-method.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-attr-value.-list-value.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-attr-value.-list-value.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-attr-value.-list-value.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-attr-value.-list-value.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-attr-value.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-attr-value.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-attr-value.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-attr-value.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-conditional-accumulator-base.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-conditional-accumulator-base.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-conditional-accumulator-base.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-conditional-accumulator-base.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-conditional-accumulator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-conditional-accumulator.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-conditional-accumulator.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-conditional-accumulator.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-config-proto.-device-count-entry.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.-device-count-entry.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-config-proto.-device-count-entry.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-config-proto.-device-count-entry.pbtxt
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.-experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.-experimental.pbtxt
new file mode 100644
index 0000000..eb41dee
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.-experimental.pbtxt
@@ -0,0 +1,24 @@
+path: "tensorflow.ConfigProto.Experimental"
+tf_proto {
+  descriptor {
+    name: "Experimental"
+    field {
+      name: "collective_group_leader"
+      number: 1
+      label: LABEL_OPTIONAL
+      type: TYPE_STRING
+    }
+    field {
+      name: "client_handles_error_formatting"
+      number: 2
+      label: LABEL_OPTIONAL
+      type: TYPE_BOOL
+    }
+    field {
+      name: "executor_type"
+      number: 3
+      label: LABEL_OPTIONAL
+      type: TYPE_STRING
+    }
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.pbtxt
new file mode 100644
index 0000000..e565b90
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.pbtxt
@@ -0,0 +1,148 @@
+path: "tensorflow.ConfigProto"
+tf_proto {
+  descriptor {
+    name: "ConfigProto"
+    field {
+      name: "device_count"
+      number: 1
+      label: LABEL_REPEATED
+      type: TYPE_MESSAGE
+      type_name: ".tensorflow.ConfigProto.DeviceCountEntry"
+    }
+    field {
+      name: "intra_op_parallelism_threads"
+      number: 2
+      label: LABEL_OPTIONAL
+      type: TYPE_INT32
+    }
+    field {
+      name: "inter_op_parallelism_threads"
+      number: 5
+      label: LABEL_OPTIONAL
+      type: TYPE_INT32
+    }
+    field {
+      name: "use_per_session_threads"
+      number: 9
+      label: LABEL_OPTIONAL
+      type: TYPE_BOOL
+    }
+    field {
+      name: "session_inter_op_thread_pool"
+      number: 12
+      label: LABEL_REPEATED
+      type: TYPE_MESSAGE
+      type_name: ".tensorflow.ThreadPoolOptionProto"
+    }
+    field {
+      name: "placement_period"
+      number: 3
+      label: LABEL_OPTIONAL
+      type: TYPE_INT32
+    }
+    field {
+      name: "device_filters"
+      number: 4
+      label: LABEL_REPEATED
+      type: TYPE_STRING
+    }
+    field {
+      name: "gpu_options"
+      number: 6
+      label: LABEL_OPTIONAL
+      type: TYPE_MESSAGE
+      type_name: ".tensorflow.GPUOptions"
+    }
+    field {
+      name: "allow_soft_placement"
+      number: 7
+      label: LABEL_OPTIONAL
+      type: TYPE_BOOL
+    }
+    field {
+      name: "log_device_placement"
+      number: 8
+      label: LABEL_OPTIONAL
+      type: TYPE_BOOL
+    }
+    field {
+      name: "graph_options"
+      number: 10
+      label: LABEL_OPTIONAL
+      type: TYPE_MESSAGE
+      type_name: ".tensorflow.GraphOptions"
+    }
+    field {
+      name: "operation_timeout_in_ms"
+      number: 11
+      label: LABEL_OPTIONAL
+      type: TYPE_INT64
+    }
+    field {
+      name: "rpc_options"
+      number: 13
+      label: LABEL_OPTIONAL
+      type: TYPE_MESSAGE
+      type_name: ".tensorflow.RPCOptions"
+    }
+    field {
+      name: "cluster_def"
+      number: 14
+      label: LABEL_OPTIONAL
+      type: TYPE_MESSAGE
+      type_name: ".tensorflow.ClusterDef"
+    }
+    field {
+      name: "isolate_session_state"
+      number: 15
+      label: LABEL_OPTIONAL
+      type: TYPE_BOOL
+    }
+    field {
+      name: "experimental"
+      number: 16
+      label: LABEL_OPTIONAL
+      type: TYPE_MESSAGE
+      type_name: ".tensorflow.ConfigProto.Experimental"
+    }
+    nested_type {
+      name: "DeviceCountEntry"
+      field {
+        name: "key"
+        number: 1
+        label: LABEL_OPTIONAL
+        type: TYPE_STRING
+      }
+      field {
+        name: "value"
+        number: 2
+        label: LABEL_OPTIONAL
+        type: TYPE_INT32
+      }
+      options {
+        map_entry: true
+      }
+    }
+    nested_type {
+      name: "Experimental"
+      field {
+        name: "collective_group_leader"
+        number: 1
+        label: LABEL_OPTIONAL
+        type: TYPE_STRING
+      }
+      field {
+        name: "client_handles_error_formatting"
+        number: 2
+        label: LABEL_OPTIONAL
+        type: TYPE_BOOL
+      }
+      field {
+        name: "executor_type"
+        number: 3
+        label: LABEL_OPTIONAL
+        type: TYPE_STRING
+      }
+    }
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-d-type.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-d-type.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-d-type.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-d-type.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-device-spec.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-device-spec.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-device-spec.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-device-spec.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-dimension.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-dimension.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-dimension.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-dimension.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-event.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-event.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-event.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-event.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-f-i-f-o-queue.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-f-i-f-o-queue.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-f-i-f-o-queue.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-f-i-f-o-queue.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-fixed-len-feature.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-fixed-len-feature.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-fixed-len-feature.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-fixed-len-feature.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-fixed-len-sequence-feature.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-fixed-len-sequence-feature.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-fixed-len-sequence-feature.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-fixed-len-sequence-feature.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-fixed-length-record-reader.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-fixed-length-record-reader.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-fixed-length-record-reader.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-fixed-length-record-reader.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-g-p-u-options.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-g-p-u-options.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-g-p-u-options.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-g-p-u-options.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-gradient-tape.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-gradient-tape.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-gradient-tape.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-gradient-tape.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-graph-def.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-graph-def.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-graph-def.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-graph-def.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-graph-keys.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-graph-keys.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-graph-keys.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-graph-keys.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-graph-options.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-graph-options.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-graph-options.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-graph-options.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-graph.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-graph.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-graph.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-graph.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-histogram-proto.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-histogram-proto.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-histogram-proto.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-histogram-proto.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-identity-reader.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-identity-reader.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-identity-reader.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-identity-reader.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-indexed-slices.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-indexed-slices.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-indexed-slices.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-indexed-slices.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-interactive-session.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-interactive-session.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-interactive-session.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-interactive-session.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-l-m-d-b-reader.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-l-m-d-b-reader.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-l-m-d-b-reader.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-l-m-d-b-reader.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-log-message.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-log-message.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-log-message.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-log-message.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-meta-graph-def.-collection-def-entry.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-meta-graph-def.-collection-def-entry.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-meta-graph-def.-collection-def-entry.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-meta-graph-def.-collection-def-entry.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-meta-graph-def.-meta-info-def.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-meta-graph-def.-meta-info-def.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-meta-graph-def.-meta-info-def.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-meta-graph-def.-meta-info-def.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-meta-graph-def.-signature-def-entry.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-meta-graph-def.-signature-def-entry.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-meta-graph-def.-signature-def-entry.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-meta-graph-def.-signature-def-entry.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-meta-graph-def.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-meta-graph-def.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-meta-graph-def.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-meta-graph-def.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-name-attr-list.-attr-entry.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-name-attr-list.-attr-entry.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-name-attr-list.-attr-entry.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-name-attr-list.-attr-entry.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-name-attr-list.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-name-attr-list.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-name-attr-list.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-name-attr-list.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-node-def.-attr-entry.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-node-def.-attr-entry.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-node-def.-attr-entry.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-node-def.-attr-entry.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-node-def.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-node-def.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-node-def.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-node-def.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-op-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-op-error.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-op-error.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-op-error.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-operation.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-operation.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-operation.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-operation.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-optimizer-options.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-optimizer-options.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-optimizer-options.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-optimizer-options.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-padding-f-i-f-o-queue.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-padding-f-i-f-o-queue.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-padding-f-i-f-o-queue.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-padding-f-i-f-o-queue.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-priority-queue.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-priority-queue.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-priority-queue.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-priority-queue.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-queue-base.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-queue-base.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-queue-base.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-queue-base.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-random-shuffle-queue.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-random-shuffle-queue.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-random-shuffle-queue.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-random-shuffle-queue.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-reader-base.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-reader-base.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-reader-base.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-reader-base.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-register-gradient.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-register-gradient.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-register-gradient.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-register-gradient.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-run-metadata.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-run-metadata.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-run-metadata.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-run-metadata.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-run-options.-experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-run-options.-experimental.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-run-options.-experimental.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-run-options.-experimental.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-run-options.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-run-options.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-run-options.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-run-options.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-session-log.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-session-log.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-session-log.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-session-log.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-session.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-session.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-session.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-session.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-sparse-conditional-accumulator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-sparse-conditional-accumulator.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-sparse-conditional-accumulator.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-sparse-conditional-accumulator.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-sparse-feature.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-sparse-feature.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-sparse-feature.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-sparse-feature.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-sparse-tensor-value.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-sparse-tensor-value.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-sparse-tensor-value.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-sparse-tensor-value.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-sparse-tensor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-sparse-tensor.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-sparse-tensor.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-sparse-tensor.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-summary-metadata.-plugin-data.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-summary-metadata.-plugin-data.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-summary-metadata.-plugin-data.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-summary-metadata.-plugin-data.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-summary-metadata.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-summary-metadata.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-summary-metadata.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-summary-metadata.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-summary.-audio.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-summary.-audio.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-summary.-audio.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-summary.-audio.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-summary.-image.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-summary.-image.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-summary.-image.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-summary.-image.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-summary.-value.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-summary.-value.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-summary.-value.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-summary.-value.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-summary.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-summary.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-summary.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-summary.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-t-f-record-reader.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-t-f-record-reader.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-t-f-record-reader.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-t-f-record-reader.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-tensor-array.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-tensor-array.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-tensor-array.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-tensor-array.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-tensor-info.-coo-sparse.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-tensor-info.-coo-sparse.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-tensor-info.-coo-sparse.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-tensor-info.-coo-sparse.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-tensor-info.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-tensor-info.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-tensor-info.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-tensor-info.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-tensor-shape.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-tensor-shape.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-tensor-shape.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-tensor-shape.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-tensor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-tensor.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-tensor.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-tensor.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-text-line-reader.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-text-line-reader.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-text-line-reader.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-text-line-reader.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-var-len-feature.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-var-len-feature.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-var-len-feature.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-var-len-feature.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-variable-aggregation.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-variable-aggregation.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-variable-aggregation.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-variable-aggregation.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-variable-scope.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-variable-scope.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-variable-scope.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-variable-scope.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-variable-synchronization.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-variable-synchronization.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-variable-synchronization.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-variable-synchronization.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-variable.-save-slice-info.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-variable.-save-slice-info.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-variable.-save-slice-info.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-variable.-save-slice-info.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-variable.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-variable.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-variable.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-variable.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-whole-file-reader.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-whole-file-reader.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.-whole-file-reader.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-whole-file-reader.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.app.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.app.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.app.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.app.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.bitwise.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.bitwise.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.bitwise.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.bitwise.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.compat.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.compat.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.compat.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.compat.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.constant_initializer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.constant_initializer.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.constant_initializer.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.constant_initializer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.data.-dataset.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.__metaclass__.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.data.-dataset.__metaclass__.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.__metaclass__.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.data.-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.data.-dataset.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.data.-fixed-length-record-dataset.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.__metaclass__.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.data.-fixed-length-record-dataset.__metaclass__.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.__metaclass__.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.data.-fixed-length-record-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.data.-fixed-length-record-dataset.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-iterator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-iterator.pbtxt
new file mode 100644
index 0000000..4f0147a
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-iterator.pbtxt
@@ -0,0 +1,46 @@
+path: "tensorflow.data.Iterator"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.ops.iterator_ops.Iterator\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "initializer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_classes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shapes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_types"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'iterator_resource\', \'initializer\', \'output_types\', \'output_shapes\', \'output_classes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_string_handle"
+    argspec: "args=[\'string_handle\', \'output_types\', \'output_shapes\', \'output_classes\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "from_structure"
+    argspec: "args=[\'output_types\', \'output_shapes\', \'shared_name\', \'output_classes\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "get_next"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "make_initializer"
+    argspec: "args=[\'self\', \'dataset\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "string_handle"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.data.-t-f-record-dataset.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-t-f-record-dataset.__metaclass__.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.data.-t-f-record-dataset.__metaclass__.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.data.-t-f-record-dataset.__metaclass__.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.data.-t-f-record-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-t-f-record-dataset.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.data.-t-f-record-dataset.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.data.-t-f-record-dataset.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.data.-text-line-dataset.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.__metaclass__.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.data.-text-line-dataset.__metaclass__.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.__metaclass__.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.data.-text-line-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.data.-text-line-dataset.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.data.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.data.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.data.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.debugging.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.debugging.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.debugging.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.debugging.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-bernoulli.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distributions.-bernoulli.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.distributions.-bernoulli.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.distributions.-bernoulli.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-beta.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distributions.-beta.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.distributions.-beta.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.distributions.-beta.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-categorical.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distributions.-categorical.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.distributions.-categorical.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.distributions.-categorical.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-dirichlet-multinomial.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distributions.-dirichlet-multinomial.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.distributions.-dirichlet-multinomial.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.distributions.-dirichlet-multinomial.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-dirichlet.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distributions.-dirichlet.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.distributions.-dirichlet.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.distributions.-dirichlet.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-distribution.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distributions.-distribution.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.distributions.-distribution.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.distributions.-distribution.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-exponential.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distributions.-exponential.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.distributions.-exponential.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.distributions.-exponential.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-gamma.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distributions.-gamma.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.distributions.-gamma.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.distributions.-gamma.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-laplace.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distributions.-laplace.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.distributions.-laplace.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.distributions.-laplace.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-multinomial.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distributions.-multinomial.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.distributions.-multinomial.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.distributions.-multinomial.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-normal.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distributions.-normal.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.distributions.-normal.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.distributions.-normal.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-register-k-l.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distributions.-register-k-l.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.distributions.-register-k-l.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.distributions.-register-k-l.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-reparameterization-type.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distributions.-reparameterization-type.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.distributions.-reparameterization-type.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.distributions.-reparameterization-type.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-student-t.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distributions.-student-t.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.distributions.-student-t.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.distributions.-student-t.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-uniform.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distributions.-uniform.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.distributions.-uniform.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.distributions.-uniform.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distributions.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.distributions.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.distributions.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.dtypes.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.dtypes.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.dtypes.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.dtypes.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-aborted-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.errors.-aborted-error.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.errors.-aborted-error.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.errors.-aborted-error.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-already-exists-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.errors.-already-exists-error.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.errors.-already-exists-error.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.errors.-already-exists-error.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-cancelled-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.errors.-cancelled-error.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.errors.-cancelled-error.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.errors.-cancelled-error.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-data-loss-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.errors.-data-loss-error.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.errors.-data-loss-error.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.errors.-data-loss-error.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-deadline-exceeded-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.errors.-deadline-exceeded-error.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.errors.-deadline-exceeded-error.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.errors.-deadline-exceeded-error.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-failed-precondition-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.errors.-failed-precondition-error.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.errors.-failed-precondition-error.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.errors.-failed-precondition-error.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-internal-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.errors.-internal-error.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.errors.-internal-error.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.errors.-internal-error.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-invalid-argument-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.errors.-invalid-argument-error.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.errors.-invalid-argument-error.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.errors.-invalid-argument-error.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-not-found-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.errors.-not-found-error.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.errors.-not-found-error.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.errors.-not-found-error.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-op-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.errors.-op-error.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.errors.-op-error.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.errors.-op-error.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-out-of-range-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.errors.-out-of-range-error.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.errors.-out-of-range-error.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.errors.-out-of-range-error.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-permission-denied-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.errors.-permission-denied-error.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.errors.-permission-denied-error.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.errors.-permission-denied-error.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-resource-exhausted-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.errors.-resource-exhausted-error.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.errors.-resource-exhausted-error.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.errors.-resource-exhausted-error.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-unauthenticated-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.errors.-unauthenticated-error.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.errors.-unauthenticated-error.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.errors.-unauthenticated-error.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-unavailable-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.errors.-unavailable-error.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.errors.-unavailable-error.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.errors.-unavailable-error.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-unimplemented-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.errors.-unimplemented-error.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.errors.-unimplemented-error.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.errors.-unimplemented-error.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-unknown-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.errors.-unknown-error.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.errors.-unknown-error.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.errors.-unknown-error.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.errors.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.errors.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.errors.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.raise_exception_on_not_ok_status.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.errors.raise_exception_on_not_ok_status.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.errors.raise_exception_on_not_ok_status.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.errors.raise_exception_on_not_ok_status.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-classifier.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-baseline-classifier.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.estimator.-baseline-classifier.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.-baseline-classifier.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-regressor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-baseline-regressor.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.estimator.-baseline-regressor.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.-baseline-regressor.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-best-exporter.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-best-exporter.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.estimator.-best-exporter.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.-best-exporter.pbtxt
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-boosted-trees-classifier.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-boosted-trees-classifier.pbtxt
new file mode 100644
index 0000000..c23b04b
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-boosted-trees-classifier.pbtxt
@@ -0,0 +1,58 @@
+path: "tensorflow.estimator.BoostedTreesClassifier"
+tf_class {
+  is_instance: "<class \'tensorflow.python.estimator.canned.boosted_trees.BoostedTreesClassifier\'>"
+  is_instance: "<class \'tensorflow.python.estimator.estimator.Estimator\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "config"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_dir"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_fn"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "params"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\', \'center_bias\', \'pruning_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'False\', \'none\'], "
+  }
+  member_method {
+    name: "eval_dir"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "export_savedmodel"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
+  }
+  member_method {
+    name: "get_variable_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_variable_value"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "latest_checkpoint"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "predict"
+    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\', \'yield_single_examples\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "train"
+    argspec: "args=[\'self\', \'input_fn\', \'hooks\', \'steps\', \'max_steps\', \'saving_listeners\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-boosted-trees-regressor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-boosted-trees-regressor.pbtxt
new file mode 100644
index 0000000..6878d28
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-boosted-trees-regressor.pbtxt
@@ -0,0 +1,58 @@
+path: "tensorflow.estimator.BoostedTreesRegressor"
+tf_class {
+  is_instance: "<class \'tensorflow.python.estimator.canned.boosted_trees.BoostedTreesRegressor\'>"
+  is_instance: "<class \'tensorflow.python.estimator.estimator.Estimator\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "config"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_dir"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_fn"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "params"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'label_dimension\', \'weight_column\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\', \'center_bias\', \'pruning_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'False\', \'none\'], "
+  }
+  member_method {
+    name: "eval_dir"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "export_savedmodel"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
+  }
+  member_method {
+    name: "get_variable_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_variable_value"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "latest_checkpoint"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "predict"
+    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\', \'yield_single_examples\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "train"
+    argspec: "args=[\'self\', \'input_fn\', \'hooks\', \'steps\', \'max_steps\', \'saving_listeners\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-classifier.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-classifier.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-classifier.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-classifier.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-regressor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-regressor.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-regressor.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-regressor.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-estimator-spec.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-estimator-spec.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.estimator.-estimator-spec.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.-estimator-spec.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-estimator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-estimator.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.estimator.-estimator.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.-estimator.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-eval-spec.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-eval-spec.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.estimator.-eval-spec.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.-eval-spec.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-exporter.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-exporter.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.estimator.-exporter.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.-exporter.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-final-exporter.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-final-exporter.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.estimator.-final-exporter.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.-final-exporter.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-latest-exporter.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-latest-exporter.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.estimator.-latest-exporter.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.-latest-exporter.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-linear-classifier.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-linear-classifier.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.estimator.-linear-classifier.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.-linear-classifier.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-linear-regressor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-linear-regressor.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.estimator.-linear-regressor.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.-linear-regressor.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-mode-keys.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-mode-keys.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.estimator.-mode-keys.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.-mode-keys.pbtxt
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-run-config.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-run-config.pbtxt
new file mode 100644
index 0000000..bf1f94b
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-run-config.pbtxt
@@ -0,0 +1,105 @@
+path: "tensorflow.estimator.RunConfig"
+tf_class {
+  is_instance: "<class \'tensorflow.python.estimator.run_config.RunConfig\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "cluster_spec"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "device_fn"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "eval_distribute"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "evaluation_master"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "global_id_in_cluster"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_chief"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "keep_checkpoint_every_n_hours"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "keep_checkpoint_max"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "log_step_count_steps"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "master"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_dir"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_ps_replicas"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_worker_replicas"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "protocol"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "save_checkpoints_secs"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "save_checkpoints_steps"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "save_summary_steps"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "service"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "session_config"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "task_id"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "task_type"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "tf_random_seed"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "train_distribute"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'model_dir\', \'tf_random_seed\', \'save_summary_steps\', \'save_checkpoints_steps\', \'save_checkpoints_secs\', \'session_config\', \'keep_checkpoint_max\', \'keep_checkpoint_every_n_hours\', \'log_step_count_steps\', \'train_distribute\', \'device_fn\', \'protocol\', \'eval_distribute\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'100\', \'<object object instance>\', \'<object object instance>\', \'None\', \'5\', \'10000\', \'100\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "replace"
+    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-train-spec.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-train-spec.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.estimator.-train-spec.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.-train-spec.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-vocab-info.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-vocab-info.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.estimator.-vocab-info.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.-vocab-info.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-warm-start-settings.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-warm-start-settings.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.estimator.-warm-start-settings.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.-warm-start-settings.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.export.-classification-output.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-classification-output.__metaclass__.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.estimator.export.-classification-output.__metaclass__.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-classification-output.__metaclass__.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.export.-classification-output.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-classification-output.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.estimator.export.-classification-output.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-classification-output.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.export.-export-output.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-export-output.__metaclass__.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.estimator.export.-export-output.__metaclass__.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-export-output.__metaclass__.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.export.-export-output.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-export-output.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.estimator.export.-export-output.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-export-output.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.export.-predict-output.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-predict-output.__metaclass__.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.estimator.export.-predict-output.__metaclass__.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-predict-output.__metaclass__.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.export.-predict-output.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-predict-output.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.estimator.export.-predict-output.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-predict-output.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.export.-regression-output.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-regression-output.__metaclass__.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.estimator.export.-regression-output.__metaclass__.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-regression-output.__metaclass__.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.export.-regression-output.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-regression-output.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.estimator.export.-regression-output.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-regression-output.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.export.-serving-input-receiver.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-serving-input-receiver.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.estimator.export.-serving-input-receiver.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-serving-input-receiver.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.export.-tensor-serving-input-receiver.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-tensor-serving-input-receiver.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.estimator.export.-tensor-serving-input-receiver.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-tensor-serving-input-receiver.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.export.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.estimator.export.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.export.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.inputs.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.inputs.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.estimator.inputs.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.inputs.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.estimator.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.feature_column.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.feature_column.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.feature_column.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.feature_column.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.gfile.-fast-g-file.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.gfile.-fast-g-file.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.gfile.-fast-g-file.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.gfile.-fast-g-file.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.gfile.-g-file.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.gfile.-g-file.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.gfile.-g-file.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.gfile.-g-file.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.gfile.-open.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.gfile.-open.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.gfile.-open.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.gfile.-open.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.gfile.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.gfile.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.gfile.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.gfile.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.graph_util.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.graph_util.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.graph_util.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.graph_util.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.image.-resize-method.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.image.-resize-method.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.image.-resize-method.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.image.-resize-method.pbtxt
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.image.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.image.pbtxt
new file mode 100644
index 0000000..5c46dc5
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.image.pbtxt
@@ -0,0 +1,251 @@
+path: "tensorflow.image"
+tf_module {
+  member {
+    name: "ResizeMethod"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "adjust_brightness"
+    argspec: "args=[\'image\', \'delta\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "adjust_contrast"
+    argspec: "args=[\'images\', \'contrast_factor\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "adjust_gamma"
+    argspec: "args=[\'image\', \'gamma\', \'gain\'], varargs=None, keywords=None, defaults=[\'1\', \'1\'], "
+  }
+  member_method {
+    name: "adjust_hue"
+    argspec: "args=[\'image\', \'delta\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "adjust_jpeg_quality"
+    argspec: "args=[\'image\', \'jpeg_quality\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "adjust_saturation"
+    argspec: "args=[\'image\', \'saturation_factor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "central_crop"
+    argspec: "args=[\'image\', \'central_fraction\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "convert_image_dtype"
+    argspec: "args=[\'image\', \'dtype\', \'saturate\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "crop_and_resize"
+    argspec: "args=[\'image\', \'boxes\', \'box_ind\', \'crop_size\', \'method\', \'extrapolation_value\', \'name\'], varargs=None, keywords=None, defaults=[\'bilinear\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "crop_to_bounding_box"
+    argspec: "args=[\'image\', \'offset_height\', \'offset_width\', \'target_height\', \'target_width\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "decode_and_crop_jpeg"
+    argspec: "args=[\'contents\', \'crop_window\', \'channels\', \'ratio\', \'fancy_upscaling\', \'try_recover_truncated\', \'acceptable_fraction\', \'dct_method\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'1\', \'True\', \'False\', \'1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "decode_bmp"
+    argspec: "args=[\'contents\', \'channels\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'None\'], "
+  }
+  member_method {
+    name: "decode_gif"
+    argspec: "args=[\'contents\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "decode_image"
+    argspec: "args=[\'contents\', \'channels\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'uint8\'>\", \'None\'], "
+  }
+  member_method {
+    name: "decode_jpeg"
+    argspec: "args=[\'contents\', \'channels\', \'ratio\', \'fancy_upscaling\', \'try_recover_truncated\', \'acceptable_fraction\', \'dct_method\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'1\', \'True\', \'False\', \'1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "decode_png"
+    argspec: "args=[\'contents\', \'channels\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \"<dtype: \'uint8\'>\", \'None\'], "
+  }
+  member_method {
+    name: "draw_bounding_boxes"
+    argspec: "args=[\'images\', \'boxes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "encode_jpeg"
+    argspec: "args=[\'image\', \'format\', \'quality\', \'progressive\', \'optimize_size\', \'chroma_downsampling\', \'density_unit\', \'x_density\', \'y_density\', \'xmp_metadata\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'95\', \'False\', \'False\', \'True\', \'in\', \'300\', \'300\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "encode_png"
+    argspec: "args=[\'image\', \'compression\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+  }
+  member_method {
+    name: "extract_glimpse"
+    argspec: "args=[\'input\', \'size\', \'offsets\', \'centered\', \'normalized\', \'uniform_noise\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'True\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "extract_image_patches"
+    argspec: "args=[\'images\', \'ksizes\', \'strides\', \'rates\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "extract_jpeg_shape"
+    argspec: "args=[\'contents\', \'output_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "flip_left_right"
+    argspec: "args=[\'image\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "flip_up_down"
+    argspec: "args=[\'image\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "grayscale_to_rgb"
+    argspec: "args=[\'images\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "hsv_to_rgb"
+    argspec: "args=[\'images\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "image_gradients"
+    argspec: "args=[\'image\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_jpeg"
+    argspec: "args=[\'contents\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "non_max_suppression"
+    argspec: "args=[\'boxes\', \'scores\', \'max_output_size\', \'iou_threshold\', \'score_threshold\', \'name\'], varargs=None, keywords=None, defaults=[\'0.5\', \'-inf\', \'None\'], "
+  }
+  member_method {
+    name: "non_max_suppression_overlaps"
+    argspec: "args=[\'overlaps\', \'scores\', \'max_output_size\', \'overlap_threshold\', \'score_threshold\', \'name\'], varargs=None, keywords=None, defaults=[\'0.5\', \'-inf\', \'None\'], "
+  }
+  member_method {
+    name: "non_max_suppression_padded"
+    argspec: "args=[\'boxes\', \'scores\', \'max_output_size\', \'iou_threshold\', \'score_threshold\', \'pad_to_max_output_size\', \'name\'], varargs=None, keywords=None, defaults=[\'0.5\', \'-inf\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "pad_to_bounding_box"
+    argspec: "args=[\'image\', \'offset_height\', \'offset_width\', \'target_height\', \'target_width\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "per_image_standardization"
+    argspec: "args=[\'image\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "psnr"
+    argspec: "args=[\'a\', \'b\', \'max_val\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "random_brightness"
+    argspec: "args=[\'image\', \'max_delta\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "random_contrast"
+    argspec: "args=[\'image\', \'lower\', \'upper\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "random_flip_left_right"
+    argspec: "args=[\'image\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "random_flip_up_down"
+    argspec: "args=[\'image\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "random_hue"
+    argspec: "args=[\'image\', \'max_delta\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "random_jpeg_quality"
+    argspec: "args=[\'image\', \'min_jpeg_quality\', \'max_jpeg_quality\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "random_saturation"
+    argspec: "args=[\'image\', \'lower\', \'upper\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "resize_area"
+    argspec: "args=[\'images\', \'size\', \'align_corners\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "resize_bicubic"
+    argspec: "args=[\'images\', \'size\', \'align_corners\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "resize_bilinear"
+    argspec: "args=[\'images\', \'size\', \'align_corners\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "resize_image_with_crop_or_pad"
+    argspec: "args=[\'image\', \'target_height\', \'target_width\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "resize_image_with_pad"
+    argspec: "args=[\'image\', \'target_height\', \'target_width\', \'method\'], varargs=None, keywords=None, defaults=[\'0\'], "
+  }
+  member_method {
+    name: "resize_images"
+    argspec: "args=[\'images\', \'size\', \'method\', \'align_corners\', \'preserve_aspect_ratio\'], varargs=None, keywords=None, defaults=[\'0\', \'False\', \'False\'], "
+  }
+  member_method {
+    name: "resize_nearest_neighbor"
+    argspec: "args=[\'images\', \'size\', \'align_corners\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "rgb_to_grayscale"
+    argspec: "args=[\'images\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "rgb_to_hsv"
+    argspec: "args=[\'images\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "rgb_to_yiq"
+    argspec: "args=[\'images\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "rgb_to_yuv"
+    argspec: "args=[\'images\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "rot90"
+    argspec: "args=[\'image\', \'k\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
+  }
+  member_method {
+    name: "sample_distorted_bounding_box"
+    argspec: "args=[\'image_size\', \'bounding_boxes\', \'seed\', \'seed2\', \'min_object_covered\', \'aspect_ratio_range\', \'area_range\', \'max_attempts\', \'use_image_if_no_bounding_boxes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'0.1\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "sobel_edges"
+    argspec: "args=[\'image\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ssim"
+    argspec: "args=[\'img1\', \'img2\', \'max_val\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ssim_multiscale"
+    argspec: "args=[\'img1\', \'img2\', \'max_val\', \'power_factors\'], varargs=None, keywords=None, defaults=[\'(0.0448, 0.2856, 0.3001, 0.2363, 0.1333)\'], "
+  }
+  member_method {
+    name: "total_variation"
+    argspec: "args=[\'images\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "transpose_image"
+    argspec: "args=[\'image\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "yiq_to_rgb"
+    argspec: "args=[\'images\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "yuv_to_rgb"
+    argspec: "args=[\'images\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.initializers.constant.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.initializers.constant.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.initializers.constant.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.initializers.constant.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.initializers.identity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.initializers.identity.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.initializers.identity.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.initializers.identity.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.initializers.ones.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.initializers.ones.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.initializers.ones.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.initializers.ones.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.initializers.orthogonal.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.initializers.orthogonal.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.initializers.orthogonal.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.initializers.orthogonal.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.initializers.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.initializers.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.initializers.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.initializers.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.initializers.random_normal.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.initializers.random_normal.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.initializers.random_normal.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.initializers.random_normal.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.initializers.random_uniform.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.initializers.random_uniform.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.initializers.random_uniform.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.initializers.random_uniform.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.initializers.truncated_normal.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.initializers.truncated_normal.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.initializers.truncated_normal.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.initializers.truncated_normal.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.initializers.uniform_unit_scaling.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.initializers.uniform_unit_scaling.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.initializers.uniform_unit_scaling.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.initializers.uniform_unit_scaling.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.initializers.variance_scaling.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.initializers.variance_scaling.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.initializers.variance_scaling.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.initializers.variance_scaling.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.initializers.zeros.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.initializers.zeros.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.initializers.zeros.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.initializers.zeros.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.io.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.io.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.io.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.io.pbtxt
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt
new file mode 100644
index 0000000..e579fe6
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt
@@ -0,0 +1,268 @@
+path: "tensorflow.keras.Model"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_spec"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "layers"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "state_updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "stateful"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "uses_learning_phase"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\', \'training\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "compile"
+    argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\', \'distribute\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "evaluate_generator"
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+  }
+  member_method {
+    name: "fit"
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "fit_generator"
+    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_layer"
+    argspec: "args=[\'self\', \'name\', \'index\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "load_weights"
+    argspec: "args=[\'self\', \'filepath\', \'by_name\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+  member_method {
+    name: "predict"
+    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "predict_generator"
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+  }
+  member_method {
+    name: "predict_on_batch"
+    argspec: "args=[\'self\', \'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "save"
+    argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'include_optimizer\'], varargs=None, keywords=None, defaults=[\'True\', \'True\'], "
+  }
+  member_method {
+    name: "save_weights"
+    argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "summary"
+    argspec: "args=[\'self\', \'line_length\', \'positions\', \'print_fn\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "test_on_batch"
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "to_json"
+    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "to_yaml"
+    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "train_on_batch"
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt
new file mode 100644
index 0000000..97688fc
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt
@@ -0,0 +1,285 @@
+path: "tensorflow.keras.Sequential"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.engine.sequential.Sequential\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_spec"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "layers"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "state_updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "stateful"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "uses_learning_phase"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'layers\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add"
+    argspec: "args=[\'self\', \'layer\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\', \'training\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "compile"
+    argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\', \'distribute\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "evaluate_generator"
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+  }
+  member_method {
+    name: "fit"
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "fit_generator"
+    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_layer"
+    argspec: "args=[\'self\', \'name\', \'index\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "load_weights"
+    argspec: "args=[\'self\', \'filepath\', \'by_name\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+  member_method {
+    name: "pop"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "predict"
+    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "predict_classes"
+    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\'], varargs=None, keywords=None, defaults=[\'32\', \'0\'], "
+  }
+  member_method {
+    name: "predict_generator"
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+  }
+  member_method {
+    name: "predict_on_batch"
+    argspec: "args=[\'self\', \'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "predict_proba"
+    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\'], varargs=None, keywords=None, defaults=[\'32\', \'0\'], "
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "save"
+    argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'include_optimizer\'], varargs=None, keywords=None, defaults=[\'True\', \'True\'], "
+  }
+  member_method {
+    name: "save_weights"
+    argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "summary"
+    argspec: "args=[\'self\', \'line_length\', \'positions\', \'print_fn\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "test_on_batch"
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "to_json"
+    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "to_yaml"
+    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "train_on_batch"
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.activations.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.activations.pbtxt
new file mode 100644
index 0000000..2e9de9e
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.activations.pbtxt
@@ -0,0 +1,55 @@
+path: "tensorflow.keras.activations"
+tf_module {
+  member_method {
+    name: "deserialize"
+    argspec: "args=[\'name\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "elu"
+    argspec: "args=[\'x\', \'alpha\'], varargs=None, keywords=None, defaults=[\'1.0\'], "
+  }
+  member_method {
+    name: "get"
+    argspec: "args=[\'identifier\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "hard_sigmoid"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "linear"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "relu"
+    argspec: "args=[\'x\', \'alpha\', \'max_value\', \'threshold\'], varargs=None, keywords=None, defaults=[\'0.0\', \'None\', \'0\'], "
+  }
+  member_method {
+    name: "selu"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "serialize"
+    argspec: "args=[\'activation\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "sigmoid"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "softmax"
+    argspec: "args=[\'x\', \'axis\'], varargs=None, keywords=None, defaults=[\'-1\'], "
+  }
+  member_method {
+    name: "softplus"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "softsign"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "tanh"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.backend.name_scope.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.backend.name_scope.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.backend.name_scope.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.backend.name_scope.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.backend.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.backend.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.backend.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.backend.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-base-logger.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-base-logger.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.callbacks.-base-logger.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-base-logger.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-c-s-v-logger.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-c-s-v-logger.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.callbacks.-c-s-v-logger.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-c-s-v-logger.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-callback.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-callback.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.callbacks.-callback.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-callback.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-early-stopping.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-early-stopping.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.callbacks.-early-stopping.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-early-stopping.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-history.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-history.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.callbacks.-history.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-history.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-lambda-callback.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-lambda-callback.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.callbacks.-lambda-callback.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-lambda-callback.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-model-checkpoint.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-model-checkpoint.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.callbacks.-model-checkpoint.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-model-checkpoint.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-progbar-logger.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-progbar-logger.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.callbacks.-progbar-logger.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-progbar-logger.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-reduce-l-r-on-plateau.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-reduce-l-r-on-plateau.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.callbacks.-reduce-l-r-on-plateau.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-reduce-l-r-on-plateau.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-remote-monitor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-remote-monitor.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.callbacks.-remote-monitor.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-remote-monitor.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-tensor-board.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-tensor-board.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.callbacks.-tensor-board.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-tensor-board.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-terminate-on-na-n.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-terminate-on-na-n.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.callbacks.-terminate-on-na-n.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-terminate-on-na-n.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.callbacks.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.constraints.-constraint.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.constraints.-constraint.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.constraints.-constraint.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.constraints.-constraint.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.constraints.-max-norm.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.constraints.-max-norm.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.constraints.-max-norm.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.constraints.-max-norm.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.constraints.-min-max-norm.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.constraints.-min-max-norm.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.constraints.-min-max-norm.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.constraints.-min-max-norm.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.constraints.-non-neg.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.constraints.-non-neg.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.constraints.-non-neg.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.constraints.-non-neg.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.constraints.-unit-norm.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.constraints.-unit-norm.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.constraints.-unit-norm.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.constraints.-unit-norm.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.constraints.max_norm.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.constraints.max_norm.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.constraints.max_norm.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.constraints.max_norm.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.constraints.min_max_norm.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.constraints.min_max_norm.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.constraints.min_max_norm.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.constraints.min_max_norm.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.constraints.non_neg.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.constraints.non_neg.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.constraints.non_neg.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.constraints.non_neg.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.constraints.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.constraints.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.constraints.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.constraints.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.constraints.unit_norm.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.constraints.unit_norm.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.constraints.unit_norm.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.constraints.unit_norm.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.datasets.boston_housing.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.datasets.boston_housing.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.datasets.boston_housing.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.datasets.boston_housing.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.datasets.cifar10.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.datasets.cifar10.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.datasets.cifar10.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.datasets.cifar10.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.datasets.cifar100.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.datasets.cifar100.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.datasets.cifar100.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.datasets.cifar100.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.datasets.fashion_mnist.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.datasets.fashion_mnist.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.datasets.fashion_mnist.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.datasets.fashion_mnist.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.datasets.imdb.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.datasets.imdb.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.datasets.imdb.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.datasets.imdb.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.datasets.mnist.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.datasets.mnist.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.datasets.mnist.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.datasets.mnist.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.datasets.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.datasets.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.datasets.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.datasets.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.datasets.reuters.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.datasets.reuters.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.datasets.reuters.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.datasets.reuters.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.estimator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.estimator.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.estimator.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.estimator.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.-constant.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.-constant.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.initializers.-constant.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.-constant.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.-identity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.-identity.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.initializers.-identity.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.-identity.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.-initializer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.-initializer.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.initializers.-initializer.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.-initializer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.-ones.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.-ones.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.initializers.-ones.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.-ones.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.-orthogonal.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.-orthogonal.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.initializers.-orthogonal.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.-orthogonal.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.-random-normal.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.-random-normal.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.initializers.-random-normal.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.-random-normal.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.-random-uniform.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.-random-uniform.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.initializers.-random-uniform.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.-random-uniform.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.-truncated-normal.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.-truncated-normal.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.initializers.-truncated-normal.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.-truncated-normal.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.-variance-scaling.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.-variance-scaling.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.initializers.-variance-scaling.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.-variance-scaling.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.-zeros.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.-zeros.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.initializers.-zeros.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.-zeros.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.constant.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.constant.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.initializers.constant.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.constant.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.identity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.identity.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.initializers.identity.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.identity.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.normal.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.normal.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.initializers.normal.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.normal.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.ones.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.ones.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.initializers.ones.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.ones.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.orthogonal.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.orthogonal.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.initializers.orthogonal.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.orthogonal.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.initializers.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.random_normal.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.random_normal.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.initializers.random_normal.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.random_normal.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.random_uniform.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.random_uniform.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.initializers.random_uniform.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.random_uniform.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.truncated_normal.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.truncated_normal.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.initializers.truncated_normal.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.truncated_normal.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.uniform.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.uniform.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.initializers.uniform.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.uniform.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.zeros.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.zeros.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.initializers.zeros.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.initializers.zeros.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-activation.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activation.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-activation.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activation.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-activity-regularization.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activity-regularization.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-activity-regularization.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activity-regularization.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-add.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-add.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-add.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-add.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-alpha-dropout.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-alpha-dropout.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-alpha-dropout.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-alpha-dropout.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling1-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling2-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling2-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling2-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling3-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling3-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling3-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-average.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool1-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool2-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool2-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool2-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool3-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool3-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool3-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-batch-normalization.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-batch-normalization.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-batch-normalization.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-batch-normalization.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-bidirectional.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-bidirectional.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-bidirectional.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-bidirectional.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-concatenate.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-concatenate.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-concatenate.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-concatenate.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-conv1-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution1-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping1-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping1-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping1-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping2-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping2-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping2-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping3-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping3-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping3-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-dense.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-dense.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-dot.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dot.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-dot.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dot.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-dropout.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dropout.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-dropout.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dropout.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-e-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-e-l-u.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-e-l-u.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-e-l-u.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-embedding.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-embedding.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-embedding.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-embedding.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-flatten.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-flatten.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-flatten.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-flatten.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u-cell.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u-cell.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u-cell.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-dropout.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-dropout.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-dropout.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-dropout.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-noise.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-noise.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-noise.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-noise.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-input-layer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-layer.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-input-layer.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-layer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-input-spec.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-spec.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-input-spec.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-spec.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-lambda.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt
similarity index 97%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-lambda.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt
index 5d05cf6..2dff7a6 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-lambda.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt
@@ -118,7 +118,7 @@
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-layer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-layer.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-leaky-re-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected1-d.pbtxt
similarity index 95%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected1-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected1-d.pbtxt
index f754fa1..ff19dcc 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected1-d.pbtxt
@@ -82,7 +82,7 @@
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'None\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\', \'implementation\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'None\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'1\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected2-d.pbtxt
similarity index 95%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected2-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected2-d.pbtxt
index c9516b8..3c278fe 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected2-d.pbtxt
@@ -82,7 +82,7 @@
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\', \'implementation\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'1\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-masking.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-masking.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-masking.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-masking.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool1-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool2-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool2-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool2-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool3-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool3-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool3-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling1-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling2-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling2-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling2-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling3-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling3-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling3-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-maximum.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-maximum.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-maximum.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-maximum.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-minimum.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-minimum.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-minimum.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-minimum.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-multiply.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-multiply.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-multiply.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-multiply.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-p-re-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-p-re-l-u.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-p-re-l-u.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-p-re-l-u.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-permute.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-permute.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-permute.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-permute.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-r-n-n.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-r-n-n.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-re-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-re-l-u.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-re-l-u.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-re-l-u.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-repeat-vector.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-repeat-vector.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-repeat-vector.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-repeat-vector.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-reshape.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-reshape.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-reshape.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-reshape.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv1-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv1-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv1-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv2-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv2-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv2-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-softmax.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-softmax.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-softmax.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-softmax.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
similarity index 98%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
index 1160d28..6718e36 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
@@ -61,6 +61,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "output_size"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "state_size"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-subtract.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-subtract.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-subtract.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-subtract.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-time-distributed.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-time-distributed.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-time-distributed.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-time-distributed.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling1-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling1-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling1-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling2-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling2-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling2-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling3-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling3-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling3-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-wrapper.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-wrapper.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-wrapper.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-wrapper.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding1-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding1-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding1-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding2-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding2-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding2-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding3-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding3-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding3-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.layers.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.losses.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.losses.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.losses.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.metrics.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.metrics.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.pbtxt
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt
new file mode 100644
index 0000000..56914e1
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt
@@ -0,0 +1,268 @@
+path: "tensorflow.keras.models.Model"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_spec"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "layers"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "state_updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "stateful"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "uses_learning_phase"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\', \'training\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "compile"
+    argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\', \'distribute\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "evaluate_generator"
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+  }
+  member_method {
+    name: "fit"
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "fit_generator"
+    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_layer"
+    argspec: "args=[\'self\', \'name\', \'index\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "load_weights"
+    argspec: "args=[\'self\', \'filepath\', \'by_name\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+  member_method {
+    name: "predict"
+    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "predict_generator"
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+  }
+  member_method {
+    name: "predict_on_batch"
+    argspec: "args=[\'self\', \'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "save"
+    argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'include_optimizer\'], varargs=None, keywords=None, defaults=[\'True\', \'True\'], "
+  }
+  member_method {
+    name: "save_weights"
+    argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "summary"
+    argspec: "args=[\'self\', \'line_length\', \'positions\', \'print_fn\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "test_on_batch"
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "to_json"
+    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "to_yaml"
+    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "train_on_batch"
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt
new file mode 100644
index 0000000..acfb352
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt
@@ -0,0 +1,285 @@
+path: "tensorflow.keras.models.Sequential"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.engine.sequential.Sequential\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_spec"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "layers"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "state_updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "stateful"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "uses_learning_phase"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'layers\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add"
+    argspec: "args=[\'self\', \'layer\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\', \'training\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "compile"
+    argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\', \'distribute\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "evaluate_generator"
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+  }
+  member_method {
+    name: "fit"
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "fit_generator"
+    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_layer"
+    argspec: "args=[\'self\', \'name\', \'index\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "load_weights"
+    argspec: "args=[\'self\', \'filepath\', \'by_name\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+  member_method {
+    name: "pop"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "predict"
+    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "predict_classes"
+    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\'], varargs=None, keywords=None, defaults=[\'32\', \'0\'], "
+  }
+  member_method {
+    name: "predict_generator"
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+  }
+  member_method {
+    name: "predict_on_batch"
+    argspec: "args=[\'self\', \'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "predict_proba"
+    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\'], varargs=None, keywords=None, defaults=[\'32\', \'0\'], "
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "save"
+    argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'include_optimizer\'], varargs=None, keywords=None, defaults=[\'True\', \'True\'], "
+  }
+  member_method {
+    name: "save_weights"
+    argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "summary"
+    argspec: "args=[\'self\', \'line_length\', \'positions\', \'print_fn\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "test_on_batch"
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "to_json"
+    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "to_yaml"
+    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "train_on_batch"
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.models.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.models.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.models.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adadelta.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adadelta.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adadelta.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adadelta.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adagrad.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adagrad.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adagrad.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adagrad.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adam.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adam.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adam.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adam.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adamax.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adamax.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adamax.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adamax.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-nadam.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-nadam.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.optimizers.-nadam.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-nadam.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-optimizer.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.optimizers.-optimizer.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-optimizer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-r-m-sprop.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-r-m-sprop.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.optimizers.-r-m-sprop.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-r-m-sprop.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-s-g-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-s-g-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.optimizers.-s-g-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-s-g-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.optimizers.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.regularizers.-l1-l2.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.-l1-l2.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.regularizers.-l1-l2.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.-l1-l2.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.regularizers.-regularizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.-regularizer.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.regularizers.-regularizer.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.-regularizer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.regularizers.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.regularizers.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.utils.-custom-object-scope.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.utils.-custom-object-scope.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.utils.-custom-object-scope.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.utils.-custom-object-scope.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.utils.-generator-enqueuer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.utils.-generator-enqueuer.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.utils.-generator-enqueuer.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.utils.-generator-enqueuer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.utils.-h-d-f5-matrix.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.utils.-h-d-f5-matrix.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.utils.-h-d-f5-matrix.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.utils.-h-d-f5-matrix.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.utils.-progbar.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.utils.-progbar.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.utils.-progbar.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.utils.-progbar.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.utils.-sequence-enqueuer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.utils.-sequence-enqueuer.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.utils.-sequence-enqueuer.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.utils.-sequence-enqueuer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.utils.-sequence.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.utils.-sequence.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.utils.-sequence.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.utils.-sequence.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.utils.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.utils.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.utils.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.utils.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.wrappers.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.wrappers.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.wrappers.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.wrappers.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.wrappers.scikit_learn.-keras-classifier.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.wrappers.scikit_learn.-keras-classifier.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.wrappers.scikit_learn.-keras-classifier.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.wrappers.scikit_learn.-keras-classifier.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.wrappers.scikit_learn.-keras-regressor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.wrappers.scikit_learn.-keras-regressor.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.wrappers.scikit_learn.-keras-regressor.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.wrappers.scikit_learn.-keras-regressor.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.wrappers.scikit_learn.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.wrappers.scikit_learn.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.keras.wrappers.scikit_learn.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.wrappers.scikit_learn.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling1-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.layers.-average-pooling1-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling1-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling2-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.layers.-average-pooling2-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling2-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling3-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.layers.-average-pooling3-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling3-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-batch-normalization.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-batch-normalization.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.layers.-batch-normalization.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.layers.-batch-normalization.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv1-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.layers.-conv1-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.layers.-conv1-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-conv2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d-transpose.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.layers.-conv2-d-transpose.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d-transpose.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.layers.-conv2-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d-transpose.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.layers.-conv3-d-transpose.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d-transpose.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.layers.-conv3-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-dense.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-dense.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.layers.-dense.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.layers.-dense.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-dropout.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-dropout.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.layers.-dropout.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.layers.-dropout.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-flatten.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-flatten.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.layers.-flatten.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.layers.-flatten.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-input-spec.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-input-spec.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.layers.-input-spec.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.layers.-input-spec.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-layer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-layer.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.layers.-layer.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.layers.-layer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling1-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.layers.-max-pooling1-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling1-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling2-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.layers.-max-pooling2-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling2-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling3-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.layers.-max-pooling3-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling3-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-separable-conv1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv1-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.layers.-separable-conv1-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv1-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-separable-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv2-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.layers.-separable-conv2-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv2-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.layers.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.layers.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-block-diag.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-block-diag.__metaclass__.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-block-diag.__metaclass__.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-block-diag.__metaclass__.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-block-diag.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-block-diag.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-block-diag.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-block-diag.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant.__metaclass__.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant.__metaclass__.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant.__metaclass__.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant2-d.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant2-d.__metaclass__.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant2-d.__metaclass__.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant2-d.__metaclass__.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant3-d.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant3-d.__metaclass__.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant3-d.__metaclass__.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant3-d.__metaclass__.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-composition.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-composition.__metaclass__.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-composition.__metaclass__.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-composition.__metaclass__.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-composition.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-composition.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-composition.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-composition.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-diag.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-diag.__metaclass__.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-diag.__metaclass__.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-diag.__metaclass__.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-diag.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-diag.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-diag.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-diag.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-full-matrix.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-full-matrix.__metaclass__.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-full-matrix.__metaclass__.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-full-matrix.__metaclass__.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-full-matrix.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-identity.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-identity.__metaclass__.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-identity.__metaclass__.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-identity.__metaclass__.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-identity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-identity.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-identity.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-identity.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-kronecker.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-kronecker.__metaclass__.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-kronecker.__metaclass__.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-kronecker.__metaclass__.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-kronecker.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-kronecker.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-kronecker.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-kronecker.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-low-rank-update.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-low-rank-update.__metaclass__.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-low-rank-update.__metaclass__.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-low-rank-update.__metaclass__.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-lower-triangular.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-lower-triangular.__metaclass__.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-lower-triangular.__metaclass__.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-lower-triangular.__metaclass__.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-scaled-identity.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-scaled-identity.__metaclass__.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-scaled-identity.__metaclass__.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-scaled-identity.__metaclass__.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-zeros.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-zeros.__metaclass__.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-zeros.__metaclass__.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-zeros.__metaclass__.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-zeros.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-zeros.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-zeros.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-zeros.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator.__metaclass__.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator.__metaclass__.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator.__metaclass__.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.linalg.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.logging.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.logging.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.logging.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.logging.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.losses.-reduction.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.losses.-reduction.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.losses.-reduction.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.losses.-reduction.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.losses.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.losses.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.losses.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.losses.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.manip.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.manip.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.manip.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.manip.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.math.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.math.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.math.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.math.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.metrics.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.metrics.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.metrics.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.metrics.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.name_scope.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.name_scope.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.name_scope.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.name_scope.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.nn.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.nn.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
similarity index 96%
rename from tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
index c747730..e606eab 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
@@ -101,7 +101,7 @@
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'num_units\', \'forget_bias\', \'state_is_tuple\', \'activation\', \'reuse\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'1.0\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'num_units\', \'forget_bias\', \'state_is_tuple\', \'activation\', \'reuse\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'1.0\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -125,7 +125,7 @@
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'inputs_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
similarity index 96%
rename from tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
index d251f54..5deb02d 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
@@ -101,7 +101,7 @@
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'num_units\', \'activation\', \'reuse\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'num_units\', \'activation\', \'reuse\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -125,7 +125,7 @@
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'inputs_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
similarity index 96%
rename from tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
index d76eab7..32fa151 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
@@ -101,7 +101,7 @@
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'num_units\', \'activation\', \'reuse\', \'kernel_initializer\', \'bias_initializer\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'num_units\', \'activation\', \'reuse\', \'kernel_initializer\', \'bias_initializer\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -125,7 +125,7 @@
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'inputs_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
similarity index 95%
rename from tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
index 944db6a..30c6c2c 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
@@ -101,7 +101,7 @@
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'num_units\', \'use_peepholes\', \'cell_clip\', \'initializer\', \'num_proj\', \'proj_clip\', \'num_unit_shards\', \'num_proj_shards\', \'forget_bias\', \'state_is_tuple\', \'activation\', \'reuse\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'1.0\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'num_units\', \'use_peepholes\', \'cell_clip\', \'initializer\', \'num_proj\', \'proj_clip\', \'num_unit_shards\', \'num_proj_shards\', \'forget_bias\', \'state_is_tuple\', \'activation\', \'reuse\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'False\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'1.0\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -125,7 +125,7 @@
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'inputs_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-l-s-t-m-state-tuple.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-l-s-t-m-state-tuple.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-l-s-t-m-state-tuple.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-l-s-t-m-state-tuple.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.ones_initializer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.ones_initializer.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.ones_initializer.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.ones_initializer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.orthogonal_initializer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.orthogonal_initializer.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.orthogonal_initializer.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.orthogonal_initializer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
similarity index 98%
rename from tensorflow/tools/api/golden/tensorflow.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.pbtxt
index 5eb42b4..4de662f 100644
--- a/tensorflow/tools/api/golden/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
@@ -785,6 +785,10 @@
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
+    name: "batch_gather"
+    argspec: "args=[\'params\', \'indices\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
     name: "batch_to_space"
     argspec: "args=[\'input\', \'crops\', \'block_size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
@@ -1902,19 +1906,19 @@
   }
   member_method {
     name: "sparse_reduce_max"
-    argspec: "args=[\'sp_input\', \'axis\', \'keep_dims\', \'reduction_axes\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'sp_input\', \'axis\', \'keepdims\', \'reduction_axes\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "sparse_reduce_max_sparse"
-    argspec: "args=[\'sp_input\', \'axis\', \'keep_dims\', \'reduction_axes\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'sp_input\', \'axis\', \'keepdims\', \'reduction_axes\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "sparse_reduce_sum"
-    argspec: "args=[\'sp_input\', \'axis\', \'keep_dims\', \'reduction_axes\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'sp_input\', \'axis\', \'keepdims\', \'reduction_axes\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "sparse_reduce_sum_sparse"
-    argspec: "args=[\'sp_input\', \'axis\', \'keep_dims\', \'reduction_axes\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'sp_input\', \'axis\', \'keepdims\', \'reduction_axes\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "sparse_reorder"
diff --git a/tensorflow/tools/api/golden/tensorflow.profiler.-advice-proto.-checker.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.profiler.-advice-proto.-checker.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.profiler.-advice-proto.-checker.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.profiler.-advice-proto.-checker.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.profiler.-advice-proto.-checkers-entry.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.profiler.-advice-proto.-checkers-entry.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.profiler.-advice-proto.-checkers-entry.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.profiler.-advice-proto.-checkers-entry.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.profiler.-advice-proto.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.profiler.-advice-proto.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.profiler.-advice-proto.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.profiler.-advice-proto.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.profiler.-graph-node-proto.-input-shapes-entry.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.profiler.-graph-node-proto.-input-shapes-entry.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.profiler.-graph-node-proto.-input-shapes-entry.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.profiler.-graph-node-proto.-input-shapes-entry.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.profiler.-graph-node-proto.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.profiler.-graph-node-proto.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.profiler.-graph-node-proto.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.profiler.-graph-node-proto.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.profiler.-multi-graph-node-proto.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.profiler.-multi-graph-node-proto.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.profiler.-multi-graph-node-proto.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.profiler.-multi-graph-node-proto.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.profiler.-op-log-proto.-id-to-string-entry.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.profiler.-op-log-proto.-id-to-string-entry.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.profiler.-op-log-proto.-id-to-string-entry.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.profiler.-op-log-proto.-id-to-string-entry.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.profiler.-op-log-proto.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.profiler.-op-log-proto.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.profiler.-op-log-proto.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.profiler.-op-log-proto.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.profiler.-profile-option-builder.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.profiler.-profile-option-builder.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.profiler.-profile-option-builder.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.profiler.-profile-option-builder.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.profiler.-profiler.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.profiler.-profiler.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.profiler.-profiler.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.profiler.-profiler.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.profiler.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.profiler.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.profiler.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.profiler.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.python_io.-t-f-record-compression-type.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.python_io.-t-f-record-compression-type.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.python_io.-t-f-record-compression-type.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.python_io.-t-f-record-compression-type.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.python_io.-t-f-record-options.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.python_io.-t-f-record-options.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.python_io.-t-f-record-options.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.python_io.-t-f-record-options.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.python_io.-t-f-record-writer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.python_io.-t-f-record-writer.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.python_io.-t-f-record-writer.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.python_io.-t-f-record-writer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.python_io.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.python_io.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.python_io.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.python_io.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.quantization.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.quantization.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.quantization.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.quantization.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.random_normal_initializer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.random_normal_initializer.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.random_normal_initializer.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.random_normal_initializer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.random_uniform_initializer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.random_uniform_initializer.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.random_uniform_initializer.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.random_uniform_initializer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.resource_loader.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.resource_loader.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.resource_loader.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.resource_loader.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.saved_model.builder.-saved-model-builder.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.saved_model.builder.-saved-model-builder.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.saved_model.builder.-saved-model-builder.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.saved_model.builder.-saved-model-builder.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.saved_model.builder.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.saved_model.builder.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.saved_model.builder.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.saved_model.builder.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.saved_model.constants.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.saved_model.constants.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.saved_model.constants.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.saved_model.constants.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.saved_model.loader.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.saved_model.loader.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.saved_model.loader.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.saved_model.loader.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.saved_model.main_op.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.saved_model.main_op.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.saved_model.main_op.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.saved_model.main_op.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.saved_model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.saved_model.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.saved_model.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.saved_model.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.saved_model.signature_constants.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.saved_model.signature_constants.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.saved_model.signature_constants.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.saved_model.signature_constants.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.saved_model.signature_def_utils.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.saved_model.signature_def_utils.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.saved_model.signature_def_utils.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.saved_model.signature_def_utils.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.saved_model.tag_constants.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.saved_model.tag_constants.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.saved_model.tag_constants.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.saved_model.tag_constants.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.saved_model.utils.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.saved_model.utils.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.saved_model.utils.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.saved_model.utils.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.sets.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.sets.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.sets.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.sets.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.sparse.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.sparse.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.spectral.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.spectral.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.spectral.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.spectral.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.strings.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt
similarity index 92%
rename from tensorflow/tools/api/golden/tensorflow.strings.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt
index 9a831fe..018be7b 100644
--- a/tensorflow/tools/api/golden/tensorflow.strings.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt
@@ -5,6 +5,10 @@
     argspec: "args=[\'inputs\', \'separator\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], "
   }
   member_method {
+    name: "length"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
     name: "regex_full_match"
     argspec: "args=[\'input\', \'pattern\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
diff --git a/tensorflow/tools/api/golden/tensorflow.summary.-event.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.summary.-event.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.summary.-event.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.summary.-event.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.summary.-file-writer-cache.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.summary.-file-writer-cache.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.summary.-file-writer-cache.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.summary.-file-writer-cache.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.summary.-file-writer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.summary.-file-writer.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.summary.-file-writer.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.summary.-file-writer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.summary.-session-log.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.summary.-session-log.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.summary.-session-log.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.summary.-session-log.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.summary.-summary-description.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.summary.-summary-description.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.summary.-summary-description.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.summary.-summary-description.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.summary.-summary.-audio.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.summary.-summary.-audio.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.summary.-summary.-audio.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.summary.-summary.-audio.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.summary.-summary.-image.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.summary.-summary.-image.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.summary.-summary.-image.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.summary.-summary.-image.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.summary.-summary.-value.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.summary.-summary.-value.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.summary.-summary.-value.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.summary.-summary.-value.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.summary.-summary.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.summary.-summary.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.summary.-summary.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.summary.-summary.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.summary.-tagged-run-metadata.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.summary.-tagged-run-metadata.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.summary.-tagged-run-metadata.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.summary.-tagged-run-metadata.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.summary.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.summary.pbtxt
similarity index 94%
rename from tensorflow/tools/api/golden/tensorflow.summary.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.summary.pbtxt
index 871ebb5..7ed9cd7 100644
--- a/tensorflow/tools/api/golden/tensorflow.summary.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.summary.pbtxt
@@ -50,7 +50,7 @@
   }
   member_method {
     name: "merge_all"
-    argspec: "args=[\'key\', \'scope\'], varargs=None, keywords=None, defaults=[\'summaries\', \'None\'], "
+    argspec: "args=[\'key\', \'scope\', \'name\'], varargs=None, keywords=None, defaults=[\'summaries\', \'None\', \'None\'], "
   }
   member_method {
     name: "scalar"
diff --git a/tensorflow/tools/api/golden/tensorflow.sysconfig.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.sysconfig.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.sysconfig.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.sysconfig.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.test.-benchmark.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.test.-benchmark.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.test.-benchmark.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.test.-benchmark.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.test.-stub-out-for-testing.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.test.-stub-out-for-testing.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.test.-stub-out-for-testing.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.test.-stub-out-for-testing.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.test.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.test.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.test.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.test.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-adadelta-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-adadelta-optimizer.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-adadelta-optimizer.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-adadelta-optimizer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-adagrad-d-a-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-adagrad-d-a-optimizer.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-adagrad-d-a-optimizer.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-adagrad-d-a-optimizer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-adagrad-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-adagrad-optimizer.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-adagrad-optimizer.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-adagrad-optimizer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-adam-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-adam-optimizer.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-adam-optimizer.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-adam-optimizer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-bytes-list.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-bytes-list.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-bytes-list.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-bytes-list.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-checkpoint-saver-hook.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-checkpoint-saver-hook.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-checkpoint-saver-hook.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-checkpoint-saver-hook.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-checkpoint-saver-listener.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-checkpoint-saver-listener.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-checkpoint-saver-listener.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-checkpoint-saver-listener.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-checkpoint.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-checkpoint.pbtxt
similarity index 84%
rename from tensorflow/tools/api/golden/tensorflow.train.-checkpoint.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-checkpoint.pbtxt
index 2d067e4..5be3720 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-checkpoint.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-checkpoint.pbtxt
@@ -20,4 +20,8 @@
     name: "save"
     argspec: "args=[\'self\', \'file_prefix\', \'session\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "write"
+    argspec: "args=[\'self\', \'file_prefix\', \'session\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-chief-session-creator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-chief-session-creator.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-chief-session-creator.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-chief-session-creator.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-cluster-def.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-cluster-def.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-cluster-def.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-cluster-def.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-cluster-spec.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-cluster-spec.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-cluster-spec.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-cluster-spec.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-coordinator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-coordinator.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-coordinator.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-coordinator.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-example.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-example.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-example.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-example.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-exponential-moving-average.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-exponential-moving-average.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-exponential-moving-average.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-exponential-moving-average.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-feature-list.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-feature-list.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-feature-list.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-feature-list.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-feature-lists.-feature-list-entry.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-feature-lists.-feature-list-entry.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-feature-lists.-feature-list-entry.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-feature-lists.-feature-list-entry.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-feature-lists.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-feature-lists.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-feature-lists.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-feature-lists.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-feature.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-feature.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-feature.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-feature.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-features.-feature-entry.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-features.-feature-entry.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-features.-feature-entry.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-features.-feature-entry.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-features.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-features.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-features.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-features.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-feed-fn-hook.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-feed-fn-hook.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-feed-fn-hook.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-feed-fn-hook.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-final-ops-hook.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-final-ops-hook.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-final-ops-hook.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-final-ops-hook.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-float-list.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-float-list.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-float-list.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-float-list.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-ftrl-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-ftrl-optimizer.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-ftrl-optimizer.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-ftrl-optimizer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-global-step-waiter-hook.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-global-step-waiter-hook.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-global-step-waiter-hook.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-global-step-waiter-hook.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-gradient-descent-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-gradient-descent-optimizer.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-gradient-descent-optimizer.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-gradient-descent-optimizer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-int64-list.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-int64-list.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-int64-list.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-int64-list.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-job-def.-tasks-entry.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-job-def.-tasks-entry.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-job-def.-tasks-entry.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-job-def.-tasks-entry.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-job-def.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-job-def.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-job-def.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-job-def.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-logging-tensor-hook.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-logging-tensor-hook.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-logging-tensor-hook.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-logging-tensor-hook.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-looper-thread.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-looper-thread.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-looper-thread.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-looper-thread.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-momentum-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-momentum-optimizer.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-momentum-optimizer.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-momentum-optimizer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-monitored-session.-step-context.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-monitored-session.-step-context.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-monitored-session.-step-context.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-monitored-session.-step-context.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-monitored-session.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-monitored-session.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-monitored-session.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-monitored-session.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-nan-loss-during-training-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-nan-loss-during-training-error.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-nan-loss-during-training-error.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-nan-loss-during-training-error.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-nan-tensor-hook.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-nan-tensor-hook.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-nan-tensor-hook.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-nan-tensor-hook.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-optimizer.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-optimizer.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-optimizer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-profiler-hook.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-profiler-hook.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-profiler-hook.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-profiler-hook.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-proximal-adagrad-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-proximal-adagrad-optimizer.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-proximal-adagrad-optimizer.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-proximal-adagrad-optimizer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-proximal-gradient-descent-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-proximal-gradient-descent-optimizer.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-proximal-gradient-descent-optimizer.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-proximal-gradient-descent-optimizer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-queue-runner.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-queue-runner.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-queue-runner.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-queue-runner.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-r-m-s-prop-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-r-m-s-prop-optimizer.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-r-m-s-prop-optimizer.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-r-m-s-prop-optimizer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-saver-def.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-saver-def.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-saver-def.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-saver-def.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-saver.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-saver.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-saver.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-saver.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-scaffold.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-scaffold.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-scaffold.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-scaffold.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-second-or-step-timer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-second-or-step-timer.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-second-or-step-timer.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-second-or-step-timer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-sequence-example.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-sequence-example.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-sequence-example.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-sequence-example.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-server-def.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-server-def.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-server-def.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-server-def.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-server.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-server.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-server.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-server.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-session-creator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-session-creator.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-session-creator.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-session-creator.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-session-manager.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-session-manager.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-session-manager.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-session-manager.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-session-run-args.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-session-run-args.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-session-run-args.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-session-run-args.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-session-run-context.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-session-run-context.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-session-run-context.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-session-run-context.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-session-run-hook.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-session-run-hook.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-session-run-hook.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-session-run-hook.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-session-run-values.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-session-run-values.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-session-run-values.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-session-run-values.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-singular-monitored-session.-step-context.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-singular-monitored-session.-step-context.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-singular-monitored-session.-step-context.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-singular-monitored-session.-step-context.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-singular-monitored-session.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-singular-monitored-session.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-singular-monitored-session.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-singular-monitored-session.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-step-counter-hook.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-step-counter-hook.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-step-counter-hook.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-step-counter-hook.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-stop-at-step-hook.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-stop-at-step-hook.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-stop-at-step-hook.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-stop-at-step-hook.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-summary-saver-hook.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-summary-saver-hook.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-summary-saver-hook.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-summary-saver-hook.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-supervisor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-supervisor.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-supervisor.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-supervisor.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-sync-replicas-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-sync-replicas-optimizer.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-sync-replicas-optimizer.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-sync-replicas-optimizer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-vocab-info.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-vocab-info.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-vocab-info.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-vocab-info.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-worker-session-creator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-worker-session-creator.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.-worker-session-creator.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.-worker-session-creator.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.pbtxt
similarity index 97%
rename from tensorflow/tools/api/golden/tensorflow.train.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.pbtxt
index b0fb04d..9f35395 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.pbtxt
@@ -298,7 +298,7 @@
   }
   member_method {
     name: "generate_checkpoint_state_proto"
-    argspec: "args=[\'save_dir\', \'model_checkpoint_path\', \'all_model_checkpoint_paths\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'save_dir\', \'model_checkpoint_path\', \'all_model_checkpoint_paths\', \'all_model_checkpoint_timestamps\', \'last_preserved_timestamp\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "get_checkpoint_mtimes"
@@ -446,7 +446,7 @@
   }
   member_method {
     name: "update_checkpoint_state"
-    argspec: "args=[\'save_dir\', \'model_checkpoint_path\', \'all_model_checkpoint_paths\', \'latest_filename\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'save_dir\', \'model_checkpoint_path\', \'all_model_checkpoint_paths\', \'latest_filename\', \'all_model_checkpoint_timestamps\', \'last_preserved_timestamp\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "warm_start"
diff --git a/tensorflow/tools/api/golden/tensorflow.train.queue_runner.-queue-runner.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.queue_runner.-queue-runner.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.queue_runner.-queue-runner.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.queue_runner.-queue-runner.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.queue_runner.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.queue_runner.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.train.queue_runner.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.train.queue_runner.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.truncated_normal_initializer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.truncated_normal_initializer.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.truncated_normal_initializer.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.truncated_normal_initializer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.uniform_unit_scaling_initializer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.uniform_unit_scaling_initializer.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.uniform_unit_scaling_initializer.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.uniform_unit_scaling_initializer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.variable_scope.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.variable_scope.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.variable_scope.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.variable_scope.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.variance_scaling_initializer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.variance_scaling_initializer.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.variance_scaling_initializer.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.variance_scaling_initializer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.zeros_initializer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.zeros_initializer.pbtxt
similarity index 100%
rename from tensorflow/tools/api/golden/tensorflow.zeros_initializer.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.zeros_initializer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-aggregation-method.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-aggregation-method.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-aggregation-method.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-aggregation-method.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-attr-value.-list-value.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-attr-value.-list-value.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-attr-value.-list-value.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-attr-value.-list-value.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-attr-value.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-attr-value.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-attr-value.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-attr-value.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-conditional-accumulator-base.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-conditional-accumulator-base.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-conditional-accumulator-base.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-conditional-accumulator-base.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-conditional-accumulator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-conditional-accumulator.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-conditional-accumulator.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-conditional-accumulator.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-config-proto.-device-count-entry.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-config-proto.-device-count-entry.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-config-proto.-device-count-entry.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-config-proto.-device-count-entry.pbtxt
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-config-proto.-experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-config-proto.-experimental.pbtxt
new file mode 100644
index 0000000..eb41dee
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-config-proto.-experimental.pbtxt
@@ -0,0 +1,24 @@
+path: "tensorflow.ConfigProto.Experimental"
+tf_proto {
+  descriptor {
+    name: "Experimental"
+    field {
+      name: "collective_group_leader"
+      number: 1
+      label: LABEL_OPTIONAL
+      type: TYPE_STRING
+    }
+    field {
+      name: "client_handles_error_formatting"
+      number: 2
+      label: LABEL_OPTIONAL
+      type: TYPE_BOOL
+    }
+    field {
+      name: "executor_type"
+      number: 3
+      label: LABEL_OPTIONAL
+      type: TYPE_STRING
+    }
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-config-proto.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-config-proto.pbtxt
new file mode 100644
index 0000000..e565b90
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-config-proto.pbtxt
@@ -0,0 +1,148 @@
+path: "tensorflow.ConfigProto"
+tf_proto {
+  descriptor {
+    name: "ConfigProto"
+    field {
+      name: "device_count"
+      number: 1
+      label: LABEL_REPEATED
+      type: TYPE_MESSAGE
+      type_name: ".tensorflow.ConfigProto.DeviceCountEntry"
+    }
+    field {
+      name: "intra_op_parallelism_threads"
+      number: 2
+      label: LABEL_OPTIONAL
+      type: TYPE_INT32
+    }
+    field {
+      name: "inter_op_parallelism_threads"
+      number: 5
+      label: LABEL_OPTIONAL
+      type: TYPE_INT32
+    }
+    field {
+      name: "use_per_session_threads"
+      number: 9
+      label: LABEL_OPTIONAL
+      type: TYPE_BOOL
+    }
+    field {
+      name: "session_inter_op_thread_pool"
+      number: 12
+      label: LABEL_REPEATED
+      type: TYPE_MESSAGE
+      type_name: ".tensorflow.ThreadPoolOptionProto"
+    }
+    field {
+      name: "placement_period"
+      number: 3
+      label: LABEL_OPTIONAL
+      type: TYPE_INT32
+    }
+    field {
+      name: "device_filters"
+      number: 4
+      label: LABEL_REPEATED
+      type: TYPE_STRING
+    }
+    field {
+      name: "gpu_options"
+      number: 6
+      label: LABEL_OPTIONAL
+      type: TYPE_MESSAGE
+      type_name: ".tensorflow.GPUOptions"
+    }
+    field {
+      name: "allow_soft_placement"
+      number: 7
+      label: LABEL_OPTIONAL
+      type: TYPE_BOOL
+    }
+    field {
+      name: "log_device_placement"
+      number: 8
+      label: LABEL_OPTIONAL
+      type: TYPE_BOOL
+    }
+    field {
+      name: "graph_options"
+      number: 10
+      label: LABEL_OPTIONAL
+      type: TYPE_MESSAGE
+      type_name: ".tensorflow.GraphOptions"
+    }
+    field {
+      name: "operation_timeout_in_ms"
+      number: 11
+      label: LABEL_OPTIONAL
+      type: TYPE_INT64
+    }
+    field {
+      name: "rpc_options"
+      number: 13
+      label: LABEL_OPTIONAL
+      type: TYPE_MESSAGE
+      type_name: ".tensorflow.RPCOptions"
+    }
+    field {
+      name: "cluster_def"
+      number: 14
+      label: LABEL_OPTIONAL
+      type: TYPE_MESSAGE
+      type_name: ".tensorflow.ClusterDef"
+    }
+    field {
+      name: "isolate_session_state"
+      number: 15
+      label: LABEL_OPTIONAL
+      type: TYPE_BOOL
+    }
+    field {
+      name: "experimental"
+      number: 16
+      label: LABEL_OPTIONAL
+      type: TYPE_MESSAGE
+      type_name: ".tensorflow.ConfigProto.Experimental"
+    }
+    nested_type {
+      name: "DeviceCountEntry"
+      field {
+        name: "key"
+        number: 1
+        label: LABEL_OPTIONAL
+        type: TYPE_STRING
+      }
+      field {
+        name: "value"
+        number: 2
+        label: LABEL_OPTIONAL
+        type: TYPE_INT32
+      }
+      options {
+        map_entry: true
+      }
+    }
+    nested_type {
+      name: "Experimental"
+      field {
+        name: "collective_group_leader"
+        number: 1
+        label: LABEL_OPTIONAL
+        type: TYPE_STRING
+      }
+      field {
+        name: "client_handles_error_formatting"
+        number: 2
+        label: LABEL_OPTIONAL
+        type: TYPE_BOOL
+      }
+      field {
+        name: "executor_type"
+        number: 3
+        label: LABEL_OPTIONAL
+        type: TYPE_STRING
+      }
+    }
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-d-type.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-d-type.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-d-type.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-d-type.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-device-spec.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-device-spec.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-device-spec.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-device-spec.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-dimension.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-dimension.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-dimension.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-dimension.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-event.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-event.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-event.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-event.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-f-i-f-o-queue.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-f-i-f-o-queue.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-f-i-f-o-queue.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-f-i-f-o-queue.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-fixed-len-feature.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-fixed-len-feature.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-fixed-len-feature.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-fixed-len-feature.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-fixed-len-sequence-feature.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-fixed-len-sequence-feature.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-fixed-len-sequence-feature.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-fixed-len-sequence-feature.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-fixed-length-record-reader.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-fixed-length-record-reader.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-fixed-length-record-reader.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-fixed-length-record-reader.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-g-p-u-options.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-g-p-u-options.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-g-p-u-options.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-g-p-u-options.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-gradient-tape.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-gradient-tape.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-gradient-tape.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-gradient-tape.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-graph-def.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-graph-def.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-graph-def.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-graph-def.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-graph-keys.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-graph-keys.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-graph-keys.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-graph-keys.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-graph-options.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-graph-options.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-graph-options.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-graph-options.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-graph.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-graph.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-graph.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-graph.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-histogram-proto.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-histogram-proto.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-histogram-proto.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-histogram-proto.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-identity-reader.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-identity-reader.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-identity-reader.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-identity-reader.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-indexed-slices.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-indexed-slices.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-indexed-slices.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-indexed-slices.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-interactive-session.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-interactive-session.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-interactive-session.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-interactive-session.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-l-m-d-b-reader.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-l-m-d-b-reader.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-l-m-d-b-reader.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-l-m-d-b-reader.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-log-message.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-log-message.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-log-message.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-log-message.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-meta-graph-def.-collection-def-entry.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-meta-graph-def.-collection-def-entry.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-meta-graph-def.-collection-def-entry.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-meta-graph-def.-collection-def-entry.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-meta-graph-def.-meta-info-def.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-meta-graph-def.-meta-info-def.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-meta-graph-def.-meta-info-def.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-meta-graph-def.-meta-info-def.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-meta-graph-def.-signature-def-entry.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-meta-graph-def.-signature-def-entry.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-meta-graph-def.-signature-def-entry.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-meta-graph-def.-signature-def-entry.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-meta-graph-def.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-meta-graph-def.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-meta-graph-def.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-meta-graph-def.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-name-attr-list.-attr-entry.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-name-attr-list.-attr-entry.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-name-attr-list.-attr-entry.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-name-attr-list.-attr-entry.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-name-attr-list.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-name-attr-list.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-name-attr-list.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-name-attr-list.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-node-def.-attr-entry.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-node-def.-attr-entry.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-node-def.-attr-entry.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-node-def.-attr-entry.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-node-def.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-node-def.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-node-def.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-node-def.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-op-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-op-error.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-op-error.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-op-error.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-operation.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-operation.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-operation.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-operation.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-optimizer-options.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-optimizer-options.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-optimizer-options.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-optimizer-options.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-padding-f-i-f-o-queue.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-padding-f-i-f-o-queue.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-padding-f-i-f-o-queue.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-padding-f-i-f-o-queue.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-priority-queue.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-priority-queue.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-priority-queue.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-priority-queue.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-queue-base.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-queue-base.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-queue-base.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-queue-base.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-random-shuffle-queue.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-random-shuffle-queue.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-random-shuffle-queue.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-random-shuffle-queue.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-reader-base.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-reader-base.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-reader-base.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-reader-base.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-register-gradient.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-register-gradient.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-register-gradient.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-register-gradient.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-run-metadata.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-run-metadata.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-run-metadata.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-run-metadata.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-run-options.-experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-run-options.-experimental.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-run-options.-experimental.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-run-options.-experimental.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-run-options.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-run-options.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-run-options.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-run-options.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-session-log.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-session-log.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-session-log.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-session-log.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-session.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-session.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-session.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-session.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-sparse-conditional-accumulator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-sparse-conditional-accumulator.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-sparse-conditional-accumulator.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-sparse-conditional-accumulator.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-sparse-feature.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-sparse-feature.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-sparse-feature.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-sparse-feature.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-sparse-tensor-value.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-sparse-tensor-value.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-sparse-tensor-value.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-sparse-tensor-value.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-sparse-tensor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-sparse-tensor.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-sparse-tensor.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-sparse-tensor.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-summary-metadata.-plugin-data.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-summary-metadata.-plugin-data.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-summary-metadata.-plugin-data.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-summary-metadata.-plugin-data.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-summary-metadata.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-summary-metadata.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-summary-metadata.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-summary-metadata.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-summary.-audio.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-summary.-audio.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-summary.-audio.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-summary.-audio.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-summary.-image.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-summary.-image.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-summary.-image.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-summary.-image.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-summary.-value.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-summary.-value.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-summary.-value.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-summary.-value.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-summary.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-summary.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-summary.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-summary.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-t-f-record-reader.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-t-f-record-reader.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-t-f-record-reader.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-t-f-record-reader.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-tensor-array.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-tensor-array.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-tensor-array.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-tensor-array.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-tensor-info.-coo-sparse.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-tensor-info.-coo-sparse.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-tensor-info.-coo-sparse.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-tensor-info.-coo-sparse.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-tensor-info.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-tensor-info.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-tensor-info.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-tensor-info.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-tensor-shape.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-tensor-shape.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-tensor-shape.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-tensor-shape.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-tensor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-tensor.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-tensor.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-tensor.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-text-line-reader.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-text-line-reader.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-text-line-reader.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-text-line-reader.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-var-len-feature.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-var-len-feature.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-var-len-feature.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-var-len-feature.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-variable-aggregation.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-variable-aggregation.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-variable-aggregation.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-variable-aggregation.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-variable-scope.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-variable-scope.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-variable-scope.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-variable-scope.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-variable-synchronization.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-variable-synchronization.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-variable-synchronization.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-variable-synchronization.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-variable.-save-slice-info.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-variable.-save-slice-info.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-variable.-save-slice-info.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-variable.-save-slice-info.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-variable.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-variable.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-variable.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-variable.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.-whole-file-reader.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-whole-file-reader.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.-whole-file-reader.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.-whole-file-reader.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.app.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.app.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.app.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.app.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.bitwise.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.bitwise.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.bitwise.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.bitwise.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.compat.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.compat.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.compat.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.compat.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.constant_initializer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.constant_initializer.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.constant_initializer.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.constant_initializer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.data.-dataset.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.__metaclass__.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.data.-dataset.__metaclass__.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.__metaclass__.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.data.-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.data.-dataset.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.data.-fixed-length-record-dataset.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.__metaclass__.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.data.-fixed-length-record-dataset.__metaclass__.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.__metaclass__.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.data.-fixed-length-record-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.data.-fixed-length-record-dataset.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-iterator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-iterator.pbtxt
new file mode 100644
index 0000000..4f0147a
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-iterator.pbtxt
@@ -0,0 +1,46 @@
+path: "tensorflow.data.Iterator"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.ops.iterator_ops.Iterator\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "initializer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_classes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shapes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_types"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'iterator_resource\', \'initializer\', \'output_types\', \'output_shapes\', \'output_classes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_string_handle"
+    argspec: "args=[\'string_handle\', \'output_types\', \'output_shapes\', \'output_classes\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "from_structure"
+    argspec: "args=[\'output_types\', \'output_shapes\', \'shared_name\', \'output_classes\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "get_next"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "make_initializer"
+    argspec: "args=[\'self\', \'dataset\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "string_handle"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.data.-t-f-record-dataset.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.__metaclass__.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.data.-t-f-record-dataset.__metaclass__.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.__metaclass__.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.data.-t-f-record-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.data.-t-f-record-dataset.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.data.-text-line-dataset.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.__metaclass__.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.data.-text-line-dataset.__metaclass__.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.__metaclass__.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.data.-text-line-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.data.-text-line-dataset.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.data.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.data.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.data.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.debugging.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.debugging.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.debugging.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.debugging.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-bernoulli.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-bernoulli.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.distributions.-bernoulli.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.distributions.-bernoulli.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-beta.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-beta.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.distributions.-beta.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.distributions.-beta.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-categorical.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-categorical.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.distributions.-categorical.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.distributions.-categorical.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-dirichlet-multinomial.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-dirichlet-multinomial.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.distributions.-dirichlet-multinomial.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.distributions.-dirichlet-multinomial.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-dirichlet.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-dirichlet.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.distributions.-dirichlet.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.distributions.-dirichlet.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-distribution.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-distribution.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.distributions.-distribution.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.distributions.-distribution.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-exponential.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-exponential.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.distributions.-exponential.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.distributions.-exponential.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-gamma.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-gamma.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.distributions.-gamma.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.distributions.-gamma.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-laplace.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-laplace.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.distributions.-laplace.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.distributions.-laplace.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-multinomial.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-multinomial.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.distributions.-multinomial.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.distributions.-multinomial.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-normal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-normal.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.distributions.-normal.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.distributions.-normal.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-register-k-l.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-register-k-l.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.distributions.-register-k-l.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.distributions.-register-k-l.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-reparameterization-type.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-reparameterization-type.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.distributions.-reparameterization-type.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.distributions.-reparameterization-type.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-student-t.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-student-t.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.distributions.-student-t.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.distributions.-student-t.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-uniform.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-uniform.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.distributions.-uniform.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.distributions.-uniform.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.distributions.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.distributions.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.dtypes.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.dtypes.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.dtypes.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.dtypes.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-aborted-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.errors.-aborted-error.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.errors.-aborted-error.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.errors.-aborted-error.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-already-exists-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.errors.-already-exists-error.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.errors.-already-exists-error.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.errors.-already-exists-error.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-cancelled-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.errors.-cancelled-error.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.errors.-cancelled-error.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.errors.-cancelled-error.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-data-loss-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.errors.-data-loss-error.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.errors.-data-loss-error.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.errors.-data-loss-error.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-deadline-exceeded-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.errors.-deadline-exceeded-error.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.errors.-deadline-exceeded-error.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.errors.-deadline-exceeded-error.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-failed-precondition-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.errors.-failed-precondition-error.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.errors.-failed-precondition-error.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.errors.-failed-precondition-error.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-internal-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.errors.-internal-error.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.errors.-internal-error.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.errors.-internal-error.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-invalid-argument-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.errors.-invalid-argument-error.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.errors.-invalid-argument-error.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.errors.-invalid-argument-error.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-not-found-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.errors.-not-found-error.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.errors.-not-found-error.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.errors.-not-found-error.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-op-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.errors.-op-error.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.errors.-op-error.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.errors.-op-error.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-out-of-range-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.errors.-out-of-range-error.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.errors.-out-of-range-error.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.errors.-out-of-range-error.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-permission-denied-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.errors.-permission-denied-error.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.errors.-permission-denied-error.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.errors.-permission-denied-error.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-resource-exhausted-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.errors.-resource-exhausted-error.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.errors.-resource-exhausted-error.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.errors.-resource-exhausted-error.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-unauthenticated-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.errors.-unauthenticated-error.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.errors.-unauthenticated-error.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.errors.-unauthenticated-error.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-unavailable-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.errors.-unavailable-error.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.errors.-unavailable-error.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.errors.-unavailable-error.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-unimplemented-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.errors.-unimplemented-error.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.errors.-unimplemented-error.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.errors.-unimplemented-error.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-unknown-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.errors.-unknown-error.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.errors.-unknown-error.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.errors.-unknown-error.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.errors.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.errors.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.errors.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.raise_exception_on_not_ok_status.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.errors.raise_exception_on_not_ok_status.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.errors.raise_exception_on_not_ok_status.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.errors.raise_exception_on_not_ok_status.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-classifier.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-classifier.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.estimator.-baseline-classifier.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-classifier.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-regressor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-regressor.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.estimator.-baseline-regressor.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-regressor.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-best-exporter.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-best-exporter.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.estimator.-best-exporter.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.estimator.-best-exporter.pbtxt
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-classifier.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-classifier.pbtxt
new file mode 100644
index 0000000..c23b04b
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-classifier.pbtxt
@@ -0,0 +1,58 @@
+path: "tensorflow.estimator.BoostedTreesClassifier"
+tf_class {
+  is_instance: "<class \'tensorflow.python.estimator.canned.boosted_trees.BoostedTreesClassifier\'>"
+  is_instance: "<class \'tensorflow.python.estimator.estimator.Estimator\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "config"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_dir"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_fn"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "params"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\', \'center_bias\', \'pruning_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'False\', \'none\'], "
+  }
+  member_method {
+    name: "eval_dir"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "export_savedmodel"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
+  }
+  member_method {
+    name: "get_variable_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_variable_value"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "latest_checkpoint"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "predict"
+    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\', \'yield_single_examples\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "train"
+    argspec: "args=[\'self\', \'input_fn\', \'hooks\', \'steps\', \'max_steps\', \'saving_listeners\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-regressor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-regressor.pbtxt
new file mode 100644
index 0000000..6878d28
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-regressor.pbtxt
@@ -0,0 +1,58 @@
+path: "tensorflow.estimator.BoostedTreesRegressor"
+tf_class {
+  is_instance: "<class \'tensorflow.python.estimator.canned.boosted_trees.BoostedTreesRegressor\'>"
+  is_instance: "<class \'tensorflow.python.estimator.estimator.Estimator\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "config"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_dir"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_fn"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "params"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'label_dimension\', \'weight_column\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\', \'center_bias\', \'pruning_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'False\', \'none\'], "
+  }
+  member_method {
+    name: "eval_dir"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "export_savedmodel"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
+  }
+  member_method {
+    name: "get_variable_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_variable_value"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "latest_checkpoint"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "predict"
+    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\', \'yield_single_examples\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "train"
+    argspec: "args=[\'self\', \'input_fn\', \'hooks\', \'steps\', \'max_steps\', \'saving_listeners\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-classifier.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-classifier.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-classifier.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-classifier.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-regressor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-regressor.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-regressor.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-regressor.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-estimator-spec.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-estimator-spec.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.estimator.-estimator-spec.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.estimator.-estimator-spec.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-estimator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-estimator.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.estimator.-estimator.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.estimator.-estimator.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-eval-spec.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-eval-spec.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.estimator.-eval-spec.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.estimator.-eval-spec.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-exporter.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-exporter.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.estimator.-exporter.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.estimator.-exporter.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-final-exporter.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-final-exporter.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.estimator.-final-exporter.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.estimator.-final-exporter.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-latest-exporter.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-latest-exporter.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.estimator.-latest-exporter.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.estimator.-latest-exporter.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-linear-classifier.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-classifier.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.estimator.-linear-classifier.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-classifier.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-linear-regressor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-regressor.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.estimator.-linear-regressor.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-regressor.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-mode-keys.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-mode-keys.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.estimator.-mode-keys.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.estimator.-mode-keys.pbtxt
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-run-config.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-run-config.pbtxt
new file mode 100644
index 0000000..bf1f94b
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-run-config.pbtxt
@@ -0,0 +1,105 @@
+path: "tensorflow.estimator.RunConfig"
+tf_class {
+  is_instance: "<class \'tensorflow.python.estimator.run_config.RunConfig\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "cluster_spec"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "device_fn"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "eval_distribute"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "evaluation_master"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "global_id_in_cluster"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_chief"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "keep_checkpoint_every_n_hours"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "keep_checkpoint_max"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "log_step_count_steps"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "master"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_dir"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_ps_replicas"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_worker_replicas"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "protocol"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "save_checkpoints_secs"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "save_checkpoints_steps"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "save_summary_steps"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "service"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "session_config"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "task_id"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "task_type"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "tf_random_seed"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "train_distribute"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'model_dir\', \'tf_random_seed\', \'save_summary_steps\', \'save_checkpoints_steps\', \'save_checkpoints_secs\', \'session_config\', \'keep_checkpoint_max\', \'keep_checkpoint_every_n_hours\', \'log_step_count_steps\', \'train_distribute\', \'device_fn\', \'protocol\', \'eval_distribute\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'100\', \'<object object instance>\', \'<object object instance>\', \'None\', \'5\', \'10000\', \'100\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "replace"
+    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-train-spec.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-train-spec.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.estimator.-train-spec.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.estimator.-train-spec.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-vocab-info.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-vocab-info.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.estimator.-vocab-info.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.estimator.-vocab-info.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-warm-start-settings.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-warm-start-settings.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.estimator.-warm-start-settings.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.estimator.-warm-start-settings.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.export.-classification-output.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-classification-output.__metaclass__.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.estimator.export.-classification-output.__metaclass__.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-classification-output.__metaclass__.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.export.-classification-output.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-classification-output.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.estimator.export.-classification-output.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-classification-output.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.export.-export-output.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-export-output.__metaclass__.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.estimator.export.-export-output.__metaclass__.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-export-output.__metaclass__.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.export.-export-output.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-export-output.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.estimator.export.-export-output.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-export-output.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.export.-predict-output.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-predict-output.__metaclass__.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.estimator.export.-predict-output.__metaclass__.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-predict-output.__metaclass__.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.export.-predict-output.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-predict-output.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.estimator.export.-predict-output.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-predict-output.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.export.-regression-output.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-regression-output.__metaclass__.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.estimator.export.-regression-output.__metaclass__.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-regression-output.__metaclass__.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.export.-regression-output.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-regression-output.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.estimator.export.-regression-output.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-regression-output.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.export.-serving-input-receiver.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-serving-input-receiver.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.estimator.export.-serving-input-receiver.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-serving-input-receiver.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.export.-tensor-serving-input-receiver.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-tensor-serving-input-receiver.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.estimator.export.-tensor-serving-input-receiver.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-tensor-serving-input-receiver.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.export.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.estimator.export.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.estimator.export.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.inputs.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.inputs.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.estimator.inputs.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.estimator.inputs.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.estimator.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.estimator.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.feature_column.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.feature_column.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.feature_column.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.feature_column.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.gfile.-fast-g-file.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.gfile.-fast-g-file.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.gfile.-fast-g-file.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.gfile.-fast-g-file.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.gfile.-g-file.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.gfile.-g-file.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.gfile.-g-file.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.gfile.-g-file.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.gfile.-open.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.gfile.-open.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.gfile.-open.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.gfile.-open.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.gfile.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.gfile.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.gfile.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.gfile.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.graph_util.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.graph_util.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.graph_util.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.graph_util.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.image.-resize-method.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.image.-resize-method.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.image.-resize-method.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.image.-resize-method.pbtxt
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.image.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.image.pbtxt
new file mode 100644
index 0000000..5c46dc5
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.image.pbtxt
@@ -0,0 +1,251 @@
+path: "tensorflow.image"
+tf_module {
+  member {
+    name: "ResizeMethod"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "adjust_brightness"
+    argspec: "args=[\'image\', \'delta\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "adjust_contrast"
+    argspec: "args=[\'images\', \'contrast_factor\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "adjust_gamma"
+    argspec: "args=[\'image\', \'gamma\', \'gain\'], varargs=None, keywords=None, defaults=[\'1\', \'1\'], "
+  }
+  member_method {
+    name: "adjust_hue"
+    argspec: "args=[\'image\', \'delta\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "adjust_jpeg_quality"
+    argspec: "args=[\'image\', \'jpeg_quality\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "adjust_saturation"
+    argspec: "args=[\'image\', \'saturation_factor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "central_crop"
+    argspec: "args=[\'image\', \'central_fraction\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "convert_image_dtype"
+    argspec: "args=[\'image\', \'dtype\', \'saturate\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "crop_and_resize"
+    argspec: "args=[\'image\', \'boxes\', \'box_ind\', \'crop_size\', \'method\', \'extrapolation_value\', \'name\'], varargs=None, keywords=None, defaults=[\'bilinear\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "crop_to_bounding_box"
+    argspec: "args=[\'image\', \'offset_height\', \'offset_width\', \'target_height\', \'target_width\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "decode_and_crop_jpeg"
+    argspec: "args=[\'contents\', \'crop_window\', \'channels\', \'ratio\', \'fancy_upscaling\', \'try_recover_truncated\', \'acceptable_fraction\', \'dct_method\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'1\', \'True\', \'False\', \'1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "decode_bmp"
+    argspec: "args=[\'contents\', \'channels\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'None\'], "
+  }
+  member_method {
+    name: "decode_gif"
+    argspec: "args=[\'contents\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "decode_image"
+    argspec: "args=[\'contents\', \'channels\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'uint8\'>\", \'None\'], "
+  }
+  member_method {
+    name: "decode_jpeg"
+    argspec: "args=[\'contents\', \'channels\', \'ratio\', \'fancy_upscaling\', \'try_recover_truncated\', \'acceptable_fraction\', \'dct_method\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'1\', \'True\', \'False\', \'1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "decode_png"
+    argspec: "args=[\'contents\', \'channels\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \"<dtype: \'uint8\'>\", \'None\'], "
+  }
+  member_method {
+    name: "draw_bounding_boxes"
+    argspec: "args=[\'images\', \'boxes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "encode_jpeg"
+    argspec: "args=[\'image\', \'format\', \'quality\', \'progressive\', \'optimize_size\', \'chroma_downsampling\', \'density_unit\', \'x_density\', \'y_density\', \'xmp_metadata\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'95\', \'False\', \'False\', \'True\', \'in\', \'300\', \'300\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "encode_png"
+    argspec: "args=[\'image\', \'compression\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+  }
+  member_method {
+    name: "extract_glimpse"
+    argspec: "args=[\'input\', \'size\', \'offsets\', \'centered\', \'normalized\', \'uniform_noise\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'True\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "extract_image_patches"
+    argspec: "args=[\'images\', \'ksizes\', \'strides\', \'rates\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "extract_jpeg_shape"
+    argspec: "args=[\'contents\', \'output_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "flip_left_right"
+    argspec: "args=[\'image\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "flip_up_down"
+    argspec: "args=[\'image\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "grayscale_to_rgb"
+    argspec: "args=[\'images\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "hsv_to_rgb"
+    argspec: "args=[\'images\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "image_gradients"
+    argspec: "args=[\'image\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_jpeg"
+    argspec: "args=[\'contents\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "non_max_suppression"
+    argspec: "args=[\'boxes\', \'scores\', \'max_output_size\', \'iou_threshold\', \'score_threshold\', \'name\'], varargs=None, keywords=None, defaults=[\'0.5\', \'-inf\', \'None\'], "
+  }
+  member_method {
+    name: "non_max_suppression_overlaps"
+    argspec: "args=[\'overlaps\', \'scores\', \'max_output_size\', \'overlap_threshold\', \'score_threshold\', \'name\'], varargs=None, keywords=None, defaults=[\'0.5\', \'-inf\', \'None\'], "
+  }
+  member_method {
+    name: "non_max_suppression_padded"
+    argspec: "args=[\'boxes\', \'scores\', \'max_output_size\', \'iou_threshold\', \'score_threshold\', \'pad_to_max_output_size\', \'name\'], varargs=None, keywords=None, defaults=[\'0.5\', \'-inf\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "pad_to_bounding_box"
+    argspec: "args=[\'image\', \'offset_height\', \'offset_width\', \'target_height\', \'target_width\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "per_image_standardization"
+    argspec: "args=[\'image\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "psnr"
+    argspec: "args=[\'a\', \'b\', \'max_val\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "random_brightness"
+    argspec: "args=[\'image\', \'max_delta\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "random_contrast"
+    argspec: "args=[\'image\', \'lower\', \'upper\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "random_flip_left_right"
+    argspec: "args=[\'image\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "random_flip_up_down"
+    argspec: "args=[\'image\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "random_hue"
+    argspec: "args=[\'image\', \'max_delta\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "random_jpeg_quality"
+    argspec: "args=[\'image\', \'min_jpeg_quality\', \'max_jpeg_quality\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "random_saturation"
+    argspec: "args=[\'image\', \'lower\', \'upper\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "resize_area"
+    argspec: "args=[\'images\', \'size\', \'align_corners\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "resize_bicubic"
+    argspec: "args=[\'images\', \'size\', \'align_corners\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "resize_bilinear"
+    argspec: "args=[\'images\', \'size\', \'align_corners\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "resize_image_with_crop_or_pad"
+    argspec: "args=[\'image\', \'target_height\', \'target_width\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "resize_image_with_pad"
+    argspec: "args=[\'image\', \'target_height\', \'target_width\', \'method\'], varargs=None, keywords=None, defaults=[\'0\'], "
+  }
+  member_method {
+    name: "resize_images"
+    argspec: "args=[\'images\', \'size\', \'method\', \'align_corners\', \'preserve_aspect_ratio\'], varargs=None, keywords=None, defaults=[\'0\', \'False\', \'False\'], "
+  }
+  member_method {
+    name: "resize_nearest_neighbor"
+    argspec: "args=[\'images\', \'size\', \'align_corners\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "rgb_to_grayscale"
+    argspec: "args=[\'images\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "rgb_to_hsv"
+    argspec: "args=[\'images\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "rgb_to_yiq"
+    argspec: "args=[\'images\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "rgb_to_yuv"
+    argspec: "args=[\'images\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "rot90"
+    argspec: "args=[\'image\', \'k\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
+  }
+  member_method {
+    name: "sample_distorted_bounding_box"
+    argspec: "args=[\'image_size\', \'bounding_boxes\', \'seed\', \'seed2\', \'min_object_covered\', \'aspect_ratio_range\', \'area_range\', \'max_attempts\', \'use_image_if_no_bounding_boxes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'0.1\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "sobel_edges"
+    argspec: "args=[\'image\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ssim"
+    argspec: "args=[\'img1\', \'img2\', \'max_val\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ssim_multiscale"
+    argspec: "args=[\'img1\', \'img2\', \'max_val\', \'power_factors\'], varargs=None, keywords=None, defaults=[\'(0.0448, 0.2856, 0.3001, 0.2363, 0.1333)\'], "
+  }
+  member_method {
+    name: "total_variation"
+    argspec: "args=[\'images\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "transpose_image"
+    argspec: "args=[\'image\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "yiq_to_rgb"
+    argspec: "args=[\'images\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "yuv_to_rgb"
+    argspec: "args=[\'images\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.initializers.constant.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.constant.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.initializers.constant.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.initializers.constant.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.initializers.identity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.identity.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.initializers.identity.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.initializers.identity.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.initializers.ones.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.ones.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.initializers.ones.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.initializers.ones.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.initializers.orthogonal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.orthogonal.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.initializers.orthogonal.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.initializers.orthogonal.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.initializers.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.initializers.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.initializers.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.initializers.random_normal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.random_normal.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.initializers.random_normal.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.initializers.random_normal.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.initializers.random_uniform.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.random_uniform.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.initializers.random_uniform.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.initializers.random_uniform.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.initializers.truncated_normal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.truncated_normal.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.initializers.truncated_normal.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.initializers.truncated_normal.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.initializers.uniform_unit_scaling.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.uniform_unit_scaling.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.initializers.uniform_unit_scaling.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.initializers.uniform_unit_scaling.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.initializers.variance_scaling.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.variance_scaling.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.initializers.variance_scaling.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.initializers.variance_scaling.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.initializers.zeros.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.zeros.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.initializers.zeros.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.initializers.zeros.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.io.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.io.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.io.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.io.pbtxt
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt
new file mode 100644
index 0000000..e579fe6
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt
@@ -0,0 +1,268 @@
+path: "tensorflow.keras.Model"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_spec"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "layers"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "state_updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "stateful"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "uses_learning_phase"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\', \'training\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "compile"
+    argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\', \'distribute\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "evaluate_generator"
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+  }
+  member_method {
+    name: "fit"
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "fit_generator"
+    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_layer"
+    argspec: "args=[\'self\', \'name\', \'index\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "load_weights"
+    argspec: "args=[\'self\', \'filepath\', \'by_name\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+  member_method {
+    name: "predict"
+    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "predict_generator"
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+  }
+  member_method {
+    name: "predict_on_batch"
+    argspec: "args=[\'self\', \'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "save"
+    argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'include_optimizer\'], varargs=None, keywords=None, defaults=[\'True\', \'True\'], "
+  }
+  member_method {
+    name: "save_weights"
+    argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "summary"
+    argspec: "args=[\'self\', \'line_length\', \'positions\', \'print_fn\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "test_on_batch"
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "to_json"
+    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "to_yaml"
+    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "train_on_batch"
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt
new file mode 100644
index 0000000..97688fc
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt
@@ -0,0 +1,285 @@
+path: "tensorflow.keras.Sequential"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.engine.sequential.Sequential\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_spec"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "layers"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "state_updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "stateful"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "uses_learning_phase"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'layers\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add"
+    argspec: "args=[\'self\', \'layer\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\', \'training\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "compile"
+    argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\', \'distribute\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "evaluate_generator"
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+  }
+  member_method {
+    name: "fit"
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "fit_generator"
+    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_layer"
+    argspec: "args=[\'self\', \'name\', \'index\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "load_weights"
+    argspec: "args=[\'self\', \'filepath\', \'by_name\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+  member_method {
+    name: "pop"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "predict"
+    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "predict_classes"
+    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\'], varargs=None, keywords=None, defaults=[\'32\', \'0\'], "
+  }
+  member_method {
+    name: "predict_generator"
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+  }
+  member_method {
+    name: "predict_on_batch"
+    argspec: "args=[\'self\', \'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "predict_proba"
+    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\'], varargs=None, keywords=None, defaults=[\'32\', \'0\'], "
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "save"
+    argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'include_optimizer\'], varargs=None, keywords=None, defaults=[\'True\', \'True\'], "
+  }
+  member_method {
+    name: "save_weights"
+    argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "summary"
+    argspec: "args=[\'self\', \'line_length\', \'positions\', \'print_fn\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "test_on_batch"
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "to_json"
+    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "to_yaml"
+    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "train_on_batch"
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.activations.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.activations.pbtxt
new file mode 100644
index 0000000..2e9de9e
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.activations.pbtxt
@@ -0,0 +1,55 @@
+path: "tensorflow.keras.activations"
+tf_module {
+  member_method {
+    name: "deserialize"
+    argspec: "args=[\'name\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "elu"
+    argspec: "args=[\'x\', \'alpha\'], varargs=None, keywords=None, defaults=[\'1.0\'], "
+  }
+  member_method {
+    name: "get"
+    argspec: "args=[\'identifier\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "hard_sigmoid"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "linear"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "relu"
+    argspec: "args=[\'x\', \'alpha\', \'max_value\', \'threshold\'], varargs=None, keywords=None, defaults=[\'0.0\', \'None\', \'0\'], "
+  }
+  member_method {
+    name: "selu"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "serialize"
+    argspec: "args=[\'activation\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "sigmoid"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "softmax"
+    argspec: "args=[\'x\', \'axis\'], varargs=None, keywords=None, defaults=[\'-1\'], "
+  }
+  member_method {
+    name: "softplus"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "softsign"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "tanh"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.backend.name_scope.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.backend.name_scope.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.backend.name_scope.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.backend.name_scope.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.backend.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.backend.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.backend.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.backend.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-base-logger.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-base-logger.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.callbacks.-base-logger.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-base-logger.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-c-s-v-logger.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-c-s-v-logger.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.callbacks.-c-s-v-logger.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-c-s-v-logger.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-callback.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-callback.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.callbacks.-callback.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-callback.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-early-stopping.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-early-stopping.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.callbacks.-early-stopping.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-early-stopping.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-history.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-history.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.callbacks.-history.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-history.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-lambda-callback.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-lambda-callback.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.callbacks.-lambda-callback.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-lambda-callback.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-model-checkpoint.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-model-checkpoint.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.callbacks.-model-checkpoint.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-model-checkpoint.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-progbar-logger.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-progbar-logger.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.callbacks.-progbar-logger.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-progbar-logger.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-reduce-l-r-on-plateau.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-reduce-l-r-on-plateau.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.callbacks.-reduce-l-r-on-plateau.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-reduce-l-r-on-plateau.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-remote-monitor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-remote-monitor.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.callbacks.-remote-monitor.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-remote-monitor.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-tensor-board.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-tensor-board.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.callbacks.-tensor-board.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-tensor-board.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-terminate-on-na-n.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-terminate-on-na-n.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.callbacks.-terminate-on-na-n.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-terminate-on-na-n.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.callbacks.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.constraints.-constraint.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.constraints.-constraint.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.constraints.-constraint.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.constraints.-constraint.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.constraints.-max-norm.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.constraints.-max-norm.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.constraints.-max-norm.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.constraints.-max-norm.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.constraints.-min-max-norm.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.constraints.-min-max-norm.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.constraints.-min-max-norm.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.constraints.-min-max-norm.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.constraints.-non-neg.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.constraints.-non-neg.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.constraints.-non-neg.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.constraints.-non-neg.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.constraints.-unit-norm.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.constraints.-unit-norm.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.constraints.-unit-norm.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.constraints.-unit-norm.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.constraints.max_norm.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.constraints.max_norm.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.constraints.max_norm.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.constraints.max_norm.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.constraints.min_max_norm.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.constraints.min_max_norm.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.constraints.min_max_norm.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.constraints.min_max_norm.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.constraints.non_neg.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.constraints.non_neg.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.constraints.non_neg.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.constraints.non_neg.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.constraints.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.constraints.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.constraints.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.constraints.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.constraints.unit_norm.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.constraints.unit_norm.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.constraints.unit_norm.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.constraints.unit_norm.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.datasets.boston_housing.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.datasets.boston_housing.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.datasets.boston_housing.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.datasets.boston_housing.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.datasets.cifar10.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.datasets.cifar10.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.datasets.cifar10.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.datasets.cifar10.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.datasets.cifar100.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.datasets.cifar100.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.datasets.cifar100.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.datasets.cifar100.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.datasets.fashion_mnist.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.datasets.fashion_mnist.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.datasets.fashion_mnist.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.datasets.fashion_mnist.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.datasets.imdb.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.datasets.imdb.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.datasets.imdb.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.datasets.imdb.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.datasets.mnist.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.datasets.mnist.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.datasets.mnist.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.datasets.mnist.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.datasets.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.datasets.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.datasets.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.datasets.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.datasets.reuters.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.datasets.reuters.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.datasets.reuters.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.datasets.reuters.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.estimator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.estimator.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.estimator.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.estimator.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.-constant.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-constant.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.initializers.-constant.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-constant.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.-identity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-identity.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.initializers.-identity.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-identity.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.-initializer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-initializer.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.initializers.-initializer.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-initializer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.-ones.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-ones.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.initializers.-ones.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-ones.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.-orthogonal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-orthogonal.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.initializers.-orthogonal.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-orthogonal.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.-random-normal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-random-normal.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.initializers.-random-normal.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-random-normal.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.-random-uniform.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-random-uniform.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.initializers.-random-uniform.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-random-uniform.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.-truncated-normal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-truncated-normal.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.initializers.-truncated-normal.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-truncated-normal.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.-variance-scaling.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-variance-scaling.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.initializers.-variance-scaling.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-variance-scaling.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.-zeros.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-zeros.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.initializers.-zeros.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-zeros.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.constant.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.constant.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.initializers.constant.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.constant.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.identity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.identity.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.initializers.identity.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.identity.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.normal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.normal.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.initializers.normal.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.normal.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.ones.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.ones.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.initializers.ones.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.ones.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.orthogonal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.orthogonal.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.initializers.orthogonal.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.orthogonal.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.initializers.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.random_normal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.random_normal.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.initializers.random_normal.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.random_normal.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.random_uniform.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.random_uniform.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.initializers.random_uniform.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.random_uniform.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.truncated_normal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.truncated_normal.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.initializers.truncated_normal.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.truncated_normal.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.uniform.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.uniform.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.initializers.uniform.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.uniform.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.zeros.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.zeros.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.initializers.zeros.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.zeros.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-activation.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activation.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-activation.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activation.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-activity-regularization.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activity-regularization.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-activity-regularization.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activity-regularization.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-add.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-add.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-add.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-add.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-alpha-dropout.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-alpha-dropout.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-alpha-dropout.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-alpha-dropout.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling1-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling2-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling2-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling2-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling3-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling3-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling3-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-average.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool1-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool2-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool2-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool2-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool3-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool3-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool3-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-batch-normalization.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-batch-normalization.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-batch-normalization.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-batch-normalization.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-bidirectional.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-bidirectional.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-bidirectional.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-bidirectional.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-concatenate.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-concatenate.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-concatenate.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-concatenate.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-conv1-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution1-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping1-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping1-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping1-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping2-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping2-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping2-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping3-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping3-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping3-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-dense.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-dense.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-dot.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dot.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-dot.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dot.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-dropout.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dropout.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-dropout.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dropout.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-e-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-e-l-u.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-e-l-u.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-e-l-u.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-embedding.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-embedding.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-embedding.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-embedding.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-flatten.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-flatten.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u-cell.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-dropout.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-dropout.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-dropout.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-dropout.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-noise.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-noise.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-noise.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-noise.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-input-layer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-layer.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-input-layer.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-layer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-input-spec.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-spec.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-input-spec.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-spec.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-lambda.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt
similarity index 97%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-lambda.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt
index 5d05cf6..2dff7a6 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-lambda.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt
@@ -118,7 +118,7 @@
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-layer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-layer.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-leaky-re-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected1-d.pbtxt
similarity index 95%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected1-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected1-d.pbtxt
index f754fa1..ff19dcc 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected1-d.pbtxt
@@ -82,7 +82,7 @@
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'None\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\', \'implementation\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'None\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'1\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected2-d.pbtxt
similarity index 95%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected2-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected2-d.pbtxt
index c9516b8..3c278fe 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected2-d.pbtxt
@@ -82,7 +82,7 @@
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\', \'implementation\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'1\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-masking.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-masking.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-masking.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-masking.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool1-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool2-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool2-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool2-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool3-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool3-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool3-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling1-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling2-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling2-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling2-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling3-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling3-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling3-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-maximum.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-maximum.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-maximum.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-maximum.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-minimum.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-minimum.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-minimum.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-minimum.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-multiply.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-multiply.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-multiply.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-multiply.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-p-re-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-p-re-l-u.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-p-re-l-u.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-p-re-l-u.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-permute.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-permute.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-permute.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-permute.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-r-n-n.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-r-n-n.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-re-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-re-l-u.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-re-l-u.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-re-l-u.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-repeat-vector.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-repeat-vector.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-repeat-vector.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-repeat-vector.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-reshape.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-reshape.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-reshape.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-reshape.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv1-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv1-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv1-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv2-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv2-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv2-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-softmax.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-softmax.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-softmax.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-softmax.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
similarity index 98%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
index 1160d28..6718e36 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
@@ -61,6 +61,10 @@
     mtype: "<type \'property\'>"
   }
   member {
+    name: "output_size"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "state_size"
     mtype: "<type \'property\'>"
   }
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-subtract.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-subtract.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-subtract.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-subtract.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-time-distributed.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-time-distributed.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-time-distributed.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-time-distributed.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling1-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling1-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling1-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling2-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling2-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling2-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling3-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling3-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling3-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-wrapper.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-wrapper.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-wrapper.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-wrapper.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding1-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding1-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding1-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding2-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding2-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding2-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding3-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding3-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding3-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.layers.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.losses.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.losses.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.losses.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.metrics.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.metrics.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.pbtxt
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt
new file mode 100644
index 0000000..56914e1
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt
@@ -0,0 +1,268 @@
+path: "tensorflow.keras.models.Model"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_spec"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "layers"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "state_updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "stateful"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "uses_learning_phase"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\', \'training\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "compile"
+    argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\', \'distribute\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "evaluate_generator"
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+  }
+  member_method {
+    name: "fit"
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "fit_generator"
+    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_layer"
+    argspec: "args=[\'self\', \'name\', \'index\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "load_weights"
+    argspec: "args=[\'self\', \'filepath\', \'by_name\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+  member_method {
+    name: "predict"
+    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "predict_generator"
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+  }
+  member_method {
+    name: "predict_on_batch"
+    argspec: "args=[\'self\', \'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "save"
+    argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'include_optimizer\'], varargs=None, keywords=None, defaults=[\'True\', \'True\'], "
+  }
+  member_method {
+    name: "save_weights"
+    argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "summary"
+    argspec: "args=[\'self\', \'line_length\', \'positions\', \'print_fn\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "test_on_batch"
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "to_json"
+    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "to_yaml"
+    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "train_on_batch"
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt
new file mode 100644
index 0000000..acfb352
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt
@@ -0,0 +1,285 @@
+path: "tensorflow.keras.models.Sequential"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.engine.sequential.Sequential\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_spec"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "layers"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "state_updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "stateful"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "uses_learning_phase"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'layers\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add"
+    argspec: "args=[\'self\', \'layer\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\', \'training\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "compile"
+    argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\', \'distribute\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "evaluate_generator"
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+  }
+  member_method {
+    name: "fit"
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "fit_generator"
+    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_layer"
+    argspec: "args=[\'self\', \'name\', \'index\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "load_weights"
+    argspec: "args=[\'self\', \'filepath\', \'by_name\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+  member_method {
+    name: "pop"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "predict"
+    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "predict_classes"
+    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\'], varargs=None, keywords=None, defaults=[\'32\', \'0\'], "
+  }
+  member_method {
+    name: "predict_generator"
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+  }
+  member_method {
+    name: "predict_on_batch"
+    argspec: "args=[\'self\', \'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "predict_proba"
+    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\'], varargs=None, keywords=None, defaults=[\'32\', \'0\'], "
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "save"
+    argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'include_optimizer\'], varargs=None, keywords=None, defaults=[\'True\', \'True\'], "
+  }
+  member_method {
+    name: "save_weights"
+    argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "summary"
+    argspec: "args=[\'self\', \'line_length\', \'positions\', \'print_fn\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "test_on_batch"
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "to_json"
+    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "to_yaml"
+    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "train_on_batch"
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.models.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.models.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.models.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adadelta.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adadelta.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adadelta.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adadelta.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adagrad.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adagrad.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adagrad.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adagrad.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adam.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adam.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adam.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adam.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adamax.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adamax.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adamax.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adamax.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-nadam.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-nadam.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.optimizers.-nadam.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-nadam.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-optimizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-optimizer.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.optimizers.-optimizer.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-optimizer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-r-m-sprop.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-r-m-sprop.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.optimizers.-r-m-sprop.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-r-m-sprop.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-s-g-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-s-g-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.optimizers.-s-g-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-s-g-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.optimizers.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.regularizers.-l1-l2.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.-l1-l2.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.regularizers.-l1-l2.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.-l1-l2.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.regularizers.-regularizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.-regularizer.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.regularizers.-regularizer.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.-regularizer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.regularizers.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.regularizers.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.utils.-custom-object-scope.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.utils.-custom-object-scope.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.utils.-custom-object-scope.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.utils.-custom-object-scope.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.utils.-generator-enqueuer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.utils.-generator-enqueuer.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.utils.-generator-enqueuer.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.utils.-generator-enqueuer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.utils.-h-d-f5-matrix.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.utils.-h-d-f5-matrix.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.utils.-h-d-f5-matrix.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.utils.-h-d-f5-matrix.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.utils.-progbar.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.utils.-progbar.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.utils.-progbar.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.utils.-progbar.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.utils.-sequence-enqueuer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.utils.-sequence-enqueuer.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.utils.-sequence-enqueuer.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.utils.-sequence-enqueuer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.utils.-sequence.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.utils.-sequence.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.utils.-sequence.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.utils.-sequence.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.utils.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.utils.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.utils.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.utils.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.wrappers.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.wrappers.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.wrappers.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.wrappers.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.wrappers.scikit_learn.-keras-classifier.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.wrappers.scikit_learn.-keras-classifier.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.wrappers.scikit_learn.-keras-classifier.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.wrappers.scikit_learn.-keras-classifier.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.wrappers.scikit_learn.-keras-regressor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.wrappers.scikit_learn.-keras-regressor.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.wrappers.scikit_learn.-keras-regressor.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.wrappers.scikit_learn.-keras-regressor.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.wrappers.scikit_learn.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.wrappers.scikit_learn.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.keras.wrappers.scikit_learn.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.keras.wrappers.scikit_learn.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.layers.-average-pooling1-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.layers.-average-pooling1-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.layers.-average-pooling1-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.layers.-average-pooling2-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.layers.-average-pooling2-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.layers.-average-pooling2-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.layers.-average-pooling3-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.layers.-average-pooling3-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.layers.-average-pooling3-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-batch-normalization.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.layers.-batch-normalization.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.layers.-batch-normalization.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.layers.-batch-normalization.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.layers.-conv1-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.layers.-conv1-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.layers.-conv1-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-conv2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.layers.-conv2-d-transpose.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.layers.-conv2-d-transpose.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.layers.-conv2-d-transpose.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.layers.-conv2-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.layers.-conv2-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.layers.-conv2-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.layers.-conv3-d-transpose.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.layers.-conv3-d-transpose.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.layers.-conv3-d-transpose.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.layers.-conv3-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.layers.-conv3-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.layers.-conv3-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-dense.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.layers.-dense.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.layers.-dense.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.layers.-dense.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-dropout.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.layers.-dropout.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.layers.-dropout.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.layers.-dropout.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-flatten.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.layers.-flatten.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.layers.-flatten.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.layers.-flatten.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-input-spec.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.layers.-input-spec.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.layers.-input-spec.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.layers.-input-spec.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-layer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.layers.-layer.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.layers.-layer.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.layers.-layer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.layers.-max-pooling1-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.layers.-max-pooling1-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.layers.-max-pooling1-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.layers.-max-pooling2-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.layers.-max-pooling2-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.layers.-max-pooling2-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.layers.-max-pooling3-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.layers.-max-pooling3-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.layers.-max-pooling3-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-separable-conv1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.layers.-separable-conv1-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.layers.-separable-conv1-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.layers.-separable-conv1-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-separable-conv2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.layers.-separable-conv2-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.layers.-separable-conv2-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.layers.-separable-conv2-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.layers.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.layers.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.layers.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-block-diag.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-block-diag.__metaclass__.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-block-diag.__metaclass__.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-block-diag.__metaclass__.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-block-diag.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-block-diag.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-block-diag.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-block-diag.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant.__metaclass__.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant.__metaclass__.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant.__metaclass__.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant2-d.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant2-d.__metaclass__.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant2-d.__metaclass__.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant2-d.__metaclass__.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant3-d.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant3-d.__metaclass__.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant3-d.__metaclass__.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant3-d.__metaclass__.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-composition.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-composition.__metaclass__.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-composition.__metaclass__.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-composition.__metaclass__.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-composition.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-composition.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-composition.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-composition.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-diag.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-diag.__metaclass__.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-diag.__metaclass__.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-diag.__metaclass__.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-diag.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-diag.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-diag.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-diag.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-full-matrix.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-full-matrix.__metaclass__.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-full-matrix.__metaclass__.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-full-matrix.__metaclass__.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-full-matrix.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-identity.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-identity.__metaclass__.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-identity.__metaclass__.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-identity.__metaclass__.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-identity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-identity.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-identity.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-identity.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-kronecker.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-kronecker.__metaclass__.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-kronecker.__metaclass__.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-kronecker.__metaclass__.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-kronecker.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-kronecker.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-kronecker.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-kronecker.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-low-rank-update.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-low-rank-update.__metaclass__.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-low-rank-update.__metaclass__.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-low-rank-update.__metaclass__.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-lower-triangular.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-lower-triangular.__metaclass__.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-lower-triangular.__metaclass__.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-lower-triangular.__metaclass__.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-scaled-identity.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-scaled-identity.__metaclass__.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-scaled-identity.__metaclass__.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-scaled-identity.__metaclass__.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-zeros.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-zeros.__metaclass__.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-zeros.__metaclass__.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-zeros.__metaclass__.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-zeros.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-zeros.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-zeros.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-zeros.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator.__metaclass__.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator.__metaclass__.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator.__metaclass__.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.linalg.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.logging.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.logging.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.logging.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.logging.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.losses.-reduction.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-reduction.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.losses.-reduction.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.losses.-reduction.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.losses.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.losses.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.losses.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.manip.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.manip.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.manip.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.manip.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.math.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.math.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.math.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.math.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.metrics.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.metrics.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.metrics.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.name_scope.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.name_scope.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.name_scope.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.name_scope.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.nn.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.nn.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
similarity index 96%
copy from tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
index c747730..e606eab 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
@@ -101,7 +101,7 @@
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'num_units\', \'forget_bias\', \'state_is_tuple\', \'activation\', \'reuse\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'1.0\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'num_units\', \'forget_bias\', \'state_is_tuple\', \'activation\', \'reuse\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'1.0\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -125,7 +125,7 @@
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'inputs_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
similarity index 96%
copy from tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
index d251f54..5deb02d 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
@@ -101,7 +101,7 @@
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'num_units\', \'activation\', \'reuse\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'num_units\', \'activation\', \'reuse\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -125,7 +125,7 @@
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'inputs_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
similarity index 96%
copy from tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
index d76eab7..32fa151 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
@@ -101,7 +101,7 @@
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'num_units\', \'activation\', \'reuse\', \'kernel_initializer\', \'bias_initializer\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'num_units\', \'activation\', \'reuse\', \'kernel_initializer\', \'bias_initializer\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -125,7 +125,7 @@
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'inputs_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
similarity index 95%
copy from tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
index 944db6a..30c6c2c 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
@@ -101,7 +101,7 @@
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'num_units\', \'use_peepholes\', \'cell_clip\', \'initializer\', \'num_proj\', \'proj_clip\', \'num_unit_shards\', \'num_proj_shards\', \'forget_bias\', \'state_is_tuple\', \'activation\', \'reuse\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'1.0\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'num_units\', \'use_peepholes\', \'cell_clip\', \'initializer\', \'num_proj\', \'proj_clip\', \'num_unit_shards\', \'num_proj_shards\', \'forget_bias\', \'state_is_tuple\', \'activation\', \'reuse\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'False\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'1.0\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -125,7 +125,7 @@
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'inputs_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-l-s-t-m-state-tuple.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-l-s-t-m-state-tuple.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-l-s-t-m-state-tuple.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-l-s-t-m-state-tuple.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.ones_initializer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.ones_initializer.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.ones_initializer.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.ones_initializer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.orthogonal_initializer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.orthogonal_initializer.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.orthogonal_initializer.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.orthogonal_initializer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt
similarity index 98%
copy from tensorflow/tools/api/golden/tensorflow.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.pbtxt
index 5eb42b4..4de662f 100644
--- a/tensorflow/tools/api/golden/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt
@@ -785,6 +785,10 @@
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
+    name: "batch_gather"
+    argspec: "args=[\'params\', \'indices\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
     name: "batch_to_space"
     argspec: "args=[\'input\', \'crops\', \'block_size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
@@ -1902,19 +1906,19 @@
   }
   member_method {
     name: "sparse_reduce_max"
-    argspec: "args=[\'sp_input\', \'axis\', \'keep_dims\', \'reduction_axes\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'sp_input\', \'axis\', \'keepdims\', \'reduction_axes\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "sparse_reduce_max_sparse"
-    argspec: "args=[\'sp_input\', \'axis\', \'keep_dims\', \'reduction_axes\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'sp_input\', \'axis\', \'keepdims\', \'reduction_axes\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "sparse_reduce_sum"
-    argspec: "args=[\'sp_input\', \'axis\', \'keep_dims\', \'reduction_axes\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'sp_input\', \'axis\', \'keepdims\', \'reduction_axes\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "sparse_reduce_sum_sparse"
-    argspec: "args=[\'sp_input\', \'axis\', \'keep_dims\', \'reduction_axes\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'sp_input\', \'axis\', \'keepdims\', \'reduction_axes\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "sparse_reorder"
diff --git a/tensorflow/tools/api/golden/tensorflow.profiler.-advice-proto.-checker.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.profiler.-advice-proto.-checker.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.profiler.-advice-proto.-checker.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.profiler.-advice-proto.-checker.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.profiler.-advice-proto.-checkers-entry.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.profiler.-advice-proto.-checkers-entry.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.profiler.-advice-proto.-checkers-entry.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.profiler.-advice-proto.-checkers-entry.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.profiler.-advice-proto.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.profiler.-advice-proto.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.profiler.-advice-proto.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.profiler.-advice-proto.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.profiler.-graph-node-proto.-input-shapes-entry.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.profiler.-graph-node-proto.-input-shapes-entry.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.profiler.-graph-node-proto.-input-shapes-entry.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.profiler.-graph-node-proto.-input-shapes-entry.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.profiler.-graph-node-proto.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.profiler.-graph-node-proto.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.profiler.-graph-node-proto.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.profiler.-graph-node-proto.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.profiler.-multi-graph-node-proto.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.profiler.-multi-graph-node-proto.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.profiler.-multi-graph-node-proto.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.profiler.-multi-graph-node-proto.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.profiler.-op-log-proto.-id-to-string-entry.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.profiler.-op-log-proto.-id-to-string-entry.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.profiler.-op-log-proto.-id-to-string-entry.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.profiler.-op-log-proto.-id-to-string-entry.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.profiler.-op-log-proto.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.profiler.-op-log-proto.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.profiler.-op-log-proto.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.profiler.-op-log-proto.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.profiler.-profile-option-builder.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.profiler.-profile-option-builder.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.profiler.-profile-option-builder.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.profiler.-profile-option-builder.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.profiler.-profiler.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.profiler.-profiler.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.profiler.-profiler.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.profiler.-profiler.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.profiler.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.profiler.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.profiler.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.profiler.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.python_io.-t-f-record-compression-type.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.python_io.-t-f-record-compression-type.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.python_io.-t-f-record-compression-type.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.python_io.-t-f-record-compression-type.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.python_io.-t-f-record-options.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.python_io.-t-f-record-options.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.python_io.-t-f-record-options.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.python_io.-t-f-record-options.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.python_io.-t-f-record-writer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.python_io.-t-f-record-writer.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.python_io.-t-f-record-writer.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.python_io.-t-f-record-writer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.python_io.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.python_io.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.python_io.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.python_io.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.quantization.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.quantization.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.quantization.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.quantization.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.random_normal_initializer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.random_normal_initializer.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.random_normal_initializer.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.random_normal_initializer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.random_uniform_initializer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.random_uniform_initializer.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.random_uniform_initializer.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.random_uniform_initializer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.resource_loader.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.resource_loader.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.resource_loader.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.resource_loader.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.saved_model.builder.-saved-model-builder.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.saved_model.builder.-saved-model-builder.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.saved_model.builder.-saved-model-builder.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.saved_model.builder.-saved-model-builder.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.saved_model.builder.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.saved_model.builder.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.saved_model.builder.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.saved_model.builder.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.saved_model.constants.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.saved_model.constants.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.saved_model.constants.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.saved_model.constants.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.saved_model.loader.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.saved_model.loader.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.saved_model.loader.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.saved_model.loader.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.saved_model.main_op.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.saved_model.main_op.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.saved_model.main_op.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.saved_model.main_op.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.saved_model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.saved_model.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.saved_model.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.saved_model.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.saved_model.signature_constants.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.saved_model.signature_constants.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.saved_model.signature_constants.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.saved_model.signature_constants.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.saved_model.signature_def_utils.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.saved_model.signature_def_utils.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.saved_model.signature_def_utils.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.saved_model.signature_def_utils.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.saved_model.tag_constants.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.saved_model.tag_constants.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.saved_model.tag_constants.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.saved_model.tag_constants.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.saved_model.utils.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.saved_model.utils.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.saved_model.utils.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.saved_model.utils.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.sets.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.sets.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.sets.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.sets.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.sparse.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.sparse.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.spectral.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.spectral.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.spectral.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.spectral.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.strings.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt
similarity index 92%
copy from tensorflow/tools/api/golden/tensorflow.strings.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt
index 9a831fe..018be7b 100644
--- a/tensorflow/tools/api/golden/tensorflow.strings.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt
@@ -5,6 +5,10 @@
     argspec: "args=[\'inputs\', \'separator\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], "
   }
   member_method {
+    name: "length"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
     name: "regex_full_match"
     argspec: "args=[\'input\', \'pattern\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
diff --git a/tensorflow/tools/api/golden/tensorflow.summary.-event.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.-event.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.summary.-event.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.summary.-event.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.summary.-file-writer-cache.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.-file-writer-cache.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.summary.-file-writer-cache.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.summary.-file-writer-cache.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.summary.-file-writer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.-file-writer.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.summary.-file-writer.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.summary.-file-writer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.summary.-session-log.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.-session-log.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.summary.-session-log.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.summary.-session-log.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.summary.-summary-description.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary-description.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.summary.-summary-description.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.summary.-summary-description.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.summary.-summary.-audio.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary.-audio.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.summary.-summary.-audio.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.summary.-summary.-audio.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.summary.-summary.-image.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary.-image.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.summary.-summary.-image.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.summary.-summary.-image.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.summary.-summary.-value.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary.-value.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.summary.-summary.-value.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.summary.-summary.-value.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.summary.-summary.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.summary.-summary.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.summary.-summary.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.summary.-tagged-run-metadata.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.-tagged-run-metadata.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.summary.-tagged-run-metadata.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.summary.-tagged-run-metadata.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.summary.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.pbtxt
similarity index 94%
copy from tensorflow/tools/api/golden/tensorflow.summary.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.summary.pbtxt
index 871ebb5..7ed9cd7 100644
--- a/tensorflow/tools/api/golden/tensorflow.summary.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.summary.pbtxt
@@ -50,7 +50,7 @@
   }
   member_method {
     name: "merge_all"
-    argspec: "args=[\'key\', \'scope\'], varargs=None, keywords=None, defaults=[\'summaries\', \'None\'], "
+    argspec: "args=[\'key\', \'scope\', \'name\'], varargs=None, keywords=None, defaults=[\'summaries\', \'None\', \'None\'], "
   }
   member_method {
     name: "scalar"
diff --git a/tensorflow/tools/api/golden/tensorflow.sysconfig.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.sysconfig.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.sysconfig.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.sysconfig.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.test.-benchmark.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.test.-benchmark.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.test.-benchmark.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.test.-benchmark.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.test.-stub-out-for-testing.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.test.-stub-out-for-testing.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.test.-stub-out-for-testing.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.test.-stub-out-for-testing.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.test.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.test.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.test.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.test.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-adadelta-optimizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-adadelta-optimizer.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-adadelta-optimizer.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-adadelta-optimizer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-adagrad-d-a-optimizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-adagrad-d-a-optimizer.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-adagrad-d-a-optimizer.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-adagrad-d-a-optimizer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-adagrad-optimizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-adagrad-optimizer.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-adagrad-optimizer.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-adagrad-optimizer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-adam-optimizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-adam-optimizer.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-adam-optimizer.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-adam-optimizer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-bytes-list.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-bytes-list.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-bytes-list.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-bytes-list.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-checkpoint-saver-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint-saver-hook.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-checkpoint-saver-hook.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint-saver-hook.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-checkpoint-saver-listener.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint-saver-listener.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-checkpoint-saver-listener.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint-saver-listener.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-checkpoint.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint.pbtxt
similarity index 84%
copy from tensorflow/tools/api/golden/tensorflow.train.-checkpoint.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint.pbtxt
index 2d067e4..5be3720 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-checkpoint.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint.pbtxt
@@ -20,4 +20,8 @@
     name: "save"
     argspec: "args=[\'self\', \'file_prefix\', \'session\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "write"
+    argspec: "args=[\'self\', \'file_prefix\', \'session\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-chief-session-creator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-chief-session-creator.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-chief-session-creator.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-chief-session-creator.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-cluster-def.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-cluster-def.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-cluster-def.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-cluster-def.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-cluster-spec.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-cluster-spec.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-cluster-spec.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-cluster-spec.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-coordinator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-coordinator.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-coordinator.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-coordinator.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-example.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-example.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-example.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-example.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-exponential-moving-average.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-exponential-moving-average.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-exponential-moving-average.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-exponential-moving-average.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-feature-list.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-feature-list.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-feature-list.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-feature-list.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-feature-lists.-feature-list-entry.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-feature-lists.-feature-list-entry.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-feature-lists.-feature-list-entry.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-feature-lists.-feature-list-entry.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-feature-lists.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-feature-lists.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-feature-lists.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-feature-lists.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-feature.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-feature.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-feature.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-feature.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-features.-feature-entry.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-features.-feature-entry.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-features.-feature-entry.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-features.-feature-entry.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-features.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-features.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-features.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-features.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-feed-fn-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-feed-fn-hook.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-feed-fn-hook.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-feed-fn-hook.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-final-ops-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-final-ops-hook.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-final-ops-hook.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-final-ops-hook.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-float-list.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-float-list.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-float-list.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-float-list.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-ftrl-optimizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-ftrl-optimizer.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-ftrl-optimizer.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-ftrl-optimizer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-global-step-waiter-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-global-step-waiter-hook.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-global-step-waiter-hook.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-global-step-waiter-hook.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-gradient-descent-optimizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-gradient-descent-optimizer.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-gradient-descent-optimizer.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-gradient-descent-optimizer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-int64-list.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-int64-list.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-int64-list.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-int64-list.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-job-def.-tasks-entry.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-job-def.-tasks-entry.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-job-def.-tasks-entry.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-job-def.-tasks-entry.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-job-def.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-job-def.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-job-def.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-job-def.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-logging-tensor-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-logging-tensor-hook.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-logging-tensor-hook.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-logging-tensor-hook.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-looper-thread.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-looper-thread.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-looper-thread.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-looper-thread.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-momentum-optimizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-momentum-optimizer.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-momentum-optimizer.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-momentum-optimizer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-monitored-session.-step-context.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-monitored-session.-step-context.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-monitored-session.-step-context.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-monitored-session.-step-context.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-monitored-session.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-monitored-session.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-monitored-session.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-monitored-session.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-nan-loss-during-training-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-nan-loss-during-training-error.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-nan-loss-during-training-error.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-nan-loss-during-training-error.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-nan-tensor-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-nan-tensor-hook.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-nan-tensor-hook.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-nan-tensor-hook.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-optimizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-optimizer.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-optimizer.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-optimizer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-profiler-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-profiler-hook.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-profiler-hook.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-profiler-hook.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-proximal-adagrad-optimizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-proximal-adagrad-optimizer.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-proximal-adagrad-optimizer.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-proximal-adagrad-optimizer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-proximal-gradient-descent-optimizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-proximal-gradient-descent-optimizer.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-proximal-gradient-descent-optimizer.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-proximal-gradient-descent-optimizer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-queue-runner.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-queue-runner.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-queue-runner.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-queue-runner.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-r-m-s-prop-optimizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-r-m-s-prop-optimizer.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-r-m-s-prop-optimizer.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-r-m-s-prop-optimizer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-saver-def.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-saver-def.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-saver-def.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-saver-def.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-saver.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-saver.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-saver.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-saver.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-scaffold.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-scaffold.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-scaffold.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-scaffold.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-second-or-step-timer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-second-or-step-timer.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-second-or-step-timer.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-second-or-step-timer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-sequence-example.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-sequence-example.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-sequence-example.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-sequence-example.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-server-def.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-server-def.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-server-def.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-server-def.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-server.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-server.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-server.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-server.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-session-creator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-session-creator.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-session-creator.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-session-creator.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-session-manager.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-session-manager.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-session-manager.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-session-manager.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-session-run-args.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-session-run-args.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-session-run-args.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-session-run-args.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-session-run-context.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-session-run-context.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-session-run-context.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-session-run-context.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-session-run-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-session-run-hook.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-session-run-hook.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-session-run-hook.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-session-run-values.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-session-run-values.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-session-run-values.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-session-run-values.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-singular-monitored-session.-step-context.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-singular-monitored-session.-step-context.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-singular-monitored-session.-step-context.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-singular-monitored-session.-step-context.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-singular-monitored-session.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-singular-monitored-session.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-singular-monitored-session.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-singular-monitored-session.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-step-counter-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-step-counter-hook.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-step-counter-hook.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-step-counter-hook.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-stop-at-step-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-stop-at-step-hook.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-stop-at-step-hook.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-stop-at-step-hook.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-summary-saver-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-summary-saver-hook.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-summary-saver-hook.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-summary-saver-hook.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-supervisor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-supervisor.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-supervisor.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-supervisor.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-sync-replicas-optimizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-sync-replicas-optimizer.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-sync-replicas-optimizer.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-sync-replicas-optimizer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-vocab-info.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-vocab-info.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-vocab-info.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-vocab-info.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-worker-session-creator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-worker-session-creator.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.-worker-session-creator.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.-worker-session-creator.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.pbtxt
similarity index 97%
copy from tensorflow/tools/api/golden/tensorflow.train.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.pbtxt
index b0fb04d..9f35395 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.train.pbtxt
@@ -298,7 +298,7 @@
   }
   member_method {
     name: "generate_checkpoint_state_proto"
-    argspec: "args=[\'save_dir\', \'model_checkpoint_path\', \'all_model_checkpoint_paths\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'save_dir\', \'model_checkpoint_path\', \'all_model_checkpoint_paths\', \'all_model_checkpoint_timestamps\', \'last_preserved_timestamp\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "get_checkpoint_mtimes"
@@ -446,7 +446,7 @@
   }
   member_method {
     name: "update_checkpoint_state"
-    argspec: "args=[\'save_dir\', \'model_checkpoint_path\', \'all_model_checkpoint_paths\', \'latest_filename\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'save_dir\', \'model_checkpoint_path\', \'all_model_checkpoint_paths\', \'latest_filename\', \'all_model_checkpoint_timestamps\', \'last_preserved_timestamp\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "warm_start"
diff --git a/tensorflow/tools/api/golden/tensorflow.train.queue_runner.-queue-runner.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.queue_runner.-queue-runner.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.queue_runner.-queue-runner.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.queue_runner.-queue-runner.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.train.queue_runner.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.queue_runner.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.train.queue_runner.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.train.queue_runner.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.truncated_normal_initializer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.truncated_normal_initializer.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.truncated_normal_initializer.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.truncated_normal_initializer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.uniform_unit_scaling_initializer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.uniform_unit_scaling_initializer.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.uniform_unit_scaling_initializer.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.uniform_unit_scaling_initializer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.variable_scope.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.variable_scope.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.variable_scope.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.variable_scope.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.variance_scaling_initializer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.variance_scaling_initializer.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.variance_scaling_initializer.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.variance_scaling_initializer.pbtxt
diff --git a/tensorflow/tools/api/golden/tensorflow.zeros_initializer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.zeros_initializer.pbtxt
similarity index 100%
copy from tensorflow/tools/api/golden/tensorflow.zeros_initializer.pbtxt
copy to tensorflow/tools/api/golden/v2/tensorflow.zeros_initializer.pbtxt
diff --git a/tensorflow/tools/api/tests/BUILD b/tensorflow/tools/api/tests/BUILD
index 724b12c..8764409 100644
--- a/tensorflow/tools/api/tests/BUILD
+++ b/tensorflow/tools/api/tests/BUILD
@@ -17,7 +17,8 @@
     name = "api_compatibility_test",
     srcs = ["api_compatibility_test.py"],
     data = [
-        "//tensorflow/tools/api/golden:api_golden",
+        "//tensorflow/tools/api/golden:api_golden_v1",
+        "//tensorflow/tools/api/golden:api_golden_v2",
         "//tensorflow/tools/api/tests:API_UPDATE_WARNING.txt",
         "//tensorflow/tools/api/tests:README.txt",
     ],
diff --git a/tensorflow/tools/api/tests/api_compatibility_test.py b/tensorflow/tools/api/tests/api_compatibility_test.py
index d1b34fb..43d19bc 100644
--- a/tensorflow/tools/api/tests/api_compatibility_test.py
+++ b/tensorflow/tools/api/tests/api_compatibility_test.py
@@ -34,13 +34,6 @@
 import unittest
 
 import tensorflow as tf
-# pylint: disable=g-import-not-at-top
-try:
-  from tensorflow.compat import v1 as tf_v1
-  # We import compat.v1 as tf_v1 instead.
-  del tf.compat.v1
-except ImportError:
-  tf_v1 = None
 
 from google.protobuf import message
 from google.protobuf import text_format
@@ -53,8 +46,6 @@
 from tensorflow.tools.api.lib import python_object_to_proto_visitor
 from tensorflow.tools.common import public_api
 from tensorflow.tools.common import traverse
-# pylint: enable=g-import-not-at-top
-
 
 # FLAGS defined at the bottom:
 FLAGS = None
@@ -70,19 +61,25 @@
      false, only print which libraries have differences.
 """
 
-_API_GOLDEN_FOLDER = 'tensorflow/tools/api/golden'
+_API_GOLDEN_FOLDER_V1 = 'tensorflow/tools/api/golden/v1'
+_API_GOLDEN_FOLDER_V2 = 'tensorflow/tools/api/golden/v2'
 _TEST_README_FILE = 'tensorflow/tools/api/tests/README.txt'
 _UPDATE_WARNING_FILE = 'tensorflow/tools/api/tests/API_UPDATE_WARNING.txt'
 
 
-def _KeyToFilePath(key):
-  """From a given key, construct a filepath."""
+def _KeyToFilePath(key, api_version):
+  """From a given key, construct a filepath.
+
+  Filepath will be inside golden folder for api_version.
+  """
   def _ReplaceCapsWithDash(matchobj):
     match = matchobj.group(0)
     return '-%s' % (match.lower())
 
   case_insensitive_key = re.sub('([A-Z]{1})', _ReplaceCapsWithDash, key)
-  return os.path.join(_API_GOLDEN_FOLDER, '%s.pbtxt' % case_insensitive_key)
+  api_folder = (
+      _API_GOLDEN_FOLDER_V2 if api_version == 2 else _API_GOLDEN_FOLDER_V1)
+  return os.path.join(api_folder, '%s.pbtxt' % case_insensitive_key)
 
 
 def _FileNameToKey(filename):
@@ -98,6 +95,21 @@
   return api_object_key
 
 
+def _VerifyNoSubclassOfMessageVisitor(path, parent, unused_children):
+  """A Visitor that crashes on subclasses of generated proto classes."""
+  # If the traversed object is a proto Message class
+  if not (isinstance(parent, type) and
+          issubclass(parent, message.Message)):
+    return
+  if parent is message.Message:
+    return
+  # Check that it is a direct subclass of Message.
+  if message.Message not in parent.__bases__:
+    raise NotImplementedError(
+        'Object tf.%s is a subclass of a generated proto Message. '
+        'They are not yet supported by the API tools.' % path)
+
+
 class ApiCompatibilityTest(test.TestCase):
 
   def __init__(self, *args, **kwargs):
@@ -120,7 +132,8 @@
                              actual_dict,
                              verbose=False,
                              update_goldens=False,
-                             additional_missing_object_message=''):
+                             additional_missing_object_message='',
+                             api_version=2):
     """Diff given dicts of protobufs and report differences a readable way.
 
     Args:
@@ -133,6 +146,7 @@
       update_goldens: Whether to update goldens when there are diffs found.
       additional_missing_object_message: Message to print when a symbol is
           missing.
+      api_version: TensorFlow API version to test.
     """
     diffs = []
     verbose_diffs = []
@@ -158,6 +172,8 @@
         diff_message = 'New object %s found (added).' % key
         verbose_diff_message = diff_message
       else:
+        # Do not truncate diff
+        self.maxDiffs = None  # pylint: disable=invalid-name
         # Now we can run an actual proto diff.
         try:
           self.assertProtoEquals(expected_dict[key], actual_dict[key])
@@ -188,13 +204,13 @@
         # If the keys are only in expected, some objects are deleted.
         # Remove files.
         for key in only_in_expected:
-          filepath = _KeyToFilePath(key)
+          filepath = _KeyToFilePath(key, api_version)
           file_io.delete_file(filepath)
 
         # If the files are only in actual (current library), these are new
         # modules. Write them to files. Also record all updates in files.
         for key in only_in_actual | set(updated_keys):
-          filepath = _KeyToFilePath(key)
+          filepath = _KeyToFilePath(key, api_version)
           file_io.write_string_to_file(
               filepath, text_format.MessageToString(actual_dict[key]))
       else:
@@ -205,33 +221,40 @@
       logging.info('No differences found between API and golden.')
 
   def testNoSubclassOfMessage(self):
-
-    def Visit(path, parent, unused_children):
-      """A Visitor that crashes on subclasses of generated proto classes."""
-      # If the traversed object is a proto Message class
-      if not (isinstance(parent, type) and
-              issubclass(parent, message.Message)):
-        return
-      if parent is message.Message:
-        return
-      # Check that it is a direct subclass of Message.
-      if message.Message not in parent.__bases__:
-        raise NotImplementedError(
-            'Object tf.%s is a subclass of a generated proto Message. '
-            'They are not yet supported by the API tools.' % path)
-    visitor = public_api.PublicAPIVisitor(Visit)
+    visitor = public_api.PublicAPIVisitor(_VerifyNoSubclassOfMessageVisitor)
     visitor.do_not_descend_map['tf'].append('contrib')
+    # Skip compat.v1 and compat.v2 since they are validated in separate tests.
+    visitor.private_map['tf.compat'] = ['v1', 'v2']
     traverse.traverse(tf, visitor)
 
-  def checkBackwardsCompatibility(self, root, golden_file_pattern):
-     # Extract all API stuff.
+  def testNoSubclassOfMessageV1(self):
+    if not hasattr(tf.compat, 'v1'):
+      return
+    visitor = public_api.PublicAPIVisitor(_VerifyNoSubclassOfMessageVisitor)
+    visitor.do_not_descend_map['tf'].append('contrib')
+    traverse.traverse(tf.compat.v1, visitor)
+
+  def testNoSubclassOfMessageV2(self):
+    if not hasattr(tf.compat, 'v2'):
+      return
+    visitor = public_api.PublicAPIVisitor(_VerifyNoSubclassOfMessageVisitor)
+    visitor.do_not_descend_map['tf'].append('contrib')
+    traverse.traverse(tf.compat.v2, visitor)
+
+  def _checkBackwardsCompatibility(
+      self, root, golden_file_pattern, api_version,
+      additional_private_map=None):
+    # Extract all API stuff.
     visitor = python_object_to_proto_visitor.PythonObjectToProtoVisitor()
 
     public_api_visitor = public_api.PublicAPIVisitor(visitor)
     public_api_visitor.do_not_descend_map['tf'].append('contrib')
-    public_api_visitor.do_not_descend_map['tf.GPUOptions'] = ['Experimental']
-    traverse.traverse(root, public_api_visitor)
+    public_api_visitor.do_not_descend_map['tf.GPUOptions'] = [
+        'Experimental']
+    if additional_private_map:
+      public_api_visitor.private_map.update(additional_private_map)
 
+    traverse.traverse(root, public_api_visitor)
     proto_dict = visitor.GetProtos()
 
     # Read all golden files.
@@ -254,27 +277,50 @@
         golden_proto_dict,
         proto_dict,
         verbose=FLAGS.verbose_diffs,
-        update_goldens=FLAGS.update_goldens)
+        update_goldens=FLAGS.update_goldens,
+        api_version=api_version)
 
   @unittest.skipUnless(
       sys.version_info.major == 2,
       'API compabitility test goldens are generated using python2.')
   def testAPIBackwardsCompatibility(self):
+    api_version = 1
     golden_file_pattern = os.path.join(
         resource_loader.get_root_dir_with_all_resources(),
-        _KeyToFilePath('*'))
-    self.checkBackwardsCompatibility(tf, golden_file_pattern)
+        _KeyToFilePath('*', api_version))
+    self._checkBackwardsCompatibility(
+        tf,
+        golden_file_pattern,
+        api_version,
+        # Skip compat.v1 and compat.v2 since they are validated
+        # in separate tests.
+        additional_private_map={'tf.compat': ['v1', 'v2']})
 
   @unittest.skipUnless(
       sys.version_info.major == 2,
       'API compabitility test goldens are generated using python2.')
   def testAPIBackwardsCompatibilityV1(self):
-    if not tf_v1:
+    if not hasattr(tf.compat, 'v1'):
       return
+    api_version = 1
     golden_file_pattern = os.path.join(
         resource_loader.get_root_dir_with_all_resources(),
-        _KeyToFilePath('*'))
-    self.checkBackwardsCompatibility(tf_v1, golden_file_pattern)
+        _KeyToFilePath('*', api_version))
+    self._checkBackwardsCompatibility(
+        tf.compat.v1, golden_file_pattern, api_version)
+
+  @unittest.skipUnless(
+      sys.version_info.major == 2,
+      'API compabitility test goldens are generated using python2.')
+  def testAPIBackwardsCompatibilityV2(self):
+    if not hasattr(tf.compat, 'v2'):
+      return
+    api_version = 2
+    golden_file_pattern = os.path.join(
+        resource_loader.get_root_dir_with_all_resources(),
+        _KeyToFilePath('*', api_version))
+    self._checkBackwardsCompatibility(
+        tf.compat.v2, golden_file_pattern, api_version)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/tools/ci_build/Dockerfile.cmake b/tensorflow/tools/ci_build/Dockerfile.cmake
index e8c3199..4587bcf 100644
--- a/tensorflow/tools/ci_build/Dockerfile.cmake
+++ b/tensorflow/tools/ci_build/Dockerfile.cmake
@@ -28,8 +28,8 @@
 RUN pip install --upgrade gast
 RUN pip install --upgrade numpy
 RUN pip install --upgrade termcolor
-RUN pip install keras_applications==1.0.2
-RUN pip install keras_preprocessing==1.0.1
+RUN pip install keras_applications==1.0.4
+RUN pip install keras_preprocessing==1.0.2
 
 # Install golang
 RUN apt-get install -t xenial-backports -y golang-1.9
diff --git a/tensorflow/tools/ci_build/Dockerfile.gpu.ppc64le b/tensorflow/tools/ci_build/Dockerfile.gpu.ppc64le
index a404f12..e026edb 100644
--- a/tensorflow/tools/ci_build/Dockerfile.gpu.ppc64le
+++ b/tensorflow/tools/ci_build/Dockerfile.gpu.ppc64le
@@ -26,3 +26,6 @@
 # Configure the build for our CUDA configuration.
 ENV TF_NEED_CUDA 1
 ENV TF_CUDA_COMPUTE_CAPABILITIES 3.0
+
+# TODO get NCCL 2 in the docker image
+ENV TF_NCCL_VERSION 1
diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh
index c3c5373..bb316ec 100755
--- a/tensorflow/tools/ci_build/install/install_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh
@@ -115,10 +115,10 @@
 pip3 install --upgrade setuptools==39.1.0
 
 # Keras
-pip2 install keras_applications==1.0.2
-pip3 install keras_applications==1.0.2
-pip2 install keras_preprocessing==1.0.1
-pip3 install keras_preprocessing==1.0.1
+pip2 install keras_applications==1.0.4 --no-deps
+pip3 install keras_applications==1.0.4 --no-deps
+pip2 install keras_preprocessing==1.0.2 --no-deps
+pip3 install keras_preprocessing==1.0.2 --no-deps
 
 # Install last working version of setuptools.
 pip2 install --upgrade setuptools==39.1.0
diff --git a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
index b6f5de5..15e4396 100755
--- a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
@@ -85,8 +85,8 @@
 pip3.5 install --upgrade setuptools==39.1.0
 
 # Keras
-pip3.5 install keras_applications==1.0.2
-pip3.5 install keras_preprocessing==1.0.1
+pip3.5 install keras_applications==1.0.4
+pip3.5 install keras_preprocessing==1.0.2
 
 # Install last working version of setuptools.
 pip3.5 install --upgrade setuptools==39.1.0
diff --git a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
index 8868664..0fc3eee 100755
--- a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
@@ -101,7 +101,7 @@
 pip3 install --upgrade setuptools==39.1.0
 
 # Keras
-pip3 install keras_applications==1.0.2
-pip3 install keras_preprocessing==1.0.1
+pip3 install keras_applications==1.0.4
+pip3 install keras_preprocessing==1.0.2
 
 # LINT.ThenChange(//tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh)
diff --git a/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh b/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh
index a1d91a6..b497326 100755
--- a/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh
+++ b/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh
@@ -57,6 +57,17 @@
   TF_BAZEL_BUILD_OPTIONS="${TF_BAZEL_BUILD_OPTIONS}" \
   ${WORKSPACE}/tensorflow/tools/docker/parameterized_docker_build.sh
 
+# build the python3.6 container and whl
+TF_DOCKER_BUILD_TYPE="MKL" \
+  TF_DOCKER_BUILD_IS_DEVEL="YES" \
+  TF_DOCKER_BUILD_DEVEL_BRANCH="${TF_DOCKER_BUILD_DEVEL_BRANCH}" \
+  TF_DOCKER_BUILD_IMAGE_NAME="${TF_DOCKER_BUILD_IMAGE_NAME}" \
+  TF_DOCKER_BUILD_VERSION="${TF_DOCKER_BUILD_VERSION}" \
+  TF_DOCKER_BUILD_PYTHON_VERSION="PYTHON3.6" \
+  TF_BAZEL_BUILD_OPTIONS="${TF_BAZEL_BUILD_OPTIONS}" \
+  ${WORKSPACE}/tensorflow/tools/docker/parameterized_docker_build.sh
+
+
 # Build containers for AVX2
 # Include the instructions for haswell and later, but tune for broadwell
 TF_BAZEL_BUILD_OPTIONS="--config=mkl --copt=-march=haswell --copt=-mtune=broadwell --copt=-O3 --cxxopt=-D_GLIBCXX_USE_CXX11_ABI=0"
@@ -80,3 +91,13 @@
   TF_BAZEL_BUILD_OPTIONS="${TF_BAZEL_BUILD_OPTIONS}" \
   ${WORKSPACE}/tensorflow/tools/docker/parameterized_docker_build.sh
 
+# build the python3.6 container and whl
+TF_DOCKER_BUILD_TYPE="MKL" \
+  TF_DOCKER_BUILD_IS_DEVEL="YES" \
+  TF_DOCKER_BUILD_DEVEL_BRANCH="${TF_DOCKER_BUILD_DEVEL_BRANCH}" \
+  TF_DOCKER_BUILD_IMAGE_NAME="${TF_DOCKER_BUILD_IMAGE_NAME}" \
+  TF_DOCKER_BUILD_VERSION="${TF_DOCKER_BUILD_VERSION}-avx2" \
+  TF_DOCKER_BUILD_PYTHON_VERSION="PYTHON3.6" \
+  TF_BAZEL_BUILD_OPTIONS="${TF_BAZEL_BUILD_OPTIONS}" \
+  ${WORKSPACE}/tensorflow/tools/docker/parameterized_docker_build.sh
+
diff --git a/tensorflow/tools/ci_build/linux/ppc64le/cpu/run_py2.sh b/tensorflow/tools/ci_build/linux/ppc64le/cpu/run_py2.sh
new file mode 100755
index 0000000..e13de35
--- /dev/null
+++ b/tensorflow/tools/ci_build/linux/ppc64le/cpu/run_py2.sh
@@ -0,0 +1,37 @@
+#!/usr/bin/env bash
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# ==============================================================================
+
+set -e
+set -x
+
+N_JOBS=$(grep -c ^processor /proc/cpuinfo)
+
+echo ""
+echo "Bazel will use ${N_JOBS} concurrent job(s)."
+echo ""
+
+# Run configure.
+export TF_NEED_CUDA=0
+export CC_OPT_FLAGS='-mcpu=power8 -mtune=power8'
+export PYTHON_BIN_PATH=`which python2`
+yes "" | $PYTHON_BIN_PATH configure.py
+
+# Run bazel test command. Double test timeouts to avoid flakes.
+bazel test --test_tag_filters=-no_oss,-oss_serial,-gpu,-benchmark-test -k \
+    --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 --build_tests_only --config=opt \
+    --test_output=errors --test_size_filters=small,medium -- \
+    //tensorflow/... -//tensorflow/compiler/...
diff --git a/tensorflow/tools/ci_build/linux/ppc64le/cpu/run_py3.sh b/tensorflow/tools/ci_build/linux/ppc64le/cpu/run_py3.sh
new file mode 100755
index 0000000..a04ac15
--- /dev/null
+++ b/tensorflow/tools/ci_build/linux/ppc64le/cpu/run_py3.sh
@@ -0,0 +1,37 @@
+#!/usr/bin/env bash
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# ==============================================================================
+
+set -e
+set -x
+
+N_JOBS=$(grep -c ^processor /proc/cpuinfo)
+
+echo ""
+echo "Bazel will use ${N_JOBS} concurrent job(s)."
+echo ""
+
+# Run configure.
+export TF_NEED_CUDA=0
+export CC_OPT_FLAGS='-mcpu=power8 -mtune=power8'
+export PYTHON_BIN_PATH=`which python3`
+yes "" | $PYTHON_BIN_PATH configure.py
+
+# Run bazel test command. Double test timeouts to avoid flakes.
+bazel test --test_tag_filters=-no_oss,-oss_serial,-gpu,-benchmark-test -k \
+    --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 --build_tests_only --config=opt \
+    --test_output=errors --test_size_filters=small,medium -- \
+    //tensorflow/... -//tensorflow/compiler/...
diff --git a/tensorflow/tools/ci_build/linux/ppc64le/gpu/run_py2.sh b/tensorflow/tools/ci_build/linux/ppc64le/gpu/run_py2.sh
new file mode 100755
index 0000000..77286e8
--- /dev/null
+++ b/tensorflow/tools/ci_build/linux/ppc64le/gpu/run_py2.sh
@@ -0,0 +1,44 @@
+#!/usr/bin/env bash
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# ==============================================================================
+
+set -e
+set -x
+
+N_JOBS=$(grep -c ^processor /proc/cpuinfo)
+LT_JOBS=$(nvidia-smi --query-gpu=gpu_name --format=csv,noheader | wc -l)
+
+echo ""
+echo "Bazel will use ${N_JOBS} concurrent job(s)."
+echo "Bazel will use ${LT_JOBS} local test job(s)."
+echo ""
+
+# Run configure.
+export PYTHON_BIN_PATH=`which python2`
+export CC_OPT_FLAGS='-mcpu=power8 -mtune=power8'
+
+export TF_NEED_CUDA=1
+export TF_CUDA_COMPUTE_CAPABILITIES=3.7
+
+yes "" | $PYTHON_BIN_PATH configure.py
+
+# Run bazel test command. Double test timeouts to avoid flakes.
+bazel test --config=cuda --test_tag_filters=-no_oss,-oss_serial,-no_gpu,-benchmark-test -k \
+    --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 \
+    --test_output=errors --local_test_jobs=${LT_JOBS} --build_tests_only --config=opt \
+    --test_size_filters=small,medium \
+    --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute -- \
+    //tensorflow/... -//tensorflow/compiler/...
diff --git a/tensorflow/tools/ci_build/linux/ppc64le/gpu/run_py3.sh b/tensorflow/tools/ci_build/linux/ppc64le/gpu/run_py3.sh
new file mode 100755
index 0000000..17aa52e
--- /dev/null
+++ b/tensorflow/tools/ci_build/linux/ppc64le/gpu/run_py3.sh
@@ -0,0 +1,44 @@
+#!/usr/bin/env bash
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# ==============================================================================
+
+set -e
+set -x
+
+N_JOBS=$(grep -c ^processor /proc/cpuinfo)
+LT_JOBS=$(nvidia-smi --query-gpu=gpu_name --format=csv,noheader | wc -l)
+
+echo ""
+echo "Bazel will use ${N_JOBS} concurrent job(s)."
+echo "Bazel will use ${LT_JOBS} local test job(s)."
+echo ""
+
+# Run configure.
+export PYTHON_BIN_PATH=`which python3`
+export CC_OPT_FLAGS='-mcpu=power8 -mtune=power8'
+
+export TF_NEED_CUDA=1
+export TF_CUDA_COMPUTE_CAPABILITIES=3.7
+
+yes "" | $PYTHON_BIN_PATH configure.py
+
+# Run bazel test command. Double test timeouts to avoid flakes.
+bazel test --config=cuda --test_tag_filters=-no_oss,-oss_serial,-no_gpu,-benchmark-test -k \
+    --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 \
+    --test_output=errors --local_test_jobs=${LT_JOBS} --build_tests_only --config=opt \
+    --test_size_filters=small,medium \
+    --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute -- \
+    //tensorflow/... -//tensorflow/compiler/...
diff --git a/tensorflow/tools/common/public_api.py b/tensorflow/tools/common/public_api.py
index b40e415..09933d2 100644
--- a/tensorflow/tools/common/public_api.py
+++ b/tensorflow/tools/common/public_api.py
@@ -70,6 +70,8 @@
         'tf.app': ['flags'],
         # Imported for compatibility between py2/3.
         'tf.test': ['mock'],
+        # Externalized modules of the Keras API.
+        'tf.keras': ['applications', 'preprocessing']
     }
 
   @property
diff --git a/tensorflow/tools/def_file_filter/def_file_filter.py.tpl b/tensorflow/tools/def_file_filter/def_file_filter.py.tpl
index 8bdc03e..4bfcc25 100644
--- a/tensorflow/tools/def_file_filter/def_file_filter.py.tpl
+++ b/tensorflow/tools/def_file_filter/def_file_filter.py.tpl
@@ -48,6 +48,7 @@
 INCLUDEPRE_RE = re.compile(r"google::protobuf::internal::ExplicitlyConstructed|"
                            r"google::protobuf::internal::ArenaImpl::AllocateAligned|" # for contrib/data/_prefetching_ops
                            r"google::protobuf::internal::ArenaImpl::AddCleanup|" # for contrib/data/_prefetching_ops
+                           r"google::protobuf::internal::LogMessage|" # for contrib/data/_prefetching_ops
                            r"google::protobuf::Arena::OnArenaAllocation|" # for contrib/data/_prefetching_ops
                            r"tensorflow::internal::LogMessage|"
                            r"tensorflow::internal::LogString|"
diff --git a/tensorflow/tools/def_file_filter/def_file_filter_configure.bzl b/tensorflow/tools/def_file_filter/def_file_filter_configure.bzl
index f8f63e2..df0fd05 100644
--- a/tensorflow/tools/def_file_filter/def_file_filter_configure.bzl
+++ b/tensorflow/tools/def_file_filter/def_file_filter_configure.bzl
@@ -24,27 +24,27 @@
 load("@bazel_tools//tools/cpp:lib_cc_configure.bzl", "auto_configure_fail")
 
 def _def_file_filter_configure_impl(repository_ctx):
-  if repository_ctx.os.name.lower().find("windows") == -1:
+    if repository_ctx.os.name.lower().find("windows") == -1:
+        repository_ctx.symlink(Label("//tensorflow/tools/def_file_filter:BUILD.tpl"), "BUILD")
+        repository_ctx.file("def_file_filter.py", "")
+        return
+    vc_path = find_vc_path(repository_ctx)
+    if vc_path == None:
+        auto_configure_fail("Visual C++ build tools not found on your machine")
+
+    undname = find_msvc_tool(repository_ctx, vc_path, "undname.exe")
+    if undname == None:
+        auto_configure_fail("Couldn't find undname.exe under %s, please check your VC installation and set BAZEL_VC environment variable correctly." % vc_path)
+    undname_bin_path = undname.replace("\\", "\\\\")
+
+    repository_ctx.template(
+        "def_file_filter.py",
+        Label("//tensorflow/tools/def_file_filter:def_file_filter.py.tpl"),
+        {
+            "%{undname_bin_path}": undname_bin_path,
+        },
+    )
     repository_ctx.symlink(Label("//tensorflow/tools/def_file_filter:BUILD.tpl"), "BUILD")
-    repository_ctx.file("def_file_filter.py", "")
-    return
-  vc_path = find_vc_path(repository_ctx)
-  if vc_path == "visual-studio-not-found":
-    auto_configure_fail("Visual C++ build tools not found on your machine")
-
-  undname = find_msvc_tool(repository_ctx, vc_path, "undname.exe")
-  if undname == None:
-    auto_configure_fail("Couldn't find undname.exe under %s, please check your VC installation and set BAZEL_VC environment variable correctly." % vc_path)
-  undname_bin_path = undname.replace("\\", "\\\\")
-
-  repository_ctx.template(
-    "def_file_filter.py",
-    Label("//tensorflow/tools/def_file_filter:def_file_filter.py.tpl"),
-    {
-      "%{undname_bin_path}": undname_bin_path,
-    })
-  repository_ctx.symlink(Label("//tensorflow/tools/def_file_filter:BUILD.tpl"), "BUILD")
-
 
 def_file_filter_configure = repository_rule(
     implementation = _def_file_filter_configure_impl,
@@ -55,6 +55,6 @@
         "VS100COMNTOOLS",
         "VS110COMNTOOLS",
         "VS120COMNTOOLS",
-        "VS140COMNTOOLS"
+        "VS140COMNTOOLS",
     ],
 )
diff --git a/tensorflow/tools/docker/Dockerfile b/tensorflow/tools/docker/Dockerfile
index bf06214..2c31d78 100644
--- a/tensorflow/tools/docker/Dockerfile
+++ b/tensorflow/tools/docker/Dockerfile
@@ -29,6 +29,8 @@
         h5py \
         ipykernel \
         jupyter \
+        keras_applications==1.0.4 \
+        keras_preprocessing==1.0.2 \
         matplotlib \
         numpy==1.14.5 \
         pandas \
diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel
index 6552588..bacdea7 100644
--- a/tensorflow/tools/docker/Dockerfile.devel
+++ b/tensorflow/tools/docker/Dockerfile.devel
@@ -33,6 +33,8 @@
         h5py \
         ipykernel \
         jupyter \
+        keras_applications==1.0.4 \
+        keras_preprocessing==1.0.2 \
         matplotlib \
         mock \
         numpy==1.14.5 \
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index f4c83f8..4f89e3f 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -49,6 +49,8 @@
         h5py \
         ipykernel \
         jupyter \
+        keras_applications==1.0.4 \
+        keras_preprocessing==1.0.2 \
         matplotlib \
         mock \
         numpy==1.14.5 \
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu-cuda9-cudnn7 b/tensorflow/tools/docker/Dockerfile.devel-gpu-cuda9-cudnn7
index 30bc2d2..056b475 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu-cuda9-cudnn7
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu-cuda9-cudnn7
@@ -37,6 +37,8 @@
 RUN pip --no-cache-dir install \
         ipykernel \
         jupyter \
+        keras_applications==1.0.4 \
+        keras_preprocessing==1.0.2 \
         matplotlib \
         numpy \
         scipy \
diff --git a/tensorflow/tools/docker/Dockerfile.devel-mkl b/tensorflow/tools/docker/Dockerfile.devel-mkl
index f0c7118..2df770e 100755
--- a/tensorflow/tools/docker/Dockerfile.devel-mkl
+++ b/tensorflow/tools/docker/Dockerfile.devel-mkl
@@ -18,18 +18,29 @@
         libhdf5-serial-dev \
         libpng12-dev \
         libzmq3-dev \
+        libssl-dev \
         pkg-config \
-        python-dev \
-        ${PYTHON3_DEV} \
         rsync \
         software-properties-common \
         unzip \
         zip \
         zlib1g-dev \
         openjdk-8-jdk \
-        openjdk-8-jre-headless \
-        && \
-    apt-get clean && \
+        openjdk-8-jre-headless
+
+#install Python 3
+RUN if [ ${PYTHON} = "python3.6" ]; then \
+      curl https://www.python.org/ftp/python/3.6.5/Python-3.6.5.tar.xz -o /opt/python.tar.xz && \
+      cd /opt && tar xvf python.tar.xz && \
+      cd /opt/*/ && ./configure && \
+      make && make install; \
+    else \
+      apt-get install -y --no-install-recommends \
+        python-dev \
+        ${PYTHON3_DEV}; \
+    fi
+
+RUN    apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
 RUN curl -fSsL -O https://bootstrap.pypa.io/get-pip.py && \
@@ -41,6 +52,8 @@
         h5py \
         ipykernel \
         jupyter \
+        keras_applications==1.0.4 \
+        keras_preprocessing==1.0.2 \
         matplotlib \
         mock \
         numpy \
@@ -51,7 +64,9 @@
     ${PYTHON} -m ipykernel.kernelspec
 
 RUN if [ "${PYTHON}" = "python3" ]; then \
-  ln -s -f /usr/bin/python3 /usr/bin/python; \
+      ln -s -f /usr/bin/python3 /usr/bin/python; \
+  elif [ "${PYTHON}" = "python3.6" ]; then \
+      ln -s -f /usr/local/bin/python3.6 /usr/bin/python; \
   fi
 
 # Set up our notebook config.
diff --git a/tensorflow/tools/docker/Dockerfile.devel-mkl-horovod b/tensorflow/tools/docker/Dockerfile.devel-mkl-horovod
new file mode 100755
index 0000000..ab2eec1
--- /dev/null
+++ b/tensorflow/tools/docker/Dockerfile.devel-mkl-horovod
@@ -0,0 +1,168 @@
+FROM ubuntu:16.04
+
+LABEL maintainer="Cong Xu <cong.xu@intel.com>"
+
+# These parameters can be overridden by parameterized_docker_build.sh
+ARG TF_BUILD_VERSION=r1.9
+ARG PYTHON="python"
+ARG PYTHON3_DEV=""
+ARG WHL_DIR="/tmp/pip"
+ARG PIP="pip"
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        curl \
+        git \
+        libcurl3-dev \
+        libfreetype6-dev \
+        libhdf5-serial-dev \
+        libpng12-dev \
+        libzmq3-dev \
+        pkg-config \
+        python-dev \
+        ${PYTHON3_DEV} \
+        rsync \
+        software-properties-common \
+        unzip \
+        zip \
+        zlib1g-dev \
+        openjdk-8-jdk \
+        openjdk-8-jre-headless \
+        wget \
+        numactl \
+        openssh-client \
+        openssh-server \
+        && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN curl -fSsL -O https://bootstrap.pypa.io/get-pip.py && \
+    ${PYTHON} get-pip.py && \
+    rm get-pip.py
+
+RUN ${PIP} --no-cache-dir install \
+        Pillow \
+        h5py \
+        ipykernel \
+        jupyter \
+        keras_applications==1.0.4 \
+        keras_preprocessing==1.0.2 \
+        matplotlib \
+        mock \
+        numpy \
+        scipy \
+        sklearn \
+        pandas \
+        && \
+    ${PYTHON} -m ipykernel.kernelspec
+
+RUN if [ "${PYTHON}" = "python3" ]; then \
+  ln -s -f /usr/bin/python3 /usr/bin/python; \
+  fi
+
+# Set up our notebook config.
+COPY jupyter_notebook_config.py /root/.jupyter/
+
+# Jupyter has issues with being run directly:
+#   https://github.com/ipython/ipython/issues/7062
+# We just add a little wrapper script.
+COPY run_jupyter.sh /
+
+# Set up Bazel.
+
+# Running bazel inside a `docker build` command causes trouble, cf:
+#   https://github.com/bazelbuild/bazel/issues/134
+# The easiest solution is to set up a bazelrc file forcing --batch.
+RUN echo "startup --batch" >>/etc/bazel.bazelrc
+# Similarly, we need to workaround sandboxing issues:
+#   https://github.com/bazelbuild/bazel/issues/418
+RUN echo "build --spawn_strategy=standalone --genrule_strategy=standalone" \
+    >>/etc/bazel.bazelrc
+# Install the most recent bazel release.
+ENV BAZEL_VERSION 0.15.0
+WORKDIR /
+RUN mkdir /bazel && \
+    cd /bazel && \
+    curl -H "User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36" -fSsL -O https://github.com/bazelbuild/bazel/releases/download/$BAZEL_VERSION/bazel-$BAZEL_VERSION-installer-linux-x86_64.sh && \
+    curl -H "User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36" -fSsL -o /bazel/LICENSE.txt https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE && \
+    chmod +x bazel-*.sh && \
+    ./bazel-$BAZEL_VERSION-installer-linux-x86_64.sh && \
+    cd / && \
+    rm -f /bazel/bazel-$BAZEL_VERSION-installer-linux-x86_64.sh
+
+# Download and build TensorFlow.
+WORKDIR /tensorflow
+
+# Download and build TensorFlow.
+# Enable checking out both tags and branches
+RUN export TAG_PREFIX="v" && \
+    echo ${TF_BUILD_VERSION} | grep -q ^${TAG_PREFIX}; \
+    if [ $? -eq 0 ]; then \
+        git clone --depth=1 https://github.com/tensorflow/tensorflow.git . && \
+        git fetch --tags && \
+        git checkout ${TF_BUILD_VERSION}; \
+   else \
+        git clone --depth=1 --branch=${TF_BUILD_VERSION} https://github.com/tensorflow/tensorflow.git . ; \
+    fi
+
+RUN yes "" | ${PYTHON} configure.py
+
+ENV CI_BUILD_PYTHON ${PYTHON}
+
+# Set bazel build parameters in .bazelrc in parameterized_docker_build.sh
+# Use --copt=-march values to get optimized builds appropriate for the hardware
+#   platform of your choice.
+# For ivy-bridge or sandy-bridge
+# --copt=-march="avx" \
+# For haswell, broadwell, or skylake
+# --copt=-march="avx2" \
+COPY .bazelrc /root/.bazelrc
+
+RUN tensorflow/tools/ci_build/builds/configured CPU \
+    bazel --bazelrc=/root/.bazelrc build -c opt \
+    tensorflow/tools/pip_package:build_pip_package && \
+    bazel-bin/tensorflow/tools/pip_package/build_pip_package "${WHL_DIR}" && \
+    ${PIP} --no-cache-dir install --upgrade "${WHL_DIR}"/tensorflow-*.whl && \
+    rm -rf /root/.cache
+# Clean up Bazel cache when done.
+
+WORKDIR /root
+
+# Install Open MPI
+RUN mkdir /tmp/openmpi && \
+    cd /tmp/openmpi && \
+    wget https://www.open-mpi.org/software/ompi/v3.0/downloads/openmpi-3.0.0.tar.gz && \
+    tar zxf openmpi-3.0.0.tar.gz && \
+    cd openmpi-3.0.0 && \
+    ./configure --enable-orterun-prefix-by-default && \
+    make -j $(nproc) all && \
+    make install && \
+    ldconfig && \
+    rm -rf /tmp/openmpi
+
+# Create a wrapper for OpenMPI to allow running as root by default
+RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real && \
+    echo '#!/bin/bash' > /usr/local/bin/mpirun && \
+    echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun && \
+    chmod a+x /usr/local/bin/mpirun
+
+# Configure OpenMPI to run good defaults:
+RUN echo "btl_tcp_if_exclude = lo,docker0" >> /usr/local/etc/openmpi-mca-params.conf
+
+# Install Horovod
+RUN ${PIP} install --no-cache-dir horovod
+
+# Install OpenSSH for MPI to communicate between containers
+RUN mkdir -p /var/run/sshd
+
+# Allow OpenSSH to talk to containers without asking for confirmation
+RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \
+    echo "    StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
+    mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
+
+# TensorBoard
+EXPOSE 6006
+# IPython
+EXPOSE 8888
+
+WORKDIR /root
diff --git a/tensorflow/tools/docker/Dockerfile.gpu b/tensorflow/tools/docker/Dockerfile.gpu
index 5ec1e60..aa0e0fa 100644
--- a/tensorflow/tools/docker/Dockerfile.gpu
+++ b/tensorflow/tools/docker/Dockerfile.gpu
@@ -37,6 +37,8 @@
         h5py \
         ipykernel \
         jupyter \
+        keras_applications==1.0.4 \
+        keras_preprocessing==1.0.2 \
         matplotlib \
         numpy==1.14.5 \
         pandas \
diff --git a/tensorflow/tools/docker/Dockerfile.mkl b/tensorflow/tools/docker/Dockerfile.mkl
index 139395d..6955330 100755
--- a/tensorflow/tools/docker/Dockerfile.mkl
+++ b/tensorflow/tools/docker/Dockerfile.mkl
@@ -20,7 +20,7 @@
         libpng12-dev \
         libzmq3-dev \
         pkg-config \
-        python \
+        ${PYTHON} \
         ${PYTHON_DEV} \
         rsync \
         software-properties-common \
@@ -30,7 +30,7 @@
     rm -rf /var/lib/apt/lists/*
 
 RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
-    python get-pip.py && \
+    ${PYTHON} get-pip.py && \
     rm get-pip.py
 
 RUN ${PIP} --no-cache-dir install \
@@ -38,13 +38,15 @@
         h5py \
         ipykernel \
         jupyter \
+        keras_applications==1.0.4 \
+        keras_preprocessing==1.0.2 \
         matplotlib \
         numpy \
         pandas \
         scipy \
         sklearn \
         && \
-    python -m ipykernel.kernelspec
+    ${PYTHON} -m ipykernel.kernelspec
 
 COPY ${TF_WHL_URL} /
 RUN ${PIP} install --no-cache-dir --force-reinstall /${TF_WHL_URL} && \
diff --git a/tensorflow/tools/docker/Dockerfile.mkl-horovod b/tensorflow/tools/docker/Dockerfile.mkl-horovod
new file mode 100755
index 0000000..756716e
--- /dev/null
+++ b/tensorflow/tools/docker/Dockerfile.mkl-horovod
@@ -0,0 +1,111 @@
+FROM ubuntu:16.04
+
+LABEL maintainer="Cong Xu <cong.xu@intel.com>"
+
+# This parameter MUST be set by parameterized_docker_build.sh
+ARG TF_WHL_URL
+
+# Optional parameters
+ARG TF_BUILD_VERSION=r1.9
+ARG PYTHON="python"
+ARG PYTHON_DEV="python-dev"
+ARG PIP="pip"
+
+# Pick up some TF dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        curl \
+        libfreetype6-dev \
+        libhdf5-serial-dev \
+        libpng12-dev \
+        libzmq3-dev \
+        pkg-config \
+        python \
+        ${PYTHON_DEV} \
+        rsync \
+        software-properties-common \
+        unzip \
+        && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
+    python get-pip.py && \
+    rm get-pip.py
+
+RUN ${PIP} --no-cache-dir install \
+        Pillow \
+        h5py \
+        ipykernel \
+        jupyter \
+        keras_applications==1.0.4 \
+        keras_preprocessing==1.0.2 \
+        matplotlib \
+        numpy \
+        pandas \
+        scipy \
+        sklearn \
+        && \
+    python -m ipykernel.kernelspec
+
+COPY ${TF_WHL_URL} /
+RUN ${PIP} install --no-cache-dir --force-reinstall /${TF_WHL_URL} && \
+    rm -rf /${TF_WHL_URL}
+
+RUN if [ "${PYTHON}" = "python3" ]; then \
+  ln -s -f /usr/bin/python3 /usr/bin/python; \
+  fi
+
+# Set up our notebook config.
+COPY jupyter_notebook_config.py /root/.jupyter/
+
+# Copy sample notebooks.
+COPY notebooks /notebooks
+
+# Jupyter has issues with being run directly:
+#   https://github.com/ipython/ipython/issues/7062
+# We just add a little wrapper script.
+COPY run_jupyter.sh /
+
+WORKDIR /root
+
+# Install Open MPI
+RUN mkdir /tmp/openmpi && \
+    cd /tmp/openmpi && \
+    wget https://www.open-mpi.org/software/ompi/v3.0/downloads/openmpi-3.0.0.tar.gz && \
+    tar zxf openmpi-3.0.0.tar.gz && \
+    cd openmpi-3.0.0 && \
+    ./configure --enable-orterun-prefix-by-default && \
+    make -j $(nproc) all && \
+    make install && \
+    ldconfig && \
+    rm -rf /tmp/openmpi
+
+# Create a wrapper for OpenMPI to allow running as root by default
+RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real && \
+    echo '#!/bin/bash' > /usr/local/bin/mpirun && \
+    echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun && \
+    chmod a+x /usr/local/bin/mpirun
+
+# Configure OpenMPI with good defaults:
+RUN echo "btl_tcp_if_exclude = lo,docker0" >> /usr/local/etc/openmpi-mca-params.conf
+
+# Install Horovod
+RUN ${PIP} install --no-cache-dir horovod
+
+# Install OpenSSH for MPI to communicate between containers
+RUN mkdir -p /var/run/sshd
+
+# Allow OpenSSH to talk to containers without asking for confirmation
+RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \
+    echo "    StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
+    mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
+
+# TensorBoard
+EXPOSE 6006
+# IPython
+EXPOSE 8888
+
+WORKDIR "/notebooks"
+
+CMD ["/run_jupyter.sh", "--allow-root"]
diff --git a/tensorflow/tools/docker/parameterized_docker_build.sh b/tensorflow/tools/docker/parameterized_docker_build.sh
index 4681c5f..448a3a7 100755
--- a/tensorflow/tools/docker/parameterized_docker_build.sh
+++ b/tensorflow/tools/docker/parameterized_docker_build.sh
@@ -19,8 +19,8 @@
 #   parameterized_docker_build.sh
 #
 # The script obeys the following environment variables:
-#   TF_DOCKER_BUILD_TYPE: (CPU | GPU | MKL)
-#     CPU, GPU, or MKL image
+#   TF_DOCKER_BUILD_TYPE: (CPU | GPU | MKL | MKL-HOROVOD)
+#     CPU, GPU, MKL or MKL-HOROVOD image
 #
 #   TF_DOCKER_BUILD_IS_DEVEL: (NO | YES)
 #     Is this developer image
@@ -169,6 +169,15 @@
   else
     ORIG_DOCKERFILE="${ORIG_DOCKERFILE}.mkl"
   fi
+elif [[ ${TF_DOCKER_BUILD_TYPE} == "mkl-horovod" ]]; then
+  DOCKER_BINARY="docker"
+  FINAL_TAG="${FINAL_TAG}-mkl-horovod"
+  if [[ ${ORIG_DOCKERFILE} == *"."* ]]; then
+    # There is already a dot in the tag, use "-"
+    ORIG_DOCKERFILE="${ORIG_DOCKERFILE}-mkl-horovod"
+  else
+    ORIG_DOCKERFILE="${ORIG_DOCKERFILE}.mkl-horovod"
+  fi
 elif   [[ ${TF_DOCKER_BUILD_TYPE} == "gpu" ]]; then
   DOCKER_BINARY="nvidia-docker"
 
@@ -188,6 +197,8 @@
   :
 elif [[ "${TF_DOCKER_BUILD_PYTHON_VERSION}" == "python3" ]]; then
   FINAL_TAG="${FINAL_TAG}-py3"
+elif [[ "${TF_DOCKER_BUILD_PYTHON_VERSION}" == "python3.6" ]]; then
+  FINAL_TAG="${FINAL_TAG}-py3.6"
 else
   die "Unrecognized value in TF_DOCKER_BUILD_PYTHON_VERSION: "\
 "${TF_DOCKER_BUILD_PYTHON_VERSION}"
@@ -227,6 +238,10 @@
       die "FAIL: Non-development MKL builds require a pre-built pip whl."
     fi
 
+    if [[ "${TF_DOCKER_BUILD_TYPE}" == "mkl-horovod" ]]; then
+      die "FAIL: Non-development MKL-HOROVOD builds require a pre-built pip whl."
+    fi
+
     if [[ "${TF_DOCKER_BUILD_TYPE}" == "gpu" ]]; then
       export TF_BUILD_APPEND_CI_DOCKER_EXTRA_PARAMS=\
   "${TF_BUILD_APPEND_CI_DOCKER_EXTRA_PARAMS} -e TF_CUDA_COMPUTE_CAPABILITIES=3.0,3.5,5.2"
@@ -279,7 +294,8 @@
     # Use string replacement to put the correct file name into the Dockerfile
     PIP_WHL=$(basename "${PIP_WHL}")
 
-    if [[ ${TF_DOCKER_BUILD_TYPE} == "mkl" ]]; then
+    if [[ ${TF_DOCKER_BUILD_TYPE} == "mkl" ]] || \
+        [[ ${TF_DOCKER_BUILD_TYPE} == "mkl-horovod" ]]; then
       TF_DOCKER_BUILD_ARGS+=("--build-arg TF_WHL_URL=${PIP_WHL}" )
       cp "${ORIG_DOCKERFILE}" "${DOCKERFILE}"
     else
@@ -295,7 +311,8 @@
     echo
   else
     echo "Downloading pip wheel from: ${TF_DOCKER_BUILD_CENTRAL_PIP}"
-    if [[ ${TF_DOCKER_BUILD_TYPE} == "mkl" ]]; then
+    if [[ ${TF_DOCKER_BUILD_TYPE} == "mkl" ]] || \
+        [[ ${TF_DOCKER_BUILD_TYPE} == "mkl-horovod" ]]; then
       pushd "${TMP_DIR}/"
       curl -O ${TF_DOCKER_BUILD_CENTRAL_PIP}
       popd
@@ -319,7 +336,8 @@
 
   # Modify python/pip version if necessary.
   if [[ "${TF_DOCKER_BUILD_PYTHON_VERSION}" == "python3" ]]; then
-    if [[ ${TF_DOCKER_BUILD_TYPE} == "mkl" ]]; then
+    if [[ ${TF_DOCKER_BUILD_TYPE} == "mkl" ]] || \
+          [[ ${TF_DOCKER_BUILD_TYPE} == "mkl-horovod" ]]; then
         TF_DOCKER_BUILD_ARGS+=("--build-arg PYTHON=${TF_DOCKER_BUILD_PYTHON_VERSION}")
         TF_DOCKER_BUILD_ARGS+=("--build-arg PYTHON_DEV=python3-dev")
         TF_DOCKER_BUILD_ARGS+=("--build-arg PIP=pip3")
@@ -340,8 +358,9 @@
 else # TF_DOCKER_BUILD_IS_DEVEL == 'yes'
   DOCKERFILE="${TMP_DIR}/Dockerfile"
 
-  # Set up Dockerfile ARGS for mkl build
-  if [[ ${TF_DOCKER_BUILD_TYPE} == "mkl" ]]; then
+  # Set up Dockerfile ARGS for mkl and mkl-horovod build
+  if [[ ${TF_DOCKER_BUILD_TYPE} == "mkl" ]] || \
+      [[ ${TF_DOCKER_BUILD_TYPE} == "mkl-horovod" ]]; then
     if [[ -z "${TF_BAZEL_BUILD_OPTIONS// }" ]]; then
       TF_BAZEL_BUILD_OPTIONS=("--config=mkl --copt=-mavx --cxxopt=-D_GLIBCXX_USE_CXX11_ABI=0")
     else
@@ -360,14 +379,17 @@
   fi
 
   # Modify python/pip version if necessary.
-  if [[ "${TF_DOCKER_BUILD_PYTHON_VERSION}" == "python3" ]]; then
-    if [[ ${TF_DOCKER_BUILD_TYPE} == "mkl" ]]; then
+  if [[ "${TF_DOCKER_BUILD_PYTHON_VERSION}" == "python3" ]] || [[ "${TF_DOCKER_BUILD_PYTHON_VERSION}" == "python3.6" ]]; then
+    if [[ ${TF_DOCKER_BUILD_TYPE} == "mkl" ]] || [[ ${TF_DOCKER_BUILD_TYPE} == "mkl-horovod" ]]; then
         TF_DOCKER_BUILD_ARGS+=("--build-arg PYTHON=${TF_DOCKER_BUILD_PYTHON_VERSION}")
         TF_DOCKER_BUILD_ARGS+=("--build-arg PYTHON3_DEV=python3-dev")
         TF_DOCKER_BUILD_ARGS+=("--build-arg WHL_DIR=/tmp/pip3")
         TF_DOCKER_BUILD_ARGS+=("--build-arg PIP=pip3")
         cp "${ORIG_DOCKERFILE}" "${DOCKERFILE}"
     else
+      if [[ "${TF_DOCKER_BUILD_PYTHON_VERSION}" == "python3.6" ]] && [[ "${TF_DOCKER_BUILD_TYPE}" != "mkl" ]]; then
+        die "Python 3.6 build only supported for MKL builds."
+      fi
       if sed -i -e 's/python-dev/python-dev python3-dev/g' "${DOCKERFILE}" && \
          sed -i -e 's/python /python3 /g' "${DOCKERFILE}" && \
          sed -i -e 's^/tmp/pip^/tmp/pip3^g' "${DOCKERFILE}" && \
diff --git a/tensorflow/tools/docs/BUILD b/tensorflow/tools/docs/BUILD
index cc7885a..4f7efe1 100644
--- a/tensorflow/tools/docs/BUILD
+++ b/tensorflow/tools/docs/BUILD
@@ -34,11 +34,29 @@
 )
 
 py_library(
+    name = "doc_controls",
+    srcs = ["doc_controls.py"],
+    srcs_version = "PY2AND3",
+)
+
+py_test(
+    name = "doc_controls_test",
+    size = "small",
+    srcs = ["doc_controls_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":doc_controls",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+py_library(
     name = "parser",
     srcs = ["parser.py"],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
+        ":doc_controls",
         "//tensorflow/python:platform",
         "//tensorflow/python:util",
         "@astor_archive//:astor",
@@ -68,6 +86,7 @@
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
+        ":doc_controls",
         ":doc_generator_visitor",
         ":parser",
         ":pretty_docs",
diff --git a/tensorflow/tools/docs/doc_controls.py b/tensorflow/tools/docs/doc_controls.py
new file mode 100644
index 0000000..5e52644
--- /dev/null
+++ b/tensorflow/tools/docs/doc_controls.py
@@ -0,0 +1,319 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Documentation control decorators."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+_DO_NOT_DOC = "_tf_docs_do_not_document"
+
+
+def do_not_generate_docs(obj):
+  """A decorator: Do not generate docs for this object.
+
+  For example the following classes:
+
+  ```
+  class Parent(object):
+    def method1(self):
+      pass
+    def method2(self):
+      pass
+
+  class Child(Parent):
+    def method1(self):
+      pass
+    def method2(self):
+      pass
+  ```
+
+  Produce the following api_docs:
+
+  ```
+  /Parent.md
+    # method1
+    # method2
+  /Child.md
+    # method1
+    # method2
+  ```
+
+  This decorator allows you to skip classes or methods:
+
+  ```
+  @do_not_generate_docs
+  class Parent(object):
+    def method1(self):
+      pass
+    def method2(self):
+      pass
+
+  class Child(Parent):
+    @do_not_generate_docs
+    def method1(self):
+      pass
+    def method2(self):
+      pass
+  ```
+
+  This will only produce the following docs:
+
+  ```
+  /Child.md
+    # method2
+  ```
+
+  Note: This is implemented by adding a hidden attribute on the object, so it
+  cannot be used on objects which do not allow new attributes to be added. So
+  this decorator must go *below* `@property`, `@classmethod`,
+  or `@staticmethod`:
+
+  ```
+  class Example(object):
+    @property
+    @do_not_generate_docs
+    def x(self):
+      return self._x
+  ```
+
+  Args:
+    obj: The object to hide from the generated docs.
+
+  Returns:
+    obj
+  """
+  setattr(obj, _DO_NOT_DOC, None)
+  return obj
+
+
+_DO_NOT_DOC_INHERITABLE = "_tf_docs_do_not_doc_inheritable"
+
+
+def do_not_doc_inheritable(obj):
+  """A decorator: Do not generate docs for this method.
+
+  This version of the decorator is "inherited" by subclasses. No docs will be
+  generated for the decorated method in any subclass. Even if the sub-class
+  overrides the method.
+
+  For example, to ensure that `method1` is **never documented** use this
+  decorator on the base-class:
+
+  ```
+  class Parent(object):
+    @do_not_doc_inheritable
+    def method1(self):
+      pass
+    def method2(self):
+      pass
+
+  class Child(Parent):
+    def method1(self):
+      pass
+    def method2(self):
+      pass
+  ```
+  This will produce the following docs:
+
+  ```
+  /Parent.md
+    # method2
+  /Child.md
+    # method2
+  ```
+
+  When generating docs for a class's attributes, the `__mro__` is searched and
+  the attribute will be skipped if this decorator is detected on the attribute
+  on any class in the `__mro__`.
+
+  Note: This is implemented by adding a hidden attribute on the object, so it
+  cannot be used on objects which do not allow new attributes to be added. So
+  this decorator must go *below* `@property`, `@classmethod`,
+  or `@staticmethod`:
+
+  ```
+  class Example(object):
+    @property
+    @do_not_doc_inheritable
+    def x(self):
+      return self._x
+  ```
+
+  Args:
+    obj: The class-attribute to hide from the generated docs.
+
+  Returns:
+    obj
+  """
+  setattr(obj, _DO_NOT_DOC_INHERITABLE, None)
+  return obj
+
+
+_FOR_SUBCLASS_IMPLEMENTERS = "_tf_docs_tools_for_subclass_implementers"
+
+
+def for_subclass_implementers(obj):
+  """A decorator: Only generate docs for this method in the defining class.
+
+  Also group this method's docs with any `@abstractmethod` in the class's docs.
+
+  No docs will be generated for this class attribute in sub-classes.
+
+  The canonical use case for this is `tf.keras.layers.Layer.call`: It's a
+  public method, essential for anyone implementing a subclass, but it should
+  never be called directly.
+
+  Works on method, or other class-attributes.
+
+  When generating docs for a class's attributes, the `__mro__` is searched and
+  the attribute will be skipped if this decorator is detected on the attribute
+  on any **parent** class in the `__mro__`.
+
+  For example:
+
+  ```
+  class Parent(object):
+    @for_subclass_implementers
+    def method1(self):
+      pass
+    def method2(self):
+      pass
+
+  class Child1(Parent):
+    def method1(self):
+      pass
+    def method2(self):
+      pass
+
+  class Child2(Parent):
+    def method1(self):
+      pass
+    def method2(self):
+      pass
+  ```
+
+  This will produce the following docs:
+
+  ```
+  /Parent.md
+    # method1
+    # method2
+  /Child1.md
+    # method2
+  /Child2.md
+    # method2
+  ```
+
+  Note: This is implemented by adding a hidden attribute on the object, so it
+  cannot be used on objects which do not allow new attributes to be added. So
+  this decorator must go *below* `@property`, `@classmethod`,
+  or `@staticmethod`:
+
+  ```
+  class Example(object):
+    @property
+    @for_subclass_implementers
+    def x(self):
+      return self._x
+  ```
+
+  Args:
+    obj: The class-attribute to hide from the generated docs.
+
+  Returns:
+    obj
+  """
+  setattr(obj, _FOR_SUBCLASS_IMPLEMENTERS, None)
+  return obj
+
+
+def should_skip(obj):
+  """Returns true if docs generation should be skipped for this object.
+
+  checks for the `do_not_generate_docs` or `do_not_doc_inheritable` decorators.
+
+  Args:
+    obj: The object to document, or skip.
+
+  Returns:
+    True if the object should be skipped
+  """
+  # Unwrap fget if the object is a property
+  if isinstance(obj, property):
+    obj = obj.fget
+
+  return hasattr(obj, _DO_NOT_DOC) or hasattr(obj, _DO_NOT_DOC_INHERITABLE)
+
+
+def should_skip_class_attr(cls, name):
+  """Returns true if docs should be skipped for this class attribute.
+
+  Args:
+    cls: The class the attribute belongs to.
+    name: The name of the attribute.
+
+  Returns:
+    True if the attribute should be skipped.
+  """
+  # Get the object with standard lookup, from the nearest
+  # defining parent.
+  try:
+    obj = getattr(cls, name)
+  except AttributeError:
+    # Avoid error caused by enum metaclasses in python3
+    if name in ("name", "value"):
+      return True
+    raise
+
+  # Unwrap fget if the object is a property
+  if isinstance(obj, property):
+    obj = obj.fget
+
+  # Skip if the object is decorated with `do_not_generate_docs` or
+  # `do_not_doc_inheritable`
+  if should_skip(obj):
+    return True
+
+  # Use __dict__ lookup to get the version defined in *this* class.
+  obj = cls.__dict__.get(name, None)
+  if isinstance(obj, property):
+    obj = obj.fget
+  if obj is not None:
+    # If not none, the object is defined in *this* class.
+    # Do not skip if decorated with `for_subclass_implementers`.
+    if hasattr(obj, _FOR_SUBCLASS_IMPLEMENTERS):
+      return False
+
+  # for each parent class
+  for parent in cls.__mro__[1:]:
+    obj = getattr(parent, name, None)
+
+    if obj is None:
+      continue
+
+    if isinstance(obj, property):
+      obj = obj.fget
+
+    # Skip if the parent's definition is decorated with `do_not_doc_inheritable`
+    # or `for_subclass_implementers`
+    if hasattr(obj, _DO_NOT_DOC_INHERITABLE):
+      return True
+
+    if hasattr(obj, _FOR_SUBCLASS_IMPLEMENTERS):
+      return True
+
+  # No blocking decorators --> don't skip
+  return False
diff --git a/tensorflow/tools/docs/doc_controls_test.py b/tensorflow/tools/docs/doc_controls_test.py
new file mode 100644
index 0000000..410342f
--- /dev/null
+++ b/tensorflow/tools/docs/doc_controls_test.py
@@ -0,0 +1,183 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for documentation control decorators."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.platform import googletest
+from tensorflow.tools.docs import doc_controls
+
+
+class DocControlsTest(googletest.TestCase):
+
+  def test_do_not_generate_docs(self):
+
+    @doc_controls.do_not_generate_docs
+    def dummy_function():
+      pass
+
+    self.assertTrue(doc_controls.should_skip(dummy_function))
+
+  def test_do_not_doc_on_method(self):
+    """The simple decorator is not aware of inheritance."""
+
+    class Parent(object):
+
+      @doc_controls.do_not_generate_docs
+      def my_method(self):
+        pass
+
+    class Child(Parent):
+
+      def my_method(self):
+        pass
+
+    class GrandChild(Child):
+      pass
+
+    self.assertTrue(doc_controls.should_skip(Parent.my_method))
+    self.assertFalse(doc_controls.should_skip(Child.my_method))
+    self.assertFalse(doc_controls.should_skip(GrandChild.my_method))
+
+    self.assertTrue(doc_controls.should_skip_class_attr(Parent, 'my_method'))
+    self.assertFalse(doc_controls.should_skip_class_attr(Child, 'my_method'))
+    self.assertFalse(
+        doc_controls.should_skip_class_attr(GrandChild, 'my_method'))
+
+  def test_do_not_doc_inheritable(self):
+
+    class Parent(object):
+
+      @doc_controls.do_not_doc_inheritable
+      def my_method(self):
+        pass
+
+    class Child(Parent):
+
+      def my_method(self):
+        pass
+
+    class GrandChild(Child):
+      pass
+
+    self.assertTrue(doc_controls.should_skip(Parent.my_method))
+    self.assertFalse(doc_controls.should_skip(Child.my_method))
+    self.assertFalse(doc_controls.should_skip(GrandChild.my_method))
+
+    self.assertTrue(doc_controls.should_skip_class_attr(Parent, 'my_method'))
+    self.assertTrue(doc_controls.should_skip_class_attr(Child, 'my_method'))
+    self.assertTrue(
+        doc_controls.should_skip_class_attr(GrandChild, 'my_method'))
+
+  def test_do_not_doc_inheritable_property(self):
+
+    class Parent(object):
+
+      @property
+      @doc_controls.do_not_doc_inheritable
+      def my_method(self):
+        pass
+
+    class Child(Parent):
+
+      @property
+      def my_method(self):
+        pass
+
+    class GrandChild(Child):
+      pass
+
+    self.assertTrue(doc_controls.should_skip(Parent.my_method))
+    self.assertFalse(doc_controls.should_skip(Child.my_method))
+    self.assertFalse(doc_controls.should_skip(GrandChild.my_method))
+
+    self.assertTrue(doc_controls.should_skip_class_attr(Parent, 'my_method'))
+    self.assertTrue(doc_controls.should_skip_class_attr(Child, 'my_method'))
+    self.assertTrue(
+        doc_controls.should_skip_class_attr(GrandChild, 'my_method'))
+
+  def test_do_not_doc_inheritable_staticmethod(self):
+
+    class GrandParent(object):
+
+      def my_method(self):
+        pass
+
+    class Parent(GrandParent):
+
+      @staticmethod
+      @doc_controls.do_not_doc_inheritable
+      def my_method():
+        pass
+
+    class Child(Parent):
+
+      @staticmethod
+      def my_method():
+        pass
+
+    class GrandChild(Child):
+      pass
+
+    self.assertFalse(doc_controls.should_skip(GrandParent.my_method))
+    self.assertTrue(doc_controls.should_skip(Parent.my_method))
+    self.assertFalse(doc_controls.should_skip(Child.my_method))
+    self.assertFalse(doc_controls.should_skip(GrandChild.my_method))
+
+    self.assertFalse(
+        doc_controls.should_skip_class_attr(GrandParent, 'my_method'))
+    self.assertTrue(doc_controls.should_skip_class_attr(Parent, 'my_method'))
+    self.assertTrue(doc_controls.should_skip_class_attr(Child, 'my_method'))
+    self.assertTrue(
+        doc_controls.should_skip_class_attr(GrandChild, 'my_method'))
+
+  def testfor_subclass_implementers(self):
+
+    class GrandParent(object):
+
+      def my_method(self):
+        pass
+
+    class Parent(GrandParent):
+
+      @doc_controls.for_subclass_implementers
+      def my_method(self):
+        pass
+
+    class Child(Parent):
+      pass
+
+    class GrandChild(Child):
+
+      def my_method(self):
+        pass
+
+    class Grand2Child(Child):
+      pass
+
+    self.assertFalse(
+        doc_controls.should_skip_class_attr(GrandParent, 'my_method'))
+    self.assertFalse(doc_controls.should_skip_class_attr(Parent, 'my_method'))
+    self.assertTrue(doc_controls.should_skip_class_attr(Child, 'my_method'))
+    self.assertTrue(
+        doc_controls.should_skip_class_attr(GrandChild, 'my_method'))
+    self.assertTrue(
+        doc_controls.should_skip_class_attr(Grand2Child, 'my_method'))
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/tools/docs/doc_generator_visitor.py b/tensorflow/tools/docs/doc_generator_visitor.py
index e5eaf8c..a66f3e4 100644
--- a/tensorflow/tools/docs/doc_generator_visitor.py
+++ b/tensorflow/tools/docs/doc_generator_visitor.py
@@ -269,7 +269,6 @@
         # Choose the master name with a lexical sort on the tuples returned by
         # by _score_name.
         master_name = min(names, key=self._score_name)
-        print(names, master_name)
 
       duplicates[master_name] = names
       for name in names:
diff --git a/tensorflow/tools/docs/generate_lib.py b/tensorflow/tools/docs/generate_lib.py
index 7c8dbd5..9387042 100644
--- a/tensorflow/tools/docs/generate_lib.py
+++ b/tensorflow/tools/docs/generate_lib.py
@@ -28,6 +28,7 @@
 from tensorflow.python.util import tf_inspect
 from tensorflow.tools.common import public_api
 from tensorflow.tools.common import traverse
+from tensorflow.tools.docs import doc_controls
 from tensorflow.tools.docs import doc_generator_visitor
 from tensorflow.tools.docs import parser
 from tensorflow.tools.docs import pretty_docs
@@ -96,7 +97,7 @@
   symbol_to_file = {}
 
   # Collect redirects for an api _redirects.yaml file.
-  redirects = ['redirects:\n']
+  redirects = []
 
   # Parse and write Markdown pages, resolving cross-links (@{symbol}).
   for full_name, py_object in six.iteritems(parser_config.index):
@@ -110,6 +111,9 @@
             _is_free_function(py_object, full_name, parser_config.index)):
       continue
 
+    if doc_controls.should_skip(py_object):
+      continue
+
     sitepath = os.path.join('api_docs/python',
                             parser.documentation_path(full_name)[:-3])
 
@@ -162,17 +166,20 @@
         continue
 
       duplicates = [item for item in duplicates if item != full_name]
-      template = ('- from: /{}\n'
-                  '  to: /{}\n')
+
       for dup in duplicates:
         from_path = os.path.join(site_api_path, dup.replace('.', '/'))
         to_path = os.path.join(site_api_path, full_name.replace('.', '/'))
-        redirects.append(
-            template.format(from_path, to_path))
+        redirects.append((from_path, to_path))
 
-  if site_api_path:
+  if site_api_path and redirects:
+    redirects = sorted(redirects)
+    template = ('- from: /{}\n'
+                '  to: /{}\n')
+    redirects = [template.format(f, t) for f, t in redirects]
     api_redirects_path = os.path.join(output_dir, '_redirects.yaml')
     with open(api_redirects_path, 'w') as redirect_file:
+      redirect_file.write('redirects:\n')
       redirect_file.write(''.join(redirects))
 
   if yaml_toc:
@@ -235,12 +242,16 @@
 
 # Exclude some libraries in contrib from the documentation altogether.
 def _get_default_private_map():
-  return {'tf.test': ['mock']}
+  return {
+      'tf.contrib.autograph': ['utils', 'operators'],
+      'tf.test': ['mock'],
+      'tf.compat': ['v1', 'v2'],
+  }
 
 
 # Exclude members of some libraries.
 def _get_default_do_not_descend_map():
-  # TODO(wicke): Shrink this list once the modules get sealed.
+  # TODO(markdaoust): Use docs_controls decorators, locally, instead.
   return {
       'tf': ['cli', 'lib', 'wrappers'],
       'tf.contrib': [
diff --git a/tensorflow/tools/docs/parser.py b/tensorflow/tools/docs/parser.py
index ffb9302..801c8bc 100644
--- a/tensorflow/tools/docs/parser.py
+++ b/tensorflow/tools/docs/parser.py
@@ -32,6 +32,7 @@
 from google.protobuf.message import Message as ProtoMessage
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import tf_inspect
+from tensorflow.tools.docs import doc_controls
 
 
 # A regular expression capturing a python identifier.
@@ -1175,15 +1176,18 @@
 
       # Don't document anything that is defined in object or by protobuf.
       defining_class = _get_defining_class(py_class, short_name)
-      if (defining_class is object or
-          defining_class is type or defining_class is tuple or
-          defining_class is BaseException or defining_class is Exception or
-          # The following condition excludes most protobuf-defined symbols.
-          defining_class and defining_class.__name__ in ['CMessage', 'Message',
-                                                         'MessageMeta']):
+      if defining_class in [object, type, tuple, BaseException, Exception]:
+        continue
+
+      # The following condition excludes most protobuf-defined symbols.
+      if (defining_class and
+          defining_class.__name__ in ['CMessage', 'Message', 'MessageMeta']):
         continue
       # TODO(markdaoust): Add a note in child docs showing the defining class.
 
+      if doc_controls.should_skip_class_attr(py_class, short_name):
+        continue
+
       child_doc = _parse_md_docstring(child, relative_path,
                                       parser_config.reference_resolver)
 
diff --git a/tensorflow/tools/docs/parser_test.py b/tensorflow/tools/docs/parser_test.py
index 274d48e..9f6b185 100644
--- a/tensorflow/tools/docs/parser_test.py
+++ b/tensorflow/tools/docs/parser_test.py
@@ -24,6 +24,7 @@
 
 from tensorflow.python.platform import googletest
 from tensorflow.python.util import tf_inspect
+from tensorflow.tools.docs import doc_controls
 from tensorflow.tools.docs import parser
 
 
@@ -37,13 +38,27 @@
   pass
 
 
-class TestClass(object):
+class ParentClass(object):
+
+  @doc_controls.do_not_doc_inheritable
+  def hidden_method(self):
+    pass
+
+
+class TestClass(ParentClass):
   """Docstring for TestClass itself."""
 
   def a_method(self, arg='default'):
     """Docstring for a method."""
     pass
 
+  def hidden_method(self):
+    pass
+
+  @doc_controls.do_not_generate_docs
+  def hidden_method2(self):
+    pass
+
   class ChildClass(object):
     """Docstring for a child class."""
     pass
@@ -175,6 +190,104 @@
     # Make sure this file is contained as the definition location.
     self.assertEqual(os.path.relpath(__file__, '/'), page_info.defined_in.path)
 
+  def test_docs_for_class_should_skip(self):
+
+    class Parent(object):
+
+      @doc_controls.do_not_doc_inheritable
+      def a_method(self, arg='default'):
+        pass
+
+    class Child(Parent):
+
+      def a_method(self, arg='default'):
+        pass
+
+    index = {
+        'Child': Child,
+        'Child.a_method': Child.a_method,
+    }
+
+    visitor = DummyVisitor(index=index, duplicate_of={})
+
+    reference_resolver = parser.ReferenceResolver.from_visitor(
+        visitor=visitor, doc_index={}, py_module_names=['tf'])
+
+    tree = {
+        'Child': ['a_method'],
+    }
+
+    parser_config = parser.ParserConfig(
+        reference_resolver=reference_resolver,
+        duplicates={},
+        duplicate_of={},
+        tree=tree,
+        index=index,
+        reverse_index={},
+        guide_index={},
+        base_dir='/')
+
+    page_info = parser.docs_for_object(
+        full_name='Child', py_object=Child, parser_config=parser_config)
+
+    # Make sure the `a_method` is not present
+    self.assertEqual(0, len(page_info.methods))
+
+  def test_docs_for_message_class(self):
+
+    class CMessage(object):
+
+      def hidden(self):
+        pass
+
+    class Message(object):
+
+      def hidden2(self):
+        pass
+
+    class MessageMeta(object):
+
+      def hidden3(self):
+        pass
+
+    class ChildMessage(CMessage, Message, MessageMeta):
+
+      def my_method(self):
+        pass
+
+    index = {
+        'ChildMessage': ChildMessage,
+        'ChildMessage.hidden': ChildMessage.hidden,
+        'ChildMessage.hidden2': ChildMessage.hidden2,
+        'ChildMessage.hidden3': ChildMessage.hidden3,
+        'ChildMessage.my_method': ChildMessage.my_method,
+    }
+
+    visitor = DummyVisitor(index=index, duplicate_of={})
+
+    reference_resolver = parser.ReferenceResolver.from_visitor(
+        visitor=visitor, doc_index={}, py_module_names=['tf'])
+
+    tree = {'ChildMessage': ['hidden', 'hidden2', 'hidden3', 'my_method']}
+
+    parser_config = parser.ParserConfig(
+        reference_resolver=reference_resolver,
+        duplicates={},
+        duplicate_of={},
+        tree=tree,
+        index=index,
+        reverse_index={},
+        guide_index={},
+        base_dir='/')
+
+    page_info = parser.docs_for_object(
+        full_name='ChildMessage',
+        py_object=ChildMessage,
+        parser_config=parser_config)
+
+    self.assertEqual(1, len(page_info.methods))
+    self.assertEqual('my_method', page_info.methods[0].short_name)
+
   def test_docs_for_module(self):
     # Get the current module.
     module = sys.modules[__name__]
diff --git a/tensorflow/tools/lib_package/BUILD b/tensorflow/tools/lib_package/BUILD
index 44d8a37..b450bc4 100644
--- a/tensorflow/tools/lib_package/BUILD
+++ b/tensorflow/tools/lib_package/BUILD
@@ -4,7 +4,9 @@
 package(default_visibility = ["//visibility:private"])
 
 load("@bazel_tools//tools/build_defs/pkg:pkg.bzl", "pkg_tar")
+load("@local_config_syslibs//:build_defs.bzl", "if_not_system_lib")
 load("//tensorflow:tensorflow.bzl", "tf_binary_additional_srcs")
+load("//tensorflow:tensorflow.bzl", "if_cuda")
 load("//third_party/mkl:build_defs.bzl", "if_mkl")
 
 genrule(
@@ -113,11 +115,8 @@
         "//third_party/hadoop:LICENSE.txt",
         "//third_party/eigen3:LICENSE",
         "//third_party/fft2d:LICENSE",
-        "@aws//:LICENSE",
         "@boringssl//:LICENSE",
-        "@com_github_googlecloudplatform_google_cloud_cpp//:LICENSE",
         "@com_googlesource_code_re2//:LICENSE",
-        "@cub_archive//:LICENSE.TXT",
         "@curl//:COPYING",
         "@double_conversion//:LICENSE",
         "@eigen_archive//:COPYING.MPL2",
@@ -125,13 +124,8 @@
         "@fft2d//:fft/readme.txt",
         "@gemmlowp//:LICENSE",
         "@gif_archive//:COPYING",
-        "@grpc//:LICENSE",
-        "@grpc//third_party/address_sorting:LICENSE",
-        "@grpc//third_party/nanopb:LICENSE.txt",
         "@highwayhash//:LICENSE",
-        "@jemalloc//:COPYING",
         "@jpeg//:LICENSE.md",
-        "@libxsmm_archive//:LICENSE.md",
         "@llvm//:LICENSE.TXT",
         "@lmdb//:LICENSE",
         "@local_config_sycl//sycl:LICENSE.text",
@@ -141,10 +135,42 @@
         "@protobuf_archive//:LICENSE",
         "@snappy//:COPYING",
         "@zlib_archive//:zlib.h",
-    ] + if_mkl([
+    ] + select({
+        "//tensorflow:with_aws_support": [
+            "@aws//:LICENSE",
+        ],
+        "//conditions:default": [],
+    }) + select({
+        "//tensorflow:with_gcp_support": [
+            "@com_github_googlecloudplatform_google_cloud_cpp//:LICENSE",
+        ],
+        "//conditions:default": [],
+    }) + select({
+        "//tensorflow:with_jemalloc_linux_x86_64": [
+            "@jemalloc//:COPYING",
+        ],
+        "//tensorflow:with_jemalloc_linux_ppc64le": [
+            "@jemalloc//:COPYING",
+        ],
+        "//conditions:default": [],
+    }) + select({
+        "//tensorflow/core/kernels:xsmm": [
+            "@libxsmm_archive//:LICENSE.md",
+        ],
+        "//conditions:default": [],
+    }) + if_cuda([
+        "@cub_archive//:LICENSE.TXT",
+    ]) + if_mkl([
         "//third_party/mkl:LICENSE",
         "//third_party/mkl_dnn:LICENSE",
-    ]),
+    ]) + if_not_system_lib(
+        "grpc",
+        [
+            "@grpc//:LICENSE",
+            "@grpc//third_party/nanopb:LICENSE.txt",
+            "@grpc//third_party/address_sorting:LICENSE",
+        ],
+    ),
     outs = ["include/tensorflow/c/LICENSE"],
     cmd = "$(location :concat_licenses.sh) $(SRCS) >$@",
     tools = [":concat_licenses.sh"],
@@ -156,11 +182,8 @@
         "//third_party/hadoop:LICENSE.txt",
         "//third_party/eigen3:LICENSE",
         "//third_party/fft2d:LICENSE",
-        "@aws//:LICENSE",
         "@boringssl//:LICENSE",
-        "@com_github_googlecloudplatform_google_cloud_cpp//:LICENSE",
         "@com_googlesource_code_re2//:LICENSE",
-        "@cub_archive//:LICENSE.TXT",
         "@curl//:COPYING",
         "@double_conversion//:LICENSE",
         "@eigen_archive//:COPYING.MPL2",
@@ -169,9 +192,7 @@
         "@gemmlowp//:LICENSE",
         "@gif_archive//:COPYING",
         "@highwayhash//:LICENSE",
-        "@jemalloc//:COPYING",
         "@jpeg//:LICENSE.md",
-        "@libxsmm_archive//:LICENSE.md",
         "@llvm//:LICENSE.TXT",
         "@lmdb//:LICENSE",
         "@local_config_sycl//sycl:LICENSE.text",
@@ -181,7 +202,32 @@
         "@protobuf_archive//:LICENSE",
         "@snappy//:COPYING",
         "@zlib_archive//:zlib.h",
-    ] + if_mkl([
+    ] + select({
+        "//tensorflow:with_aws_support": [
+            "@aws//:LICENSE",
+        ],
+        "//conditions:default": [],
+    }) + select({
+        "//tensorflow:with_gcp_support": [
+            "@com_github_googlecloudplatform_google_cloud_cpp//:LICENSE",
+        ],
+        "//conditions:default": [],
+    }) + select({
+        "//tensorflow:with_jemalloc_linux_x86_64": [
+            "@jemalloc//:COPYING",
+        ],
+        "//tensorflow:with_jemalloc_linux_ppc64le": [
+            "@jemalloc//:COPYING",
+        ],
+        "//conditions:default": [],
+    }) + select({
+        "//tensorflow/core/kernels:xsmm": [
+            "@libxsmm_archive//:LICENSE.md",
+        ],
+        "//conditions:default": [],
+    }) + if_cuda([
+        "@cub_archive//:LICENSE.TXT",
+    ]) + if_mkl([
         "//third_party/mkl:LICENSE",
         "//third_party/mkl_dnn:LICENSE",
     ]),
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index 06ee230..00c1337 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -9,7 +9,7 @@
     "if_windows",
     "transitive_hdrs",
 )
-load("//third_party/mkl:build_defs.bzl", "if_mkl")
+load("//third_party/mkl:build_defs.bzl", "if_mkl", "if_mkl_ml")
 load("//tensorflow:tensorflow.bzl", "if_cuda")
 load("@local_config_syslibs//:build_defs.bzl", "if_not_system_lib")
 load("//tensorflow/core:platform/default/build_config_root.bzl", "tf_additional_license_deps")
@@ -131,13 +131,9 @@
         "@absl_py//absl/flags:LICENSE",
         "@arm_neon_2_x86_sse//:LICENSE",
         "@astor_archive//:LICENSE",
-        "@aws//:LICENSE",
         "@boringssl//:LICENSE",
-        "@com_github_googleapis_googleapis//:LICENSE",
-        "@com_github_googlecloudplatform_google_cloud_cpp//:LICENSE",
         "@com_google_absl//:LICENSE",
         "@com_googlesource_code_re2//:LICENSE",
-        "@cub_archive//:LICENSE.TXT",
         "@curl//:COPYING",
         "@double_conversion//:LICENSE",
         "@eigen_archive//:COPYING.MPL2",
@@ -148,12 +144,8 @@
         "@gemmlowp//:LICENSE",
         "@gif_archive//:COPYING",
         "@highwayhash//:LICENSE",
-        "@jemalloc//:COPYING",
         "@jpeg//:LICENSE.md",
-        "@kafka//:LICENSE",
-        "@libxsmm_archive//:LICENSE.md",
         "@lmdb//:LICENSE",
-        "@local_config_nccl//:LICENSE",
         "@local_config_sycl//sycl:LICENSE.text",
         "@nasm//:LICENSE",
         "@nsync//:LICENSE",
@@ -166,7 +158,39 @@
         "@termcolor_archive//:COPYING.txt",
         "@zlib_archive//:zlib.h",
         "@org_python_pypi_backports_weakref//:LICENSE",
-    ] + if_mkl([
+    ] + select({
+        "//tensorflow:with_aws_support": [
+            "@aws//:LICENSE",
+        ],
+        "//conditions:default": [],
+    }) + select({
+        "//tensorflow:with_gcp_support": [
+            "@com_github_googleapis_googleapis//:LICENSE",
+            "@com_github_googlecloudplatform_google_cloud_cpp//:LICENSE",
+        ],
+        "//conditions:default": [],
+    }) + select({
+        "//tensorflow:with_jemalloc_linux_x86_64": [
+            "@jemalloc//:COPYING",
+        ],
+        "//tensorflow:with_jemalloc_linux_ppc64le": [
+            "@jemalloc//:COPYING",
+        ],
+        "//conditions:default": [],
+    }) + select({
+        "//tensorflow:with_kafka_support": [
+            "@kafka//:LICENSE",
+        ],
+        "//conditions:default": [],
+    }) + select({
+        "//tensorflow/core/kernels:xsmm": [
+            "@libxsmm_archive//:LICENSE.md",
+        ],
+        "//conditions:default": [],
+    }) + if_cuda([
+        "@cub_archive//:LICENSE.TXT",
+        "@local_config_nccl//:LICENSE",
+    ]) + if_mkl([
         "//third_party/mkl:LICENSE",
         "//third_party/mkl_dnn:LICENSE",
     ]) + if_not_system_lib(
@@ -184,14 +208,13 @@
     srcs = ["build_pip_package.sh"],
     data = select({
         "//tensorflow:windows": [":simple_console_for_windows"],
-        "//tensorflow:windows_msvc": [":simple_console_for_windows"],
         "//conditions:default": COMMON_PIP_DEPS + [
             ":simple_console",
             "//tensorflow/contrib/lite/python:interpreter_test_data",
             "//tensorflow/contrib/lite/python:tflite_convert",
             "//tensorflow/contrib/lite/toco/python:toco_from_protos",
         ],
-    }) + if_mkl(["//third_party/mkl:intel_binary_blob"]),
+    }) + if_mkl_ml(["//third_party/mkl:intel_binary_blob"]),
 )
 
 # A genrule for generating a marker file for the pip package on Windows
diff --git a/tensorflow/tools/pip_package/build_pip_package.sh b/tensorflow/tools/pip_package/build_pip_package.sh
index ca40f2e..666ea75 100755
--- a/tensorflow/tools/pip_package/build_pip_package.sh
+++ b/tensorflow/tools/pip_package/build_pip_package.sh
@@ -44,7 +44,7 @@
 PLATFORM="$(uname -s | tr 'A-Z' 'a-z')"
 function is_windows() {
   # On windows, the shell script is actually running in msys
-  if [[ "${PLATFORM}" =~ msys_nt* ]]; then
+  if [[ "${PLATFORM}" =~ (mingw64|msys)_nt* ]]; then
     true
   else
     false
diff --git a/tensorflow/tools/pip_package/pip_smoke_test.py b/tensorflow/tools/pip_package/pip_smoke_test.py
index 401f833..bfc007b 100644
--- a/tensorflow/tools/pip_package/pip_smoke_test.py
+++ b/tensorflow/tools/pip_package/pip_smoke_test.py
@@ -90,6 +90,7 @@
     "//tensorflow/contrib/lite/python:interpreter.py",
     "//tensorflow/contrib/lite/python:interpreter_test.py",
     "//tensorflow/contrib/ffmpeg:test_data",
+    "//tensorflow/contrib/hadoop:test_data",
     "//tensorflow/contrib/factorization/examples:mnist",
     "//tensorflow/contrib/factorization/examples:mnist.py",
     "//tensorflow/contrib/factorization:factorization_py_CYCLIC_DEPENDENCIES_THAT_NEED_TO_GO",  # pylint:disable=line-too-long
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 085f3dd..5e17907 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -45,12 +45,14 @@
 # This version string is semver compatible, but incompatible with pip.
 # For pip, we will remove all '-' characters from this string, and use the
 # result for pip.
-_VERSION = '1.10.0-rc1'
+_VERSION = '1.10.0'
 
 REQUIRED_PACKAGES = [
     'absl-py >= 0.1.6',
     'astor >= 0.6.0',
     'gast >= 0.2.0',
+    'keras_applications == 1.0.4',
+    'keras_preprocessing == 1.0.2',
     'numpy >= 1.13.3, <= 1.14.5',
     'six >= 1.10.0',
     'protobuf >= 3.6.0',
diff --git a/tensorflow/tools/proto_text/BUILD b/tensorflow/tools/proto_text/BUILD
index 31e8fb9..b4b70e0 100644
--- a/tensorflow/tools/proto_text/BUILD
+++ b/tensorflow/tools/proto_text/BUILD
@@ -39,6 +39,7 @@
         ":gen_proto_text_functions_lib",
         "@protobuf_archive//:protobuf",
         "//tensorflow/core:lib_proto_parsing",
+        "//tensorflow/core:lib_proto_compiler",
     ] + if_ios(["//tensorflow/core/platform/default/build_config:logging"]),
 )
 
@@ -49,7 +50,6 @@
     copts = if_ios(["-DGOOGLE_LOGGING"]),
     linkopts = select({
         "//tensorflow:windows": [],
-        "//tensorflow:windows_msvc": [],
         "//tensorflow:darwin": [
             "-lm",
             "-lpthread",
diff --git a/tensorflow/tools/proto_text/gen_proto_text_functions.cc b/tensorflow/tools/proto_text/gen_proto_text_functions.cc
index 234afe8..159976f 100644
--- a/tensorflow/tools/proto_text/gen_proto_text_functions.cc
+++ b/tensorflow/tools/proto_text/gen_proto_text_functions.cc
@@ -18,6 +18,7 @@
 
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/protobuf_compiler.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/tools/proto_text/gen_proto_text_functions_lib.h"
 
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 5ad05b2..1847335 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -19,6 +19,10 @@
     "//tensorflow/tools/def_file_filter:def_file_filter_configure.bzl",
     "def_file_filter_configure",
 )
+load("//third_party/flatbuffers:workspace.bzl", flatbuffers = "repo")
+
+def initialize_third_party():
+    flatbuffers()
 
 # Sanitize a dependency so that it works correctly from code that includes
 # TensorFlow as a submodule.
@@ -40,6 +44,8 @@
     syslibs_configure(name = "local_config_syslibs")
     python_configure(name = "local_config_python")
 
+    initialize_third_party()
+
     # For windows bazel build
     # TODO: Remove def file filter when TensorFlow can export symbols properly on Windows.
     def_file_filter_configure(name = "local_config_def_file_filter")
@@ -157,11 +163,11 @@
     tf_http_archive(
         name = "com_googlesource_code_re2",
         urls = [
-            "https://mirror.bazel.build/github.com/google/re2/archive/2018-04-01.tar.gz",
-            "https://github.com/google/re2/archive/2018-04-01.tar.gz",
+            "https://mirror.bazel.build/github.com/google/re2/archive/2018-07-01.tar.gz",
+            "https://github.com/google/re2/archive/2018-07-01.tar.gz",
         ],
-        sha256 = "2f945446b71336e7f5a2bcace1abcf0b23fbba368266c6a1be33de3de3b3c912",
-        strip_prefix = "re2-2018-04-01",
+        sha256 = "803c7811146edeef8f91064de37c6f19136ff01a2a8cdb3230e940b2fd9f07fe",
+        strip_prefix = "re2-2018-07-01",
         system_build_file = clean_dep("//third_party/systemlibs:re2.BUILD"),
     )
 
@@ -405,11 +411,11 @@
     tf_http_archive(
         name = "com_google_googletest",
         urls = [
-            "https://mirror.bazel.build/github.com/google/googletest/archive/9816b96a6ddc0430671693df90192bbee57108b6.zip",
-            "https://github.com/google/googletest/archive/9816b96a6ddc0430671693df90192bbee57108b6.zip",
+            "https://mirror.bazel.build/github.com/google/googletest/archive/997d343dd680e541ef96ce71ee54a91daf2577a0.zip",
+            "https://github.com/google/googletest/archive/997d343dd680e541ef96ce71ee54a91daf2577a0.zip",
         ],
-        sha256 = "9cbca84c4256bed17df2c8f4d00c912c19d247c11c9ba6647cd6dd5b5c996b8d",
-        strip_prefix = "googletest-9816b96a6ddc0430671693df90192bbee57108b6",
+        sha256 = "353ab86e35cea1cd386115279cf4b16695bbf21b897bfbf2721cf4cb5f64ade8",
+        strip_prefix = "googletest-997d343dd680e541ef96ce71ee54a91daf2577a0",
     )
 
     tf_http_archive(
@@ -486,11 +492,11 @@
     tf_http_archive(
         name = "llvm",
         urls = [
-            "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/5aa74422b69e309587c4e60e98649fb8a027d260.tar.gz",
-            "https://github.com/llvm-mirror/llvm/archive/5aa74422b69e309587c4e60e98649fb8a027d260.tar.gz",
+            "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/6203c9bd082a877a20c218033636712135a3c2db.tar.gz",
+            "https://github.com/llvm-mirror/llvm/archive/6203c9bd082a877a20c218033636712135a3c2db.tar.gz",
         ],
-        sha256 = "23371dc9cc589c2226780361012547a49c1125db6f755731216887238fb4738e",
-        strip_prefix = "llvm-5aa74422b69e309587c4e60e98649fb8a027d260",
+        sha256 = "83a80f9fb2a5949ca77e526344cbd4581388c3ec7fea5c59e488d46fd38e06d9",
+        strip_prefix = "llvm-6203c9bd082a877a20c218033636712135a3c2db",
         build_file = clean_dep("//third_party/llvm:llvm.autogenerated.BUILD"),
     )
 
@@ -733,18 +739,6 @@
         build_file = clean_dep("//third_party:arm_neon_2_x86_sse.BUILD"),
     )
 
-    tf_http_archive(
-        name = "flatbuffers",
-        strip_prefix = "flatbuffers-1.9.0",
-        sha256 = "5ca5491e4260cacae30f1a5786d109230db3f3a6e5a0eb45d0d0608293d247e3",
-        urls = [
-            "https://mirror.bazel.build/github.com/google/flatbuffers/archive/v1.9.0.tar.gz",
-            "https://github.com/google/flatbuffers/archive/v1.9.0.tar.gz",
-        ],
-        build_file = clean_dep("//third_party/flatbuffers:flatbuffers.BUILD"),
-        system_build_file = clean_dep("//third_party/systemlibs:flatbuffers.BUILD"),
-    )
-
     native.new_http_archive(
         name = "double_conversion",
         urls = [
diff --git a/third_party/curl.BUILD b/third_party/curl.BUILD
index 1638b72..c93fac6 100644
--- a/third_party/curl.BUILD
+++ b/third_party/curl.BUILD
@@ -243,7 +243,6 @@
             "lib/vtls/darwinssl.c",
         ],
         "@org_tensorflow//tensorflow:windows": CURL_WIN_SRCS,
-        "@org_tensorflow//tensorflow:windows_msvc": CURL_WIN_SRCS,
         "//conditions:default": [
             "lib/vtls/openssl.c",
         ],
@@ -260,7 +259,6 @@
     ],
     copts = select({
         "@org_tensorflow//tensorflow:windows": CURL_WIN_COPTS,
-        "@org_tensorflow//tensorflow:windows_msvc": CURL_WIN_COPTS,
         "//conditions:default": [
             "-Iexternal/curl/lib",
             "-D_GNU_SOURCE",
@@ -280,10 +278,6 @@
             # See curl.h for discussion of write size and Windows
             "/DCURL_MAX_WRITE_SIZE=16384",
         ],
-        "@org_tensorflow//tensorflow:windows_msvc": [
-            # See curl.h for discussion of write size and Windows
-            "/DCURL_MAX_WRITE_SIZE=16384",
-        ],
         "//conditions:default": [
             "-DCURL_MAX_WRITE_SIZE=65536",
         ],
@@ -307,12 +301,6 @@
             "-DEFAULTLIB:crypt32.lib",
             "-DEFAULTLIB:Normaliz.lib",
         ],
-        "@org_tensorflow//tensorflow:windows_msvc": [
-            "-DEFAULTLIB:ws2_32.lib",
-            "-DEFAULTLIB:advapi32.lib",
-            "-DEFAULTLIB:crypt32.lib",
-            "-DEFAULTLIB:Normaliz.lib",
-        ],
         "//conditions:default": [
             "-lrt",
         ],
@@ -323,7 +311,6 @@
     ] + select({
         "@org_tensorflow//tensorflow:ios": [],
         "@org_tensorflow//tensorflow:windows": [],
-        "@org_tensorflow//tensorflow:windows_msvc": [],
         "//conditions:default": [
             "@boringssl//:ssl",
         ],
@@ -426,7 +413,6 @@
     ],
     copts = select({
         "@org_tensorflow//tensorflow:windows": CURL_BIN_WIN_COPTS,
-        "@org_tensorflow//tensorflow:windows_msvc": CURL_BIN_WIN_COPTS,
         "//conditions:default": [
             "-Iexternal/curl/lib",
             "-D_GNU_SOURCE",
diff --git a/third_party/double_conversion.BUILD b/third_party/double_conversion.BUILD
index 9f90521..d875a1a 100644
--- a/third_party/double_conversion.BUILD
+++ b/third_party/double_conversion.BUILD
@@ -4,6 +4,11 @@
 
 exports_files(["LICENSE"])
 
+config_setting(
+    name = "windows",
+    values = {"cpu": "x64_windows"},
+)
+
 cc_library(
     name = "double-conversion",
     srcs = [
@@ -28,11 +33,10 @@
         "double-conversion/ieee.h",
         "double-conversion/strtod.h",
     ],
-    includes = [
-        ".",
-    ],
-    linkopts = [
-        "-lm",
-    ],
+    includes = ["."],
+    linkopts = select({
+        ":windows": [],
+        "//conditions:default": ["-lm"],
+    }),
     visibility = ["//visibility:public"],
 )
diff --git a/third_party/farmhash.BUILD b/third_party/farmhash.BUILD
index a51e151..4b84646 100644
--- a/third_party/farmhash.BUILD
+++ b/third_party/farmhash.BUILD
@@ -3,13 +3,6 @@
 exports_files(["COPYING"])
 
 config_setting(
-    name = "windows_msvc",
-    values = {
-        "cpu": "x64_windows_msvc",
-    },
-)
-
-config_setting(
     name = "windows",
     values = {
         "cpu": "x64_windows",
@@ -23,7 +16,6 @@
     # Disable __builtin_expect support on Windows
     copts = select({
         ":windows": ["/DFARMHASH_OPTIONAL_BUILTIN_EXPECT"],
-        ":windows_msvc": ["/DFARMHASH_OPTIONAL_BUILTIN_EXPECT"],
         "//conditions:default": [],
     }),
     includes = ["src/."],
diff --git a/third_party/fft2d/fft2d.BUILD b/third_party/fft2d/fft2d.BUILD
index 3dbd36a..74dd311 100644
--- a/third_party/fft2d/fft2d.BUILD
+++ b/third_party/fft2d/fft2d.BUILD
@@ -14,6 +14,11 @@
     "fft/fftsg.c",
 ]
 
+config_setting(
+    name = "windows",
+    values = {"cpu": "x64_windows"},
+)
+
 # This is the main 2D FFT library.  The 2D FFTs in this library call
 # 1D FFTs.  In addition, fast DCTs are provided for the special case
 # of 8x8 and 16x16.  This code in this library is referred to as
@@ -21,7 +26,10 @@
 cc_library(
     name = "fft2d",
     srcs = FFT2D_SRCS,
-    linkopts = ["-lm"],
+    linkopts = select({
+        ":windows": [],
+        "//conditions:default": ["-lm"],
+    }),
 )
 
 objc_library(
diff --git a/third_party/flatbuffers/BUILD b/third_party/flatbuffers/BUILD
index fbdf19f..82bab3f 100644
--- a/third_party/flatbuffers/BUILD
+++ b/third_party/flatbuffers/BUILD
@@ -1,15 +1 @@
-package(default_visibility = ["//visibility:public"])
-
-licenses(["notice"])  # Apache 2.0
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
+# This empty BUILD file is required to make Bazel treat this directory as a package.
diff --git a/third_party/flatbuffers/flatbuffers.BUILD b/third_party/flatbuffers/BUILD.bazel
similarity index 93%
rename from third_party/flatbuffers/flatbuffers.BUILD
rename to third_party/flatbuffers/BUILD.bazel
index 639dff2..9d233a3 100644
--- a/third_party/flatbuffers/flatbuffers.BUILD
+++ b/third_party/flatbuffers/BUILD.bazel
@@ -12,12 +12,17 @@
     visibility = ["//visibility:public"],
 )
 
-FLATBUFFERS_COPTS = [
-    "-fexceptions",
-] + select({
-    "@bazel_tools//src:windows": [],
-    "@bazel_tools//src:windows_msvc": [],
-    "//conditions:default": ["-Wno-implicit-fallthrough"],
+config_setting(
+    name = "windows",
+    values = {"cpu": "x64_windows"},
+)
+
+FLATBUFFERS_COPTS = select({
+    ":windows": [],
+    "//conditions:default": [
+        "-Wno-implicit-fallthrough",
+        "-fexceptions",
+    ],
 })
 
 # Public flatc library to compile flatbuffer files at runtime.
@@ -121,6 +126,7 @@
         ":freebsd": [
             "-lm",
         ],
+        ":windows": [],
         "//conditions:default": [
             "-lm",
             "-ldl",
diff --git a/third_party/systemlibs/flatbuffers.BUILD b/third_party/flatbuffers/BUILD.system
similarity index 100%
rename from third_party/systemlibs/flatbuffers.BUILD
rename to third_party/flatbuffers/BUILD.system
diff --git a/third_party/flatbuffers/build_defs.bzl b/third_party/flatbuffers/build_defs.bzl
index ae8d7fe..ba763f3 100644
--- a/third_party/flatbuffers/build_defs.bzl
+++ b/third_party/flatbuffers/build_defs.bzl
@@ -8,66 +8,50 @@
     "--gen-object-api",
 ]
 
-def flatbuffer_library_public(name,
-                              srcs,
-                              outs,
-                              language_flag,
-                              out_prefix="",
-                              includes=[],
-                              include_paths=[],
-                              flatc_args=DEFAULT_FLATC_ARGS,
-                              reflection_name="",
-                              reflection_visiblity=None,
-                              output_to_bindir=False):
-  '''Generates code files for reading/writing the given flatbuffers in the requested language using the public compiler.
+def flatbuffer_library_public(
+        name,
+        srcs,
+        outs,
+        language_flag,
+        out_prefix = "",
+        includes = [],
+        include_paths = [],
+        flatc_args = DEFAULT_FLATC_ARGS,
+        reflection_name = "",
+        reflection_visiblity = None,
+        output_to_bindir = False):
+    """Generates code files for reading/writing the given flatbuffers in the requested language using the public compiler.
 
-  Args:
-    name: Rule name.
-    srcs: Source .fbs files. Sent in order to the compiler.
-    outs: Output files from flatc.
-    language_flag: Target language flag. One of [-c, -j, -js].
-    out_prefix: Prepend this path to the front of all generated files except on
-        single source targets. Usually is a directory name.
-    includes: Optional, list of filegroups of schemas that the srcs depend on.
-    include_paths: Optional, list of paths the includes files can be found in.
-    flatc_args: Optional, list of additional arguments to pass to flatc.
-    reflection_name: Optional, if set this will generate the flatbuffer
-      reflection binaries for the schemas.
-    reflection_visiblity: The visibility of the generated reflection Fileset.
-    output_to_bindir: Passed to genrule for output to bin directory.
-  Outs:
-    filegroup(name): all generated source files.
-    Fileset([reflection_name]): (Optional) all generated reflection binaries.
-  '''
-  include_paths_cmd = ["-I %s" % (s) for s in include_paths]
-  # '$(@D)' when given a single source target will give the appropriate
-  # directory. Appending 'out_prefix' is only necessary when given a build
-  # target with multiple sources.
-  output_directory = (
-      ("-o $(@D)/%s" % (out_prefix)) if len(srcs) > 1 else ("-o $(@D)"))
-  genrule_cmd = " ".join([
-      "for f in $(SRCS); do",
-      "$(location %s)" % (flatc_path),
-      " ".join(flatc_args),
-      " ".join(include_paths_cmd),
-      language_flag,
-      output_directory,
-      "$$f;",
-      "done",
-  ])
-  native.genrule(
-      name=name,
-      srcs=srcs,
-      outs=outs,
-      output_to_bindir=output_to_bindir,
-      tools=includes + [flatc_path,],
-      cmd=genrule_cmd,
-      message="Generating flatbuffer files for %s:" % (name),)
-  if reflection_name:
-    reflection_genrule_cmd = " ".join([
+    Outs:
+      filegroup(name): all generated source files.
+      Fileset([reflection_name]): (Optional) all generated reflection binaries.
+
+    Args:
+      name: Rule name.
+      srcs: Source .fbs files. Sent in order to the compiler.
+      outs: Output files from flatc.
+      language_flag: Target language flag. One of [-c, -j, -js].
+      out_prefix: Prepend this path to the front of all generated files except on
+          single source targets. Usually is a directory name.
+      includes: Optional, list of filegroups of schemas that the srcs depend on.
+      include_paths: Optional, list of paths the includes files can be found in.
+      flatc_args: Optional, list of additional arguments to pass to flatc.
+      reflection_name: Optional, if set this will generate the flatbuffer
+        reflection binaries for the schemas.
+      reflection_visiblity: The visibility of the generated reflection Fileset.
+      output_to_bindir: Passed to genrule for output to bin directory.
+    """
+    include_paths_cmd = ["-I %s" % (s) for s in include_paths]
+
+    # '$(@D)' when given a single source target will give the appropriate
+    # directory. Appending 'out_prefix' is only necessary when given a build
+    # target with multiple sources.
+    output_directory = (
+        ("-o $(@D)/%s" % (out_prefix)) if len(srcs) > 1 else ("-o $(@D)")
+    )
+    genrule_cmd = " ".join([
         "for f in $(SRCS); do",
         "$(location %s)" % (flatc_path),
-        "-b --schema",
         " ".join(flatc_args),
         " ".join(include_paths_cmd),
         language_flag,
@@ -75,122 +59,157 @@
         "$$f;",
         "done",
     ])
-    reflection_outs = [
-        (out_prefix + "%s.bfbs") % (s.replace(".fbs", "").split("/")[-1]) for s in srcs
-    ]
     native.genrule(
-        name= "%s_srcs" % reflection_name,
-        srcs=srcs,
-        outs=reflection_outs,
-        output_to_bindir=output_to_bindir,
-        tools=includes + [flatc_path,],
-        cmd=reflection_genrule_cmd,
-        message="Generating flatbuffer reflection binary for %s:" % (name),)
-    native.Fileset(
-        name=reflection_name,
-        out="%s_out" % reflection_name,
-        entries=[
-            native.FilesetEntry(files=reflection_outs),
+        name = name,
+        srcs = srcs,
+        outs = outs,
+        output_to_bindir = output_to_bindir,
+        tools = includes + [flatc_path],
+        cmd = genrule_cmd,
+        message = "Generating flatbuffer files for %s:" % (name),
+    )
+    if reflection_name:
+        reflection_genrule_cmd = " ".join([
+            "for f in $(SRCS); do",
+            "$(location %s)" % (flatc_path),
+            "-b --schema",
+            " ".join(flatc_args),
+            " ".join(include_paths_cmd),
+            language_flag,
+            output_directory,
+            "$$f;",
+            "done",
+        ])
+        reflection_outs = [
+            (out_prefix + "%s.bfbs") % (s.replace(".fbs", "").split("/")[-1])
+            for s in srcs
+        ]
+        native.genrule(
+            name = "%s_srcs" % reflection_name,
+            srcs = srcs,
+            outs = reflection_outs,
+            output_to_bindir = output_to_bindir,
+            tools = includes + [flatc_path],
+            cmd = reflection_genrule_cmd,
+            message = "Generating flatbuffer reflection binary for %s:" % (name),
+        )
+        native.Fileset(
+            name = reflection_name,
+            out = "%s_out" % reflection_name,
+            entries = [
+                native.FilesetEntry(files = reflection_outs),
+            ],
+            visibility = reflection_visiblity,
+        )
+
+def flatbuffer_cc_library(
+        name,
+        srcs,
+        srcs_filegroup_name = "",
+        out_prefix = "",
+        includes = [],
+        include_paths = [],
+        flatc_args = DEFAULT_FLATC_ARGS,
+        visibility = None,
+        srcs_filegroup_visibility = None,
+        gen_reflections = False):
+    '''A cc_library with the generated reader/writers for the given flatbuffer definitions.
+
+    Outs:
+      filegroup([name]_srcs): all generated .h files.
+      filegroup(srcs_filegroup_name if specified, or [name]_includes if not):
+          Other flatbuffer_cc_library's can pass this in for their `includes`
+          parameter, if they depend on the schemas in this library.
+      Fileset([name]_reflection): (Optional) all generated reflection binaries.
+      cc_library([name]): library with sources and flatbuffers deps.
+
+    Remarks:
+      ** Because the genrule used to call flatc does not have any trivial way of
+        computing the output list of files transitively generated by includes and
+        --gen-includes (the default) being defined for flatc, the --gen-includes
+        flag will not work as expected. The way around this is to add a dependency
+        to the flatbuffer_cc_library defined alongside the flatc included Fileset.
+        For example you might define:
+
+        flatbuffer_cc_library(
+            name = "my_fbs",
+            srcs = [ "schemas/foo.fbs" ],
+            includes = [ "//third_party/bazz:bazz_fbs_includes" ],
+        )
+
+        In which foo.fbs includes a few files from the Fileset defined at
+        //third_party/bazz:bazz_fbs_includes. When compiling the library that
+        includes foo_generated.h, and therefore has my_fbs as a dependency, it
+        will fail to find any of the bazz *_generated.h files unless you also
+        add bazz's flatbuffer_cc_library to your own dependency list, e.g.:
+
+        cc_library(
+            name = "my_lib",
+            deps = [
+                ":my_fbs",
+                "//third_party/bazz:bazz_fbs"
+            ],
+        )
+
+        Happy dependent Flatbuffering!
+
+    Args:
+      name: Rule name.
+      srcs: Source .fbs files. Sent in order to the compiler.
+      srcs_filegroup_name: Name of the output filegroup that holds srcs. Pass this
+          filegroup into the `includes` parameter of any other
+          flatbuffer_cc_library that depends on this one's schemas.
+      out_prefix: Prepend this path to the front of all generated files. Usually
+          is a directory name.
+      includes: Optional, list of filegroups of schemas that the srcs depend on.
+          ** SEE REMARKS BELOW **
+      include_paths: Optional, list of paths the includes files can be found in.
+      flatc_args: Optional list of additional arguments to pass to flatc
+          (e.g. --gen-mutable).
+      visibility: The visibility of the generated cc_library. By default, use the
+          default visibility of the project.
+      srcs_filegroup_visibility: The visibility of the generated srcs filegroup.
+          By default, use the value of the visibility parameter above.
+      gen_reflections: Optional, if true this will generate the flatbuffer
+        reflection binaries for the schemas.
+    '''
+    output_headers = [
+        (out_prefix + "%s_generated.h") % (s.replace(".fbs", "").split("/")[-1])
+        for s in srcs
+    ]
+    reflection_name = "%s_reflection" % name if gen_reflections else ""
+
+    flatbuffer_library_public(
+        name = "%s_srcs" % (name),
+        srcs = srcs,
+        outs = output_headers,
+        language_flag = "-c",
+        out_prefix = out_prefix,
+        includes = includes,
+        include_paths = include_paths,
+        flatc_args = flatc_args,
+        reflection_name = reflection_name,
+        reflection_visiblity = visibility,
+    )
+    native.cc_library(
+        name = name,
+        hdrs = output_headers,
+        srcs = output_headers,
+        features = [
+            "-parse_headers",
         ],
-        visibility=reflection_visiblity
+        deps = [
+            "@flatbuffers//:runtime_cc",
+        ],
+        includes = ["."],
+        linkstatic = 1,
+        visibility = visibility,
     )
 
-
-def flatbuffer_cc_library(name, srcs, srcs_filegroup_name="",
-                          out_prefix="", includes=[], include_paths=[],
-                          flatc_args=DEFAULT_FLATC_ARGS,
-                          visibility=None, srcs_filegroup_visibility=None,
-                          gen_reflections=False):
-  '''A cc_library with the generated reader/writers for the given flatbuffer definitions.
-
-  Args:
-    name: Rule name.
-    srcs: Source .fbs files. Sent in order to the compiler.
-    srcs_filegroup_name: Name of the output filegroup that holds srcs. Pass this
-        filegroup into the `includes` parameter of any other
-        flatbuffer_cc_library that depends on this one's schemas.
-    out_prefix: Prepend this path to the front of all generated files. Usually
-        is a directory name.
-    includes: Optional, list of filegroups of schemas that the srcs depend on.
-        ** SEE REMARKS BELOW **
-    include_paths: Optional, list of paths the includes files can be found in.
-    flatc_args: Optional list of additional arguments to pass to flatc
-        (e.g. --gen-mutable).
-    visibility: The visibility of the generated cc_library. By default, use the
-        default visibility of the project.
-    srcs_filegroup_visibility: The visibility of the generated srcs filegroup.
-        By default, use the value of the visibility parameter above.
-    gen_reflections: Optional, if true this will generate the flatbuffer
-      reflection binaries for the schemas.
-  Outs:
-    filegroup([name]_srcs): all generated .h files.
-    filegroup(srcs_filegroup_name if specified, or [name]_includes if not):
-        Other flatbuffer_cc_library's can pass this in for their `includes`
-        parameter, if they depend on the schemas in this library.
-    Fileset([name]_reflection): (Optional) all generated reflection binaries.
-    cc_library([name]): library with sources and flatbuffers deps.
-
-  Remarks:
-    ** Because the genrule used to call flatc does not have any trivial way of
-      computing the output list of files transitively generated by includes and
-      --gen-includes (the default) being defined for flatc, the --gen-includes
-      flag will not work as expected. The way around this is to add a dependency
-      to the flatbuffer_cc_library defined alongside the flatc included Fileset.
-      For example you might define:
-
-      flatbuffer_cc_library(
-          name = "my_fbs",
-          srcs = [ "schemas/foo.fbs" ],
-          includes = [ "//third_party/bazz:bazz_fbs_includes" ],
-      )
-
-      In which foo.fbs includes a few files from the Fileset defined at
-      //third_party/bazz:bazz_fbs_includes. When compiling the library that
-      includes foo_generated.h, and therefore has my_fbs as a dependency, it
-      will fail to find any of the bazz *_generated.h files unless you also
-      add bazz's flatbuffer_cc_library to your own dependency list, e.g.:
-
-      cc_library(
-          name = "my_lib",
-          deps = [
-              ":my_fbs",
-              "//third_party/bazz:bazz_fbs"
-          ],
-      )
-
-      Happy dependent Flatbuffering!
-  '''
-  output_headers = [
-      (out_prefix + "%s_generated.h") % (s.replace(".fbs", "").split("/")[-1]) for s in srcs
-  ]
-  reflection_name = "%s_reflection" % name if gen_reflections else ""
-
-  flatbuffer_library_public(name="%s_srcs" % (name),
-                            srcs=srcs,
-                            outs=output_headers,
-                            language_flag="-c",
-                            out_prefix=out_prefix,
-                            includes=includes,
-                            include_paths=include_paths,
-                            flatc_args=flatc_args,
-                            reflection_name=reflection_name,
-                            reflection_visiblity=visibility,)
-  native.cc_library(name=name,
-                    hdrs=output_headers,
-                    srcs=output_headers,
-                    features=[
-                        "-parse_headers",
-                    ],
-                    deps=[
-                        "@flatbuffers//:runtime_cc",
-                    ],
-                    includes=["."],
-                    linkstatic=1,
-                    visibility=visibility)
-
-  # A filegroup for the `srcs`. That is, all the schema files for this
-  # Flatbuffer set.
-  native.filegroup(
-      name = srcs_filegroup_name if srcs_filegroup_name else "%s_includes" % (name),
-      srcs = srcs,
-      visibility=srcs_filegroup_visibility if srcs_filegroup_visibility != None else visibility)
+    # A filegroup for the `srcs`. That is, all the schema files for this
+    # Flatbuffer set.
+    native.filegroup(
+        name = srcs_filegroup_name if srcs_filegroup_name else "%s_includes" % (name),
+        srcs = srcs,
+        visibility = srcs_filegroup_visibility if srcs_filegroup_visibility != None else visibility,
+    )
diff --git a/third_party/flatbuffers/workspace.bzl b/third_party/flatbuffers/workspace.bzl
new file mode 100644
index 0000000..3aeef96
--- /dev/null
+++ b/third_party/flatbuffers/workspace.bzl
@@ -0,0 +1,19 @@
+"""Loads the Flatbuffers library, used by TF Lite."""
+
+load("//third_party:repo.bzl", "third_party_http_archive")
+
+def repo():
+    third_party_http_archive(
+        name = "flatbuffers",
+        strip_prefix = "flatbuffers-1.9.0",
+        sha256 = "5ca5491e4260cacae30f1a5786d109230db3f3a6e5a0eb45d0d0608293d247e3",
+        urls = [
+            "https://mirror.bazel.build/github.com/google/flatbuffers/archive/v1.9.0.tar.gz",
+            "https://github.com/google/flatbuffers/archive/v1.9.0.tar.gz",
+        ],
+        build_file = "//third_party/flatbuffers:BUILD.bazel",
+        system_build_file = "//third_party/flatbuffers:BUILD.system",
+        link_files = {
+            "//third_party/flatbuffers:build_defs.bzl": "build_defs.bzl",
+        },
+    )
diff --git a/third_party/gif.BUILD b/third_party/gif.BUILD
index 78fbd6c..cbe730f 100644
--- a/third_party/gif.BUILD
+++ b/third_party/gif.BUILD
@@ -21,7 +21,6 @@
     ],
     hdrs = ["lib/gif_lib.h"],
     defines = select({
-        #"@org_tensorflow//tensorflow:android": [
         ":android": [
             "S_IREAD=S_IRUSR",
             "S_IWRITE=S_IWUSR",
@@ -33,7 +32,6 @@
     visibility = ["//visibility:public"],
     deps = select({
         ":windows": [":windows_polyfill"],
-        ":windows_msvc": [":windows_polyfill"],
         "//conditions:default": [],
     }),
 )
@@ -51,13 +49,6 @@
 )
 
 config_setting(
-    name = "windows_msvc",
-    values = {
-        "cpu": "x64_windows_msvc",
-    },
-)
-
-config_setting(
     name = "windows",
     values = {
         "cpu": "x64_windows",
diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl
index e848fa1..f6a39ae 100644
--- a/third_party/gpus/cuda_configure.bzl
+++ b/third_party/gpus/cuda_configure.bzl
@@ -61,6 +61,7 @@
 CUPTI_HEADER_PATHS = [
     "extras/CUPTI/include/",
     "include/cuda/CUPTI/",
+    "include/",
 ]
 
 # Lookup paths for the cupti library, relative to the
@@ -69,7 +70,7 @@
 # the other CUDA libraries but rather in a special extras/CUPTI directory.
 CUPTI_LIB_PATHS = [
     "extras/CUPTI/lib64/",
-    "lib/x86_64-linux-gnu",
+    "lib/x86_64-linux-gnu/",
     "lib64/",
     "extras/CUPTI/libx64/",
     "extras/CUPTI/lib/",
@@ -96,6 +97,7 @@
 NVVM_LIBDEVICE_PATHS = [
     "nvvm/libdevice/",
     "share/cuda/",
+    "lib/nvidia-cuda-toolkit/libdevice/",
 ]
 
 # Files used to detect the NVVM libdevice path.
diff --git a/third_party/jpeg/jpeg.BUILD b/third_party/jpeg/jpeg.BUILD
index 663a218..96e7ac0 100644
--- a/third_party/jpeg/jpeg.BUILD
+++ b/third_party/jpeg/jpeg.BUILD
@@ -22,7 +22,6 @@
         "-w",
     ],
     ":windows": WIN_COPTS,
-    ":windows_msvc": WIN_COPTS,
     "//conditions:default": [
         "-O3",
         "-w",
@@ -272,8 +271,10 @@
         "jchuff.h",
         "jconfig.h",
         "jdct.h",
+        "jerror.h",
         "jinclude.h",
         "jmorecfg.h",
+        "jpegint.h",
         "jpeglib.h",
         "jsimd.h",
         "jsimddct.h",
@@ -423,7 +424,6 @@
     outs = ["jconfig.h"],
     cmd = select({
         ":windows": "cp $(location jconfig_win.h) $@",
-        ":windows_msvc": "cp $(location jconfig_win.h) $@",
         ":k8": "cp $(location jconfig_nowin_simd.h) $@",
         ":armeabi-v7a": "cp $(location jconfig_nowin_simd.h) $@",
         ":arm64-v8a": "cp $(location jconfig_nowin_simd.h) $@",
@@ -441,7 +441,6 @@
     outs = ["jconfigint.h"],
     cmd = select({
         ":windows": "cp $(location jconfigint_win.h) $@",
-        ":windows_msvc": "cp $(location jconfigint_win.h) $@",
         "//conditions:default": "cp $(location jconfigint_nowin.h) $@",
     }),
 )
@@ -542,11 +541,6 @@
 )
 
 config_setting(
-    name = "windows_msvc",
-    values = {"cpu": "x64_windows_msvc"},
-)
-
-config_setting(
     name = "linux_ppc64le",
     values = {"cpu": "ppc"},
 )
diff --git a/third_party/kafka/BUILD b/third_party/kafka/BUILD
index 75792b0..3c50b8c 100644
--- a/third_party/kafka/BUILD
+++ b/third_party/kafka/BUILD
@@ -15,6 +15,7 @@
         "src-cpp/KafkaConsumerImpl.cpp",
         "src-cpp/MessageImpl.cpp",
         "src-cpp/MetadataImpl.cpp",
+        "src-cpp/ProducerImpl.cpp",
         "src-cpp/QueueImpl.cpp",
         "src-cpp/RdKafka.cpp",
         "src-cpp/TopicImpl.cpp",
@@ -130,7 +131,15 @@
         "src/tinycthread.h",
         "src/xxhash.c",
         "src/xxhash.h",
-    ],
+    ] + select({
+        "@org_tensorflow//tensorflow:windows": [
+            "src/rdkafka_sasl_win32.c",
+            "src/rdwin32.h",
+            "src/regexp.c",
+            "src/regexp.h",
+        ],
+        "//conditions:default": [],
+    }),
     hdrs = [
         "config.h",
         "src-cpp/rdkafkacpp.h",
@@ -138,15 +147,25 @@
         "src/lz4.c",
         "src/snappy_compat.h",
     ],
-    copts = [
-        "-Iexternal/kafka/src",
-        "-Iexternal/kafka/src-cpp",
+    copts = select({
+        "@org_tensorflow//tensorflow:windows": [
+            "-DWIN32_LEAN_AND_MEAN",
+            "-DWITHOUT_WIN32_CONFIG",
+            "-DWITH_ZLIB=1",
+            "-DWITH_SSL=1",
+            "-DWITH_SNAPPY=1",
+        ],
+        "//conditions:default": [],
+    }),
+    defines = ["LIBRDKAFKA_STATICLIB"],
+    includes = [
+        "src",
+        "src-cpp",
     ],
-    defines = [
-    ],
-    linkopts = [
-        "-lpthread",
-    ],
+    linkopts = select({
+        "@org_tensorflow//tensorflow:windows": ["-defaultlib:crypt32.lib"],
+        "//conditions:default": ["-lpthread"],
+    }),
     visibility = ["//visibility:public"],
     deps = [
         "@boringssl//:ssl",
diff --git a/third_party/lmdb.BUILD b/third_party/lmdb.BUILD
index 9b3e1d9..f36a698 100644
--- a/third_party/lmdb.BUILD
+++ b/third_party/lmdb.BUILD
@@ -20,7 +20,6 @@
     ],
     linkopts = select({
         ":windows": ["-DEFAULTLIB:advapi32.lib"],  # InitializeSecurityDescriptor, SetSecurityDescriptorDacl
-        ":windows_msvc": ["-DEFAULTLIB:advapi32.lib"],
         "//conditions:default": ["-lpthread"],
     }),
     visibility = ["//visibility:public"],
@@ -30,8 +29,3 @@
     name = "windows",
     values = {"cpu": "x64_windows"},
 )
-
-config_setting(
-    name = "windows_msvc",
-    values = {"cpu": "x64_windows_msvc"},
-)
diff --git a/third_party/mkl/BUILD b/third_party/mkl/BUILD
index a058c46..efff7fd 100644
--- a/third_party/mkl/BUILD
+++ b/third_party/mkl/BUILD
@@ -2,17 +2,28 @@
 
 config_setting(
     name = "using_mkl",
-    values = {
-        "define": "using_mkl=true",
+    define_values = {
+        "using_mkl": "true",
+    },
+    visibility = ["//visibility:public"],
+)
+
+config_setting(
+    name = "using_mkl_ml_only",
+    define_values = {
+        "using_mkl": "true",
+        "using_mkl_ml_only": "true",
     },
     visibility = ["//visibility:public"],
 )
 
 config_setting(
     name = "using_mkl_lnx_x64",
+    define_values = {
+        "using_mkl": "true",
+    },
     values = {
         "cpu": "k8",
-        "define": "using_mkl=true",
     },
     visibility = ["//visibility:public"],
 )
diff --git a/third_party/mkl/build_defs.bzl b/third_party/mkl/build_defs.bzl
index 53e0276..06a8c35 100644
--- a/third_party/mkl/build_defs.bzl
+++ b/third_party/mkl/build_defs.bzl
@@ -1,6 +1,9 @@
 # -*- Python -*-
 """Skylark macros for MKL.
 if_mkl is a conditional to check if MKL is enabled or not.
+if_mkl_ml is a conditional to check if MKL-ML is enabled.
+if_mkl_ml_only is a conditional to check for MKL-ML-only (no MKL-DNN) mode.
+if_mkl_lnx_x64 is a conditional to check for MKL
 
 mkl_repository is a repository rule for creating MKL repository rule that can
 be pointed to either a local folder, or download it from the internet.
@@ -15,27 +18,89 @@
 def if_mkl(if_true, if_false = []):
     """Shorthand for select()'ing on whether we're building with MKL.
 
-    Returns a select statement which evaluates to if_true if we're building
-    with MKL enabled.  Otherwise, the select statement evaluates to if_false.
+    Args:
+      if_true: expression to evaluate if building with MKL.
+      if_false: expression to evaluate if building without MKL.
 
+    Returns:
+      a select evaluating to either if_true or if_false as appropriate.
     """
     return select({
-        str(Label("//third_party/mkl:using_mkl")): if_true,
-        "//conditions:default": if_false
+        "//third_party/mkl:using_mkl": if_true,
+        "//conditions:default": if_false,
+    })
+
+def if_mkl_ml(if_true, if_false = []):
+    """Shorthand for select()'ing on whether we're building with MKL-ML.
+
+    Args:
+      if_true: expression to evaluate if building with MKL-ML.
+      if_false: expression to evaluate if building without MKL-ML
+        (i.e. without MKL at all, or with MKL-DNN only).
+
+    Returns:
+      a select evaluating to either if_true or if_false as appropriate.
+    """
+    return select({
+        "//third_party/mkl_dnn:using_mkl_dnn_only":
+        if_false,
+        "//third_party/mkl:using_mkl": if_true,
+        "//conditions:default": if_false,
+        })
+
+def if_mkl_ml_only(if_true, if_false = []):
+    """Shorthand for select()'ing on whether we're building with MKL-ML only.
+
+    Args:
+      if_true: expression to evaluate if building with MKL-ML only.
+      if_false: expression to evaluate if building without MKL, or with MKL-DNN.
+
+    Returns:
+      a select evaluating to either if_true or if_false as appropriate.
+    """
+    return select({
+        "//third_party/mkl:using_mkl_ml_only": if_true,
+        "//conditions:default": if_false,
     })
 
 def if_mkl_lnx_x64(if_true, if_false = []):
-    """Shorthand for select()'ing on whether we're building with MKL.
+    """Shorthand to select() on if MKL is on and the target is Linux x86-64.
 
-    Returns a select statement which evaluates to if_true if we're building
-    with MKL enabled.  Otherwise, the select statement evaluates to if_false.
+    Args:
+      if_true: expression to evaluate if building with MKL is enabled and the
+        target platform is Linux x86-64.
+      if_false: expression to evaluate if building without MKL or for a
+        different platform.
 
+    Returns:
+      a select evaluating to either if_true or if_false as appropriate.
     """
     return select({
-        str(Label("//third_party/mkl:using_mkl_lnx_x64")): if_true,
-        "//conditions:default": if_false
+        "//third_party/mkl:using_mkl_lnx_x64": if_true,
+        "//conditions:default": if_false,
     })
 
+def mkl_deps():
+    """Shorthand for select() to pull in the correct set of MKL library deps.
+
+    Can pull in MKL-ML, MKL-DNN, both, or neither depending on config settings.
+
+    Returns:
+      a select evaluating to a list of library dependencies, suitable for
+      inclusion in the deps attribute of rules.
+    """
+    return select({
+        "//third_party/mkl_dnn:using_mkl_dnn_only":
+        ["@mkl_dnn"],
+        "//third_party/mkl:using_mkl_ml_only":
+        ["//third_party/mkl:intel_binary_blob"],
+        "//third_party/mkl:using_mkl":
+        [
+            "//third_party/mkl:intel_binary_blob",
+            "@mkl_dnn"
+        ],
+        "//conditions:default": []
+        })
 
 def _enable_local_mkl(repository_ctx):
   return _TF_MKL_ROOT in repository_ctx.os.environ
diff --git a/third_party/mkl_dnn/BUILD b/third_party/mkl_dnn/BUILD
index d075809..3e567fa 100644
--- a/third_party/mkl_dnn/BUILD
+++ b/third_party/mkl_dnn/BUILD
@@ -4,8 +4,9 @@
 
 config_setting(
     name = "using_mkl_dnn_only",
-    values = {
-        "define": "using_mkl_dnn_only=true",
+    define_values = {
+        "using_mkl": "true",
+        "using_mkl_dnn_only": "true",
     },
     visibility = ["//visibility:public"],
 )
diff --git a/third_party/nasm.BUILD b/third_party/nasm.BUILD
index 89330ea..2b87788 100644
--- a/third_party/nasm.BUILD
+++ b/third_party/nasm.BUILD
@@ -142,7 +142,6 @@
     ],
     copts = select({
         ":windows": [],
-        ":windows_msvc": [],
         "//conditions:default": [
             "-w",
             "-std=c99",
@@ -150,7 +149,6 @@
     }),
     defines = select({
         ":windows": [],
-        ":windows_msvc": [],
         "//conditions:default": [
             "HAVE_SNPRINTF",
             "HAVE_SYS_TYPES_H",
@@ -160,13 +158,6 @@
 )
 
 config_setting(
-    name = "windows_msvc",
-    values = {
-        "cpu": "x64_windows_msvc",
-    },
-)
-
-config_setting(
     name = "windows",
     values = {
         "cpu": "x64_windows",
diff --git a/third_party/png.BUILD b/third_party/png.BUILD
index 17c5449..c26a289 100644
--- a/third_party/png.BUILD
+++ b/third_party/png.BUILD
@@ -29,6 +29,10 @@
         "pngwtran.c",
         "pngwutil.c",
     ] + select({
+        ":windows": [
+            "intel/intel_init.c",
+            "intel/filter_sse2_intrinsics.c",
+        ],
         "@org_tensorflow//tensorflow:linux_ppc64le": [
             "powerpc/powerpc_init.c",
             "powerpc/filter_vsx_intrinsics.c",
@@ -41,7 +45,14 @@
         "pngconf.h",
     ],
     includes = ["."],
-    linkopts = ["-lm"],
+    copts = select({
+        ":windows": ["-DPNG_INTEL_SSE_OPT=1"],
+        "//conditions:default": [],
+    }),
+    linkopts = select({
+        ":windows": [],
+        "//conditions:default": ["-lm"],
+    }),
     visibility = ["//visibility:public"],
     deps = ["@zlib_archive//:zlib"],
 )
@@ -52,3 +63,8 @@
     outs = ["pnglibconf.h"],
     cmd = "sed -e 's/PNG_ZLIB_VERNUM 0/PNG_ZLIB_VERNUM 0x12b0/' $< >$@",
 )
+
+config_setting(
+    name = "windows",
+    values = {"cpu": "x64_windows"},
+)
diff --git a/third_party/repo.bzl b/third_party/repo.bzl
index 5cb4269..7d1aa5d 100644
--- a/third_party/repo.bzl
+++ b/third_party/repo.bzl
@@ -19,104 +19,111 @@
 ])
 
 def _is_windows(ctx):
-  return ctx.os.name.lower().find("windows") != -1
+    return ctx.os.name.lower().find("windows") != -1
 
 def _wrap_bash_cmd(ctx, cmd):
-  if _is_windows(ctx):
-    bazel_sh = _get_env_var(ctx, "BAZEL_SH")
-    if not bazel_sh:
-      fail("BAZEL_SH environment variable is not set")
-    cmd = [bazel_sh, "-l", "-c", " ".join(cmd)]
-  return cmd
+    if _is_windows(ctx):
+        bazel_sh = _get_env_var(ctx, "BAZEL_SH")
+        if not bazel_sh:
+            fail("BAZEL_SH environment variable is not set")
+        cmd = [bazel_sh, "-l", "-c", " ".join(cmd)]
+    return cmd
 
 def _get_env_var(ctx, name):
-  if name in ctx.os.environ:
-    return ctx.os.environ[name]
-  else:
-    return None
+    if name in ctx.os.environ:
+        return ctx.os.environ[name]
+    else:
+        return None
 
 # Checks if we should use the system lib instead of the bundled one
 def _use_system_lib(ctx, name):
-  syslibenv = _get_env_var(ctx, "TF_SYSTEM_LIBS")
-  if syslibenv:
-    for n in syslibenv.strip().split(","):
-      if n.strip() == name:
-        return True
-  return False
+    syslibenv = _get_env_var(ctx, "TF_SYSTEM_LIBS")
+    if syslibenv:
+        for n in syslibenv.strip().split(","):
+            if n.strip() == name:
+                return True
+    return False
 
 # Executes specified command with arguments and calls 'fail' if it exited with
 # non-zero code
 def _execute_and_check_ret_code(repo_ctx, cmd_and_args):
-  result = repo_ctx.execute(cmd_and_args, timeout=10)
-  if result.return_code != 0:
-    fail(("Non-zero return code({1}) when executing '{0}':\n" + "Stdout: {2}\n"
-          + "Stderr: {3}").format(" ".join(cmd_and_args), result.return_code,
-                                  result.stdout, result.stderr))
+    result = repo_ctx.execute(cmd_and_args, timeout = 10)
+    if result.return_code != 0:
+        fail(("Non-zero return code({1}) when executing '{0}':\n" + "Stdout: {2}\n" +
+              "Stderr: {3}").format(
+            " ".join(cmd_and_args),
+            result.return_code,
+            result.stdout,
+            result.stderr,
+        ))
 
 def _repos_are_siblings():
-  return Label("@foo//bar").workspace_root.startswith("../")
+    return Label("@foo//bar").workspace_root.startswith("../")
 
 # Apply a patch_file to the repository root directory
 # Runs 'patch -p1'
 def _apply_patch(ctx, patch_file):
-  # Don't check patch on Windows, because patch is only available under bash.
-  if not _is_windows(ctx) and not ctx.which("patch"):
-    fail("patch command is not found, please install it")
-  cmd = _wrap_bash_cmd(
-    ctx, ["patch", "-p1", "-d", ctx.path("."), "-i", ctx.path(patch_file)])
-  _execute_and_check_ret_code(ctx, cmd)
+    # Don't check patch on Windows, because patch is only available under bash.
+    if not _is_windows(ctx) and not ctx.which("patch"):
+        fail("patch command is not found, please install it")
+    cmd = _wrap_bash_cmd(
+        ctx,
+        ["patch", "-p1", "-d", ctx.path("."), "-i", ctx.path(patch_file)],
+    )
+    _execute_and_check_ret_code(ctx, cmd)
 
 def _apply_delete(ctx, paths):
-  for path in paths:
-    if path.startswith("/"):
-      fail("refusing to rm -rf path starting with '/': " + path)
-    if ".." in path:
-      fail("refusing to rm -rf path containing '..': " + path)
-  cmd = _wrap_bash_cmd(ctx, ["rm", "-rf"] + [ctx.path(path) for path in paths])
-  _execute_and_check_ret_code(ctx, cmd)
+    for path in paths:
+        if path.startswith("/"):
+            fail("refusing to rm -rf path starting with '/': " + path)
+        if ".." in path:
+            fail("refusing to rm -rf path containing '..': " + path)
+    cmd = _wrap_bash_cmd(ctx, ["rm", "-rf"] + [ctx.path(path) for path in paths])
+    _execute_and_check_ret_code(ctx, cmd)
 
 def _tf_http_archive(ctx):
-  if ("mirror.bazel.build" not in ctx.attr.urls[0] and
-      (len(ctx.attr.urls) < 2 and
-       ctx.attr.name not in _SINGLE_URL_WHITELIST)):
-    fail("tf_http_archive(urls) must have redundant URLs. The " +
-         "mirror.bazel.build URL must be present and it must come first. " +
-         "Even if you don't have permission to mirror the file, please " +
-         "put the correctly formatted mirror URL there anyway, because " +
-         "someone will come along shortly thereafter and mirror the file.")
+    if ("mirror.bazel.build" not in ctx.attr.urls[0] and
+        (len(ctx.attr.urls) < 2 and
+         ctx.attr.name not in _SINGLE_URL_WHITELIST)):
+        fail("tf_http_archive(urls) must have redundant URLs. The " +
+             "mirror.bazel.build URL must be present and it must come first. " +
+             "Even if you don't have permission to mirror the file, please " +
+             "put the correctly formatted mirror URL there anyway, because " +
+             "someone will come along shortly thereafter and mirror the file.")
 
-  use_syslib = _use_system_lib(ctx, ctx.attr.name)
-  if not use_syslib:
-    ctx.download_and_extract(
-        ctx.attr.urls,
-        "",
-        ctx.attr.sha256,
-        ctx.attr.type,
-        ctx.attr.strip_prefix)
-    if ctx.attr.delete:
-      _apply_delete(ctx, ctx.attr.delete)
-    if ctx.attr.patch_file != None:
-      _apply_patch(ctx, ctx.attr.patch_file)
+    use_syslib = _use_system_lib(ctx, ctx.attr.name)
+    if not use_syslib:
+        ctx.download_and_extract(
+            ctx.attr.urls,
+            "",
+            ctx.attr.sha256,
+            ctx.attr.type,
+            ctx.attr.strip_prefix,
+        )
+        if ctx.attr.delete:
+            _apply_delete(ctx, ctx.attr.delete)
+        if ctx.attr.patch_file != None:
+            _apply_patch(ctx, ctx.attr.patch_file)
 
-  if use_syslib and ctx.attr.system_build_file != None:
-    # Use BUILD.bazel to avoid conflict with third party projects with
-    # BUILD or build (directory) underneath.
-    ctx.template("BUILD.bazel", ctx.attr.system_build_file, {
-        "%prefix%": ".." if _repos_are_siblings() else "external",
-    }, False)
+    if use_syslib and ctx.attr.system_build_file != None:
+        # Use BUILD.bazel to avoid conflict with third party projects with
+        # BUILD or build (directory) underneath.
+        ctx.template("BUILD.bazel", ctx.attr.system_build_file, {
+            "%prefix%": ".." if _repos_are_siblings() else "external",
+        }, False)
 
-  elif ctx.attr.build_file != None:
-    # Use BUILD.bazel to avoid conflict with third party projects with
-    # BUILD or build (directory) underneath.
-    ctx.template("BUILD.bazel", ctx.attr.build_file, {
-        "%prefix%": ".." if _repos_are_siblings() else "external",
-    }, False)
+    elif ctx.attr.build_file != None:
+        # Use BUILD.bazel to avoid conflict with third party projects with
+        # BUILD or build (directory) underneath.
+        ctx.template("BUILD.bazel", ctx.attr.build_file, {
+            "%prefix%": ".." if _repos_are_siblings() else "external",
+        }, False)
 
 tf_http_archive = repository_rule(
-    implementation=_tf_http_archive,
-    attrs={
-        "sha256": attr.string(mandatory=True),
-        "urls": attr.string_list(mandatory=True, allow_empty=False),
+    implementation = _tf_http_archive,
+    attrs = {
+        "sha256": attr.string(mandatory = True),
+        "urls": attr.string_list(mandatory = True, allow_empty = False),
         "strip_prefix": attr.string(),
         "type": attr.string(),
         "delete": attr.string_list(),
@@ -124,12 +131,78 @@
         "build_file": attr.label(),
         "system_build_file": attr.label(),
     },
-    environ=[
-	"TF_SYSTEM_LIBS",
-    ])
+    environ = [
+        "TF_SYSTEM_LIBS",
+    ],
+)
 """Downloads and creates Bazel repos for dependencies.
 
 This is a swappable replacement for both http_archive() and
 new_http_archive() that offers some additional features. It also helps
 ensure best practices are followed.
 """
+
+def _third_party_http_archive(ctx):
+    if ("mirror.bazel.build" not in ctx.attr.urls[0] and
+        (len(ctx.attr.urls) < 2 and
+         ctx.attr.name not in _SINGLE_URL_WHITELIST)):
+        fail("tf_http_archive(urls) must have redundant URLs. The " +
+             "mirror.bazel.build URL must be present and it must come first. " +
+             "Even if you don't have permission to mirror the file, please " +
+             "put the correctly formatted mirror URL there anyway, because " +
+             "someone will come along shortly thereafter and mirror the file.")
+
+    use_syslib = _use_system_lib(ctx, ctx.attr.name)
+
+    # Use "BUILD.bazel" to avoid conflict with third party projects that contain a
+    # file or directory called "BUILD"
+    buildfile_path = ctx.path("BUILD.bazel")
+
+    if use_syslib:
+        if ctx.attr.system_build_file == None:
+            fail("Bazel was configured with TF_SYSTEM_LIBS to use a system " +
+                 "library for %s, but no system build file for %s was configured. " +
+                 "Please add a system_build_file attribute to the repository rule" +
+                 "for %s." % (ctx.attr.name, ctx.attr.name, ctx.attr.name))
+        ctx.symlink(Label(ctx.attr.system_build_file), buildfile_path)
+
+    else:
+        ctx.download_and_extract(
+            ctx.attr.urls,
+            "",
+            ctx.attr.sha256,
+            ctx.attr.type,
+            ctx.attr.strip_prefix,
+        )
+        if ctx.attr.delete:
+            _apply_delete(ctx, ctx.attr.delete)
+        if ctx.attr.patch_file != None:
+            _apply_patch(ctx, ctx.attr.patch_file)
+        ctx.symlink(Label(ctx.attr.build_file), buildfile_path)
+
+    for internal_src, external_dest in ctx.attr.link_files.items():
+        ctx.symlink(Label(internal_src), ctx.path(external_dest))
+
+# Downloads and creates Bazel repos for dependencies.
+#
+# This is an upgrade for tf_http_archive that works with go/tfbr-thirdparty.
+#
+# For link_files, specify each dict entry as:
+# "//path/to/source:file": "localfile"
+third_party_http_archive = repository_rule(
+    implementation = _third_party_http_archive,
+    attrs = {
+        "sha256": attr.string(mandatory = True),
+        "urls": attr.string_list(mandatory = True, allow_empty = False),
+        "strip_prefix": attr.string(),
+        "type": attr.string(),
+        "delete": attr.string_list(),
+        "build_file": attr.string(mandatory = True),
+        "system_build_file": attr.string(mandatory = False),
+        "patch_file": attr.label(),
+        "link_files": attr.string_dict(),
+    },
+    environ = [
+        "TF_SYSTEM_LIBS",
+    ],
+)
diff --git a/third_party/snappy.BUILD b/third_party/snappy.BUILD
index cc11f52..d93f030 100644
--- a/third_party/snappy.BUILD
+++ b/third_party/snappy.BUILD
@@ -18,17 +18,9 @@
         "snappy-stubs-public.h",
     ],
     hdrs = ["snappy.h"],
-    copts = select({
-        "@org_tensorflow//tensorflow:windows": [
-            "/DHAVE_CONFIG_H",
-            "/EHsc",
-        ],
-        "@org_tensorflow//tensorflow:windows_msvc": [
-            "/DHAVE_CONFIG_H",
-            "/EHsc",
-        ],
+    copts = ["-DHAVE_CONFIG_H"] + select({
+        "@org_tensorflow//tensorflow:windows": [],
         "//conditions:default": [
-            "-DHAVE_CONFIG_H",
             "-fno-exceptions",
             "-Wno-sign-compare",
             "-Wno-shift-negative-value",
diff --git a/third_party/sqlite.BUILD b/third_party/sqlite.BUILD
index 2876f30..8b876fb 100644
--- a/third_party/sqlite.BUILD
+++ b/third_party/sqlite.BUILD
@@ -4,7 +4,6 @@
 licenses(["unencumbered"])  # Public Domain
 
 SQLITE_COPTS = [
-    "-Os",
     "-DSQLITE_ENABLE_JSON1",
     "-DHAVE_DECL_STRERROR_R=1",
     "-DHAVE_STDINT_H=1",
@@ -15,15 +14,14 @@
     "@org_tensorflow//tensorflow:windows": [
         "-DSQLITE_MAX_TRIGGER_DEPTH=100",
     ],
-    "@org_tensorflow//tensorflow:windows_msvc": [
-        "-DSQLITE_MAX_TRIGGER_DEPTH=100",
-    ],
     "@org_tensorflow//tensorflow:darwin": [
+        "-Os",
         "-DHAVE_GMTIME_R=1",
         "-DHAVE_LOCALTIME_R=1",
         "-DHAVE_USLEEP=1",
     ],
     "//conditions:default": [
+        "-Os",
         "-DHAVE_FDATASYNC=1",
         "-DHAVE_GMTIME_R=1",
         "-DHAVE_LOCALTIME_R=1",
@@ -48,7 +46,7 @@
         "SQLITE_OMIT_DEPRECATED",
     ],
     linkopts = select({
-        "@org_tensorflow//tensorflow:windows_msvc": [],
+        "@org_tensorflow//tensorflow:windows": [],
         "//conditions:default": [
             "-ldl",
             "-lpthread",
diff --git a/third_party/swig.BUILD b/third_party/swig.BUILD
index f2f6474..59a3d9e 100644
--- a/third_party/swig.BUILD
+++ b/third_party/swig.BUILD
@@ -71,7 +71,6 @@
     ],
     copts = ["$(STACK_FRAME_UNLIMITED)"] + select({
         ":windows": [],
-        ":windows_msvc": [],
         "//conditions:default": [
             "-Wno-parentheses",
             "-Wno-unused-variable",
@@ -332,11 +331,6 @@
 )
 
 config_setting(
-    name = "windows_msvc",
-    values = {"cpu": "x64_windows_msvc"},
-)
-
-config_setting(
     name = "windows",
     values = {"cpu": "x64_windows"},
 )
diff --git a/third_party/zlib.BUILD b/third_party/zlib.BUILD
index e8048dd..33694ea 100644
--- a/third_party/zlib.BUILD
+++ b/third_party/zlib.BUILD
@@ -34,7 +34,6 @@
     hdrs = ["zlib.h"],
     copts = select({
         "@org_tensorflow//tensorflow:windows": [],
-        "@org_tensorflow//tensorflow:windows_msvc": [],
         "//conditions:default": [
             "-Wno-shift-negative-value",
             "-DZ_HAVE_UNISTD_H",